diff mbox series

[2/4] x86: record SSP at non-guest entry points

Message ID 0ad4543b-8eed-4147-b32d-b68d21fade98@suse.com (mailing list archive)
State New, archived
Headers show
Series x86: CET-SS related adjustments | expand

Commit Message

Jan Beulich Feb. 28, 2024, 1:52 p.m. UTC
We will want to use that value for call trace generation, and likely
also to eliminate the somewhat fragile shadow stack searching done in
fixup_exception_return(). For those purposes, guest-only entry points do
not need to record that value.

To keep the saving code simple, record our own SSP that corresponds to
an exception frame, pointing to the top of the shadow stack counterpart
of what the CPU has saved on the regular stack. Consuming code can then
work its way from there.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
To record the full 64-bit value, some of the unused bits in the %cs slot
could be used. Sadly that slot has saved_upcall_mask in an unhelpful
location, otherwise simply storing low and high 32 bits in those two
separate half-slots would be a pretty obvious choice. As long as
"entry_ssp" is used in non-guest-entry frames only, we could of course
put half of it into a union with saved_upcall_mask ...

Else may want to put a BUILD_BUG_ON(VADDR_BITS > 48) somewhere, but I'm
afraid I can't really identify a good place for such to live.

Leveraging that the CPU stores zero in the upper bits of the selector
register slots, the save sequence could also be

        shl   $16, %rcx
        or    %rcx, UREGS_entry_ssp-2(%rsp)

That's shorter and avoids a 16-bit operation, but may be less desirable,
for being a read-modify-write access.

Comments

Andrew Cooper Feb. 28, 2024, 3:16 p.m. UTC | #1
On 28/02/2024 1:52 pm, Jan Beulich wrote:
> We will want to use that value for call trace generation, and likely
> also to eliminate the somewhat fragile shadow stack searching done in
> fixup_exception_return(). For those purposes, guest-only entry points do
> not need to record that value.
>
> To keep the saving code simple, record our own SSP that corresponds to
> an exception frame, pointing to the top of the shadow stack counterpart
> of what the CPU has saved on the regular stack. Consuming code can then
> work its way from there.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> To record the full 64-bit value, some of the unused bits in the %cs slot
> could be used. Sadly that slot has saved_upcall_mask in an unhelpful
> location, otherwise simply storing low and high 32 bits in those two
> separate half-slots would be a pretty obvious choice. As long as
> "entry_ssp" is used in non-guest-entry frames only, we could of course
> put half of it into a union with saved_upcall_mask ...
>
> Else may want to put a BUILD_BUG_ON(VADDR_BITS > 48) somewhere, but I'm
> afraid I can't really identify a good place for such to live.

Perhaps in reinit_bsp_stack() when we enable SHSTK on the BSP?

Having it anywhere vaguely relevant is better than not having it.

>
> Leveraging that the CPU stores zero in the upper bits of the selector
> register slots, the save sequence could also be
>
>         shl   $16, %rcx
>         or    %rcx, UREGS_entry_ssp-2(%rsp)
>
> That's shorter and avoids a 16-bit operation, but may be less desirable,
> for being a read-modify-write access.

I doubt you'll be able to measure a difference between the two options
(it's into the active stack, after all), but the two stores is probably
marginally better.  When shstks are active, we're taking a large hit
from the busy token handling.

I was concerned by the misaligned access, but it's not misaligned, is
it?  It's the start of entry_ssp which is misaligned and the -2 brings
it back to being properly aligned.

> --- a/xen/arch/x86/include/asm/asm_defns.h
> +++ b/xen/arch/x86/include/asm/asm_defns.h
> @@ -221,7 +221,7 @@ static always_inline void stac(void)
>  #endif
>  
>  #ifdef __ASSEMBLY__
> -.macro SAVE_ALL compat=0
> +.macro SAVE_ALL compat=0 ssp=IS_ENABLED(CONFIG_XEN_SHSTK)

I'm not sure this is what you want to do.  Because it's only the
default, we'll still....

>          addq  $-(UREGS_error_code-UREGS_r15), %rsp
>          cld
>          movq  %rdi,UREGS_rdi(%rsp)
> @@ -235,6 +235,9 @@ static always_inline void stac(void)
>          movq  %rax,UREGS_rax(%rsp)
>          xor   %eax, %eax
>  .if !\compat
> +.if \ssp
> +        rdsspq %rcx

... pick this up even in !CONFIG_XEN_SHSTK builds, and ...

> +.endif
>          movq  %r8,UREGS_r8(%rsp)
>          movq  %r9,UREGS_r9(%rsp)
>          movq  %r10,UREGS_r10(%rsp)
> @@ -264,6 +267,11 @@ static always_inline void stac(void)
>          xor   %r13d, %r13d
>          xor   %r14d, %r14d
>          xor   %r15d, %r15d
> +.if \ssp && !\compat
> +        mov   %cx, UREGS_entry_ssp(%rsp)
> +        shr   $16, %rcx
> +        mov   %ecx, UREGS_entry_ssp+2(%rsp)

... store it here.

I think you need to use ssp=1 by default, and

#ifdef CONFIG_XEN_SHSTK
.if
    ...

for these two blocks, so they disappear properly in !SHSTK builds.

But for the rest of the behaviour, there are two overlapping things,
because you end up getting entry_ssp=0 in SHSTK builds running on
hardware without shstk active.

And with that in mind, to confirm, the RDSSP block depends on the xor
%ecx,%ecx between the two hunks in order to function as intended?

> --- a/xen/include/public/arch-x86/xen-x86_64.h
> +++ b/xen/include/public/arch-x86/xen-x86_64.h
> @@ -183,7 +183,19 @@ struct cpu_user_regs {
>      uint8_t  _pad1[3];
>      __DECL_REG_LO16(flags); /* rflags.IF == !saved_upcall_mask */
>      __DECL_REG_LO8(sp);
> -    uint16_t ss, _pad2[3];
> +    uint16_t ss;
> +#if !defined(__XEN__)
> +    uint16_t _pad2[3];
> +#elif defined(COMPILE_OFFSETS)
> +    uint16_t entry_ssp[3];
> +#else
> +    /*
> +     * This points _at_ the corresponding shadow stack frame; it is _not_ the
> +     * outer context's SSP.  That, if the outer context has CET-SS enabled,
> +     * is stored in the top slot of the pointed to shadow stack frame.
> +     */
> +    signed long entry_ssp:48;
> +#endif

I have to admit that I dislike this.  (And only some of that is because
it's work I'm going to have to revert in order to make FRED support work...)

We desperately need to break our use of this structure to start with,
and with that done, we don't need to play games about hiding SSP in a
spare 6 bytes in an ABI dubiously made public nearly two decades ago.

How hard would it be just:

#define cpu_user_regs abi_cpu_user_regs
#include <public/xen.h>
#undef cpu_user_regs

and make a copy of cpu_user_regs that we can really put an SSP field into?

It would make this patch simpler, and we could finally get the vm86
block off the stack (which also fixes OoB accesses in the #DF handler -
can't remember if I finished the bodge around that or not.)

Irrespective of anything else, why do we have COMPILE_OFFSETS getting in
here?

~Andrew
Jan Beulich Feb. 29, 2024, 9:39 a.m. UTC | #2
On 28.02.2024 16:16, Andrew Cooper wrote:
> On 28/02/2024 1:52 pm, Jan Beulich wrote:
>> We will want to use that value for call trace generation, and likely
>> also to eliminate the somewhat fragile shadow stack searching done in
>> fixup_exception_return(). For those purposes, guest-only entry points do
>> not need to record that value.
>>
>> To keep the saving code simple, record our own SSP that corresponds to
>> an exception frame, pointing to the top of the shadow stack counterpart
>> of what the CPU has saved on the regular stack. Consuming code can then
>> work its way from there.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> To record the full 64-bit value, some of the unused bits in the %cs slot
>> could be used. Sadly that slot has saved_upcall_mask in an unhelpful
>> location, otherwise simply storing low and high 32 bits in those two
>> separate half-slots would be a pretty obvious choice. As long as
>> "entry_ssp" is used in non-guest-entry frames only, we could of course
>> put half of it into a union with saved_upcall_mask ...
>>
>> Else may want to put a BUILD_BUG_ON(VADDR_BITS > 48) somewhere, but I'm
>> afraid I can't really identify a good place for such to live.
> 
> Perhaps in reinit_bsp_stack() when we enable SHSTK on the BSP?
> 
> Having it anywhere vaguely relevant is better than not having it.

Okay.

>> Leveraging that the CPU stores zero in the upper bits of the selector
>> register slots, the save sequence could also be
>>
>>         shl   $16, %rcx
>>         or    %rcx, UREGS_entry_ssp-2(%rsp)
>>
>> That's shorter and avoids a 16-bit operation, but may be less desirable,
>> for being a read-modify-write access.
> 
> I doubt you'll be able to measure a difference between the two options
> (it's into the active stack, after all), but the two stores is probably
> marginally better.  When shstks are active, we're taking a large hit
> from the busy token handling.
> 
> I was concerned by the misaligned access, but it's not misaligned, is
> it?  It's the start of entry_ssp which is misaligned and the -2 brings
> it back to being properly aligned.

Correct.

>> --- a/xen/arch/x86/include/asm/asm_defns.h
>> +++ b/xen/arch/x86/include/asm/asm_defns.h
>> @@ -221,7 +221,7 @@ static always_inline void stac(void)
>>  #endif
>>  
>>  #ifdef __ASSEMBLY__
>> -.macro SAVE_ALL compat=0
>> +.macro SAVE_ALL compat=0 ssp=IS_ENABLED(CONFIG_XEN_SHSTK)
> 
> I'm not sure this is what you want to do.  Because it's only the
> default, we'll still....
> 
>>          addq  $-(UREGS_error_code-UREGS_r15), %rsp
>>          cld
>>          movq  %rdi,UREGS_rdi(%rsp)
>> @@ -235,6 +235,9 @@ static always_inline void stac(void)
>>          movq  %rax,UREGS_rax(%rsp)
>>          xor   %eax, %eax
>>  .if !\compat
>> +.if \ssp
>> +        rdsspq %rcx
> 
> ... pick this up even in !CONFIG_XEN_SHSTK builds, and ...
> 
>> +.endif
>>          movq  %r8,UREGS_r8(%rsp)
>>          movq  %r9,UREGS_r9(%rsp)
>>          movq  %r10,UREGS_r10(%rsp)
>> @@ -264,6 +267,11 @@ static always_inline void stac(void)
>>          xor   %r13d, %r13d
>>          xor   %r14d, %r14d
>>          xor   %r15d, %r15d
>> +.if \ssp && !\compat
>> +        mov   %cx, UREGS_entry_ssp(%rsp)
>> +        shr   $16, %rcx
>> +        mov   %ecx, UREGS_entry_ssp+2(%rsp)
> 
> ... store it here.
> 
> I think you need to use ssp=1 by default, and
> 
> #ifdef CONFIG_XEN_SHSTK
> .if
>     ...
> 
> for these two blocks, so they disappear properly in !SHSTK builds.

I'm afraid I don't follow: The macro parameter exists for use sites
to pass 0 even in SHSTK builds, for the entry-from-guest paths where
its recording is of no interest. Non-zero should never be passed
explicitly. Perhaps I ought to add a comment to this effect:

/* Use sites may override ssp to 0. It should never be overridden to 1. */

If that doesn't address your concern, then I'm afraid I'm not fully
understanding your comments, or I'm overlooking something crucial.

> But for the rest of the behaviour, there are two overlapping things,
> because you end up getting entry_ssp=0 in SHSTK builds running on
> hardware without shstk active.

Yes. I guess I don't understand what you're trying to indicate to
me.

> And with that in mind, to confirm, the RDSSP block depends on the xor
> %ecx,%ecx between the two hunks in order to function as intended?

Yes. In fact initially I had used %edx, with "interesting" effects.
It's not said anywhere in the macro that %edx needs to be zero by
the point the macro completes; comments to this effect exist only
past several of the use sites of the macro.

>> --- a/xen/include/public/arch-x86/xen-x86_64.h
>> +++ b/xen/include/public/arch-x86/xen-x86_64.h
>> @@ -183,7 +183,19 @@ struct cpu_user_regs {
>>      uint8_t  _pad1[3];
>>      __DECL_REG_LO16(flags); /* rflags.IF == !saved_upcall_mask */
>>      __DECL_REG_LO8(sp);
>> -    uint16_t ss, _pad2[3];
>> +    uint16_t ss;
>> +#if !defined(__XEN__)
>> +    uint16_t _pad2[3];
>> +#elif defined(COMPILE_OFFSETS)
>> +    uint16_t entry_ssp[3];
>> +#else
>> +    /*
>> +     * This points _at_ the corresponding shadow stack frame; it is _not_ the
>> +     * outer context's SSP.  That, if the outer context has CET-SS enabled,
>> +     * is stored in the top slot of the pointed to shadow stack frame.
>> +     */
>> +    signed long entry_ssp:48;
>> +#endif
> 
> I have to admit that I dislike this.  (And only some of that is because
> it's work I'm going to have to revert in order to make FRED support work...)

Right, some of the bits in the various slots which have space available
are used there. But the frame layout there is different anyway, so by
that point we won't get away without re-working our exception frame
layout. Taking care of the new extra field then is, I expect, not going
to make much of a difference, when without doing the transformation now
we can have some immediate gain.

> We desperately need to break our use of this structure to start with,
> and with that done, we don't need to play games about hiding SSP in a
> spare 6 bytes in an ABI dubiously made public nearly two decades ago.
> 
> How hard would it be just:
> 
> #define cpu_user_regs abi_cpu_user_regs
> #include <public/xen.h>
> #undef cpu_user_regs
> 
> and make a copy of cpu_user_regs that we can really put an SSP field into?

Well, that's easy to write here, but this pattern would then need
repeating in a few dozen places. Even abstracting it by way of a
helper header would seem problematic to me: We can't very well
forbid common code to include public/xen.h directly. We also have
a couple of direct inclusions of public/arch-x86/xen.h, which would
similarly need taking care of.

A better approach might be an #ifdef __XEN__ inside the header
itself. I could try that, but I'd stop as soon as I consider knock-
on effects too heavy for the simple purpose here.

I know you've been pointing out that we want to change how the stack
is organized. This not having happened yet was, for me, largely
because of quite likely not being straightforward to achieve.

> It would make this patch simpler, and we could finally get the vm86
> block off the stack (which also fixes OoB accesses in the #DF handler -
> can't remember if I finished the bodge around that or not.)
> 
> Irrespective of anything else, why do we have COMPILE_OFFSETS getting in
> here?

Because offsetof() implies determining the address of a field, which
is not allowed for bitfields.

Jan
diff mbox series

Patch

--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1354,6 +1354,7 @@  void arch_get_info_guest(struct vcpu *v,
     if ( !compat )
     {
         memcpy(&c.nat->user_regs, &v->arch.user_regs, sizeof(c.nat->user_regs));
+        c.nat->user_regs.entry_ssp = 0;
         if ( is_pv_domain(d) )
             memcpy(c.nat->trap_ctxt, v->arch.pv.trap_ctxt,
                    sizeof(c.nat->trap_ctxt));
--- a/xen/arch/x86/hvm/svm/entry.S
+++ b/xen/arch/x86/hvm/svm/entry.S
@@ -94,7 +94,7 @@  __UNLIKELY_END(nsvm_hap)
         sti
         vmrun
 
-        SAVE_ALL
+        SAVE_ALL ssp=0
 
         GET_CURRENT(bx)
 
--- a/xen/arch/x86/hvm/vmx/entry.S
+++ b/xen/arch/x86/hvm/vmx/entry.S
@@ -25,7 +25,7 @@ 
 #define VMLAUNCH     .byte 0x0f,0x01,0xc2
 
 ENTRY(vmx_asm_vmexit_handler)
-        SAVE_ALL
+        SAVE_ALL ssp=0
 
         mov  %cr2,%rax
         GET_CURRENT(bx)
@@ -119,7 +119,7 @@  UNLIKELY_END(realmode)
 
 .Lvmx_vmentry_fail:
         sti
-        SAVE_ALL
+        SAVE_ALL ssp=0
 
         /*
          * SPEC_CTRL_ENTRY notes
--- a/xen/arch/x86/include/asm/asm_defns.h
+++ b/xen/arch/x86/include/asm/asm_defns.h
@@ -221,7 +221,7 @@  static always_inline void stac(void)
 #endif
 
 #ifdef __ASSEMBLY__
-.macro SAVE_ALL compat=0
+.macro SAVE_ALL compat=0 ssp=IS_ENABLED(CONFIG_XEN_SHSTK)
         addq  $-(UREGS_error_code-UREGS_r15), %rsp
         cld
         movq  %rdi,UREGS_rdi(%rsp)
@@ -235,6 +235,9 @@  static always_inline void stac(void)
         movq  %rax,UREGS_rax(%rsp)
         xor   %eax, %eax
 .if !\compat
+.if \ssp
+        rdsspq %rcx
+.endif
         movq  %r8,UREGS_r8(%rsp)
         movq  %r9,UREGS_r9(%rsp)
         movq  %r10,UREGS_r10(%rsp)
@@ -264,6 +267,11 @@  static always_inline void stac(void)
         xor   %r13d, %r13d
         xor   %r14d, %r14d
         xor   %r15d, %r15d
+.if \ssp && !\compat
+        mov   %cx, UREGS_entry_ssp(%rsp)
+        shr   $16, %rcx
+        mov   %ecx, UREGS_entry_ssp+2(%rsp)
+.endif
 .endm
 
 #define LOAD_ONE_REG(reg, compat) \
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -48,6 +48,7 @@  void __dummy__(void)
     OFFSET(UREGS_eflags, struct cpu_user_regs, rflags);
     OFFSET(UREGS_rsp, struct cpu_user_regs, rsp);
     OFFSET(UREGS_ss, struct cpu_user_regs, ss);
+    OFFSET(UREGS_entry_ssp, struct cpu_user_regs, entry_ssp);
     OFFSET(UREGS_kernel_sizeof, struct cpu_user_regs, es);
     BLANK();
 
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -257,7 +257,7 @@  FUNC(lstar_enter)
         pushq $0
         BUILD_BUG_ON(TRAP_syscall & 0xff)
         movb  $TRAP_syscall >> 8, EFRAME_entry_vector + 1(%rsp)
-        SAVE_ALL
+        SAVE_ALL ssp=0
 
         SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
         /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
@@ -296,7 +296,7 @@  FUNC(cstar_enter)
         pushq $0
         BUILD_BUG_ON(TRAP_syscall & 0xff)
         movb  $TRAP_syscall >> 8, EFRAME_entry_vector + 1(%rsp)
-        SAVE_ALL
+        SAVE_ALL ssp=0
 
         SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
         /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
@@ -339,7 +339,7 @@  LABEL(sysenter_eflags_saved, 0)
         pushq $0
         BUILD_BUG_ON(TRAP_syscall & 0xff)
         movb  $TRAP_syscall >> 8, EFRAME_entry_vector + 1(%rsp)
-        SAVE_ALL
+        SAVE_ALL ssp=0
 
         SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
         /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
@@ -394,7 +394,7 @@  FUNC(entry_int80)
         ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP
         pushq $0
         movb  $0x80, EFRAME_entry_vector(%rsp)
-        SAVE_ALL
+        SAVE_ALL ssp=0
 
         SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
         /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
--- a/xen/include/public/arch-x86/xen-x86_64.h
+++ b/xen/include/public/arch-x86/xen-x86_64.h
@@ -183,7 +183,19 @@  struct cpu_user_regs {
     uint8_t  _pad1[3];
     __DECL_REG_LO16(flags); /* rflags.IF == !saved_upcall_mask */
     __DECL_REG_LO8(sp);
-    uint16_t ss, _pad2[3];
+    uint16_t ss;
+#if !defined(__XEN__)
+    uint16_t _pad2[3];
+#elif defined(COMPILE_OFFSETS)
+    uint16_t entry_ssp[3];
+#else
+    /*
+     * This points _at_ the corresponding shadow stack frame; it is _not_ the
+     * outer context's SSP.  That, if the outer context has CET-SS enabled,
+     * is stored in the top slot of the pointed to shadow stack frame.
+     */
+    signed long entry_ssp:48;
+#endif
     uint16_t es, _pad3[3];
     uint16_t ds, _pad4[3];
     uint16_t fs, _pad5[3];