[v3,2/4] x86: suppress SMEP and SMAP while running 32-bit PV guest code

Message ID 56EA72C402000078000DD92F@prv-mh.provo.novell.com (mailing list archive)
State New, archived

Commit Message

Jan Beulich March 17, 2016, 8:03 a.m. UTC
Since such guests' kernel code runs in ring 1, their memory accesses,
at the paging layer, are supervisor mode ones, and hence subject to
SMAP/SMEP checks. Such guests cannot be expected to be aware of those
two features though (and so far we also don't expose the respective
feature flags), and hence may suffer page faults they cannot deal with.
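
For illustration, the check such a kernel can now trip over boils down to
roughly the following (stand-alone, simplified C model of the architectural
condition - not Xen code; implicit supervisor accesses and non-leaf U/S bits
are ignored):

    #include <stdbool.h>

    /* A ring-1 PV kernel access has cpl < 3 and is therefore a supervisor
     * access at the paging layer, even though the guest regards it as its
     * own "kernel mode". */
    bool smep_smap_faults(unsigned int cpl, bool pte_user, bool insn_fetch,
                          bool cr4_smep, bool cr4_smap, bool eflags_ac)
    {
        bool supervisor = cpl < 3;          /* rings 0, 1 and 2 */

        if ( !supervisor || !pte_user )
            return false;
        if ( insn_fetch )
            return cr4_smep;                /* SMEP: no fetch from user pages */
        return cr4_smap && !eflags_ac;      /* SMAP: no data access unless AC */
    }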

While the placement of the re-enabling slightly weakens the intended
protection, it was selected such that 64-bit paths would remain
unaffected where possible. At the expense of a further performance hit
the re-enabling could be put right next to the CLACs.
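
In C terms, the scheme the assembly changes below implement is roughly the
following (illustrative sketch only; cr4_pv32_mask, the cached CR4 value in
struct cpu_info, and write_cr4() mirror what the patch uses, while the two C
functions themselves are invented for illustration):

    #define X86_CR4_SMEP (1UL << 20)
    #define X86_CR4_SMAP (1UL << 21)

    struct cpu_info { unsigned long cr4; };    /* cached CR4 only (CPUINFO_cr4) */
    extern unsigned long cr4_pv32_mask;        /* SMEP/SMAP bits enabled at boot */
    extern void write_cr4(unsigned long val);  /* the actual mov to %cr4 */

    /* Leaving Xen for 32-bit PV guest kernel mode (compat_restore_all_guest): */
    static void pv32_suppress(struct cpu_info *ci)
    {
        ci->cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
        write_cr4(ci->cr4);
    }

    /* Re-entering Xen from such a guest (CR4_PV32_RESTORE / cr4_pv32_restore): */
    static void pv32_restore(struct cpu_info *ci)
    {
        if ( !(ci->cr4 & (X86_CR4_SMEP | X86_CR4_SMAP)) )
        {
            ci->cr4 |= cr4_pv32_mask;
            write_cr4(ci->cr4);
        }
    }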

Note that this introduces a number of extra TLB flushes - a CR4.SMEP
transition from 0 to 1 always causes a flush, and a transition from 1
to 0 may do so as well.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Move up CR4_PV32_RESTORE in handle_ist_exception. Don't clear SMEP/
    SMAP upon exiting to guest user mode. Read current CR4 value from
    memory cache even in assembly code.
v2: Use more generic symbol/label names. Comment the BUG in assembly
    code and restrict it to debug builds. Add C equivalent to #PF
    re-execution condition in a comment. Use .skip instead of .org in
    handle_exception to avoid gas bug (and its slightly ugly
    workaround). Use a properly named label instead of a numeric one
    in handle_exception.

--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -67,6 +67,8 @@ boolean_param("smep", opt_smep);
 static bool_t __initdata opt_smap = 1;
 boolean_param("smap", opt_smap);
 
+unsigned long __read_mostly cr4_pv32_mask;
+
 /* Boot dom0 in pvh mode */
 static bool_t __initdata opt_dom0pvh;
 boolean_param("dom0pvh", opt_dom0pvh);
@@ -1364,6 +1366,8 @@ void __init noreturn __start_xen(unsigne
     if ( cpu_has_smap )
         set_in_cr4(X86_CR4_SMAP);
 
+    cr4_pv32_mask = mmu_cr4_features & (X86_CR4_SMEP | X86_CR4_SMAP);
+
     if ( cpu_has_fsgsbase )
         set_in_cr4(X86_CR4_FSGSBASE);
 
@@ -1500,7 +1504,10 @@ void __init noreturn __start_xen(unsigne
      * copy_from_user().
      */
     if ( cpu_has_smap )
+    {
+        cr4_pv32_mask &= ~X86_CR4_SMAP;
         write_cr4(read_cr4() & ~X86_CR4_SMAP);
+    }
 
     printk("%sNX (Execute Disable) protection %sactive\n",
            cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ",
@@ -1517,7 +1524,10 @@ void __init noreturn __start_xen(unsigne
         panic("Could not set up DOM0 guest OS");
 
     if ( cpu_has_smap )
+    {
         write_cr4(read_cr4() | X86_CR4_SMAP);
+        cr4_pv32_mask |= X86_CR4_SMAP;
+    }
 
     /* Scrub RAM that is still free and so may go to an unprivileged domain. */
     scrub_heap_pages();
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -135,6 +135,7 @@ void __dummy__(void)
     OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs);
     OFFSET(CPUINFO_processor_id, struct cpu_info, processor_id);
     OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu);
+    OFFSET(CPUINFO_cr4, struct cpu_info, cr4);
     DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info));
     BLANK();
 
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -16,14 +16,16 @@ ENTRY(compat_hypercall)
         ASM_CLAC
         pushq $0
         SAVE_VOLATILE type=TRAP_syscall compat=1
+        CR4_PV32_RESTORE
 
         cmpb  $0,untrusted_msi(%rip)
 UNLIKELY_START(ne, msi_check)
         movl  $HYPERCALL_VECTOR,%edi
         call  check_for_unexpected_msi
-        LOAD_C_CLOBBERED
+        LOAD_C_CLOBBERED compat=1 ax=0
 UNLIKELY_END(msi_check)
 
+        movl  UREGS_rax(%rsp),%eax
         GET_CURRENT(%rbx)
 
         cmpl  $NR_hypercalls,%eax
@@ -33,7 +35,6 @@ UNLIKELY_END(msi_check)
         pushq UREGS_rbx(%rsp); pushq %rcx; pushq %rdx; pushq %rsi; pushq %rdi
         pushq UREGS_rbp+5*8(%rsp)
         leaq  compat_hypercall_args_table(%rip),%r10
-        movl  %eax,%eax
         movl  $6,%ecx
         subb  (%r10,%rax,1),%cl
         movq  %rsp,%rdi
@@ -48,7 +49,6 @@ UNLIKELY_END(msi_check)
 #define SHADOW_BYTES 16 /* Shadow EIP + shadow hypercall # */
 #else
         /* Relocate argument registers and zero-extend to 64 bits. */
-        movl  %eax,%eax              /* Hypercall #  */
         xchgl %ecx,%esi              /* Arg 2, Arg 4 */
         movl  %edx,%edx              /* Arg 3        */
         movl  %edi,%r8d              /* Arg 5        */
@@ -174,10 +174,61 @@ compat_bad_hypercall:
 /* %rbx: struct vcpu, interrupts disabled */
 ENTRY(compat_restore_all_guest)
         ASSERT_INTERRUPTS_DISABLED
+.Lcr4_orig:
+        ASM_NOP8 /* testb $3,UREGS_cs(%rsp) */
+        ASM_NOP2 /* jpe   .Lcr4_alt_end */
+        ASM_NOP8 /* mov   CPUINFO_cr4...(%rsp), %rax */
+        ASM_NOP6 /* and   $..., %rax */
+        ASM_NOP8 /* mov   %rax, CPUINFO_cr4...(%rsp) */
+        ASM_NOP3 /* mov   %rax, %cr4 */
+.Lcr4_orig_end:
+        .pushsection .altinstr_replacement, "ax"
+.Lcr4_alt:
+        testb $3,UREGS_cs(%rsp)
+        jpe   .Lcr4_alt_end
+        mov   CPUINFO_cr4-CPUINFO_guest_cpu_user_regs(%rsp), %rax
+        and   $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax
+        mov   %rax, CPUINFO_cr4-CPUINFO_guest_cpu_user_regs(%rsp)
+        mov   %rax, %cr4
+.Lcr4_alt_end:
+        .section .altinstructions, "a"
+        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, \
+                             (.Lcr4_orig_end - .Lcr4_orig), \
+                             (.Lcr4_alt_end - .Lcr4_alt)
+        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, \
+                             (.Lcr4_orig_end - .Lcr4_orig), \
+                             (.Lcr4_alt_end - .Lcr4_alt)
+        .popsection
         RESTORE_ALL adj=8 compat=1
 .Lft0:  iretq
         _ASM_PRE_EXTABLE(.Lft0, handle_exception)
 
+/* This mustn't modify registers other than %rax. */
+ENTRY(cr4_pv32_restore)
+        push  %rdx
+        GET_CPUINFO_FIELD(cr4, %rdx)
+        mov   (%rdx), %rax
+        test  $X86_CR4_SMEP|X86_CR4_SMAP,%eax
+        jnz   0f
+        or    cr4_pv32_mask(%rip), %rax
+        mov   %rax, %cr4
+        mov   %rax, (%rdx)
+        pop   %rdx
+        ret
+0:
+#ifndef NDEBUG
+        /* Check that _all_ of the bits intended to be set actually are. */
+        mov   %cr4, %rax
+        and   cr4_pv32_mask(%rip), %eax
+        cmp   cr4_pv32_mask(%rip), %eax
+        je    1f
+        BUG
+1:
+#endif
+        pop   %rdx
+        xor   %eax, %eax
+        ret
+
 /* %rdx: trap_bounce, %rbx: struct vcpu */
 ENTRY(compat_post_handle_exception)
         testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
@@ -190,6 +241,7 @@ ENTRY(compat_post_handle_exception)
 /* See lstar_enter for entry register state. */
 ENTRY(cstar_enter)
         sti
+        CR4_PV32_RESTORE
         movq  8(%rsp),%rax /* Restore %rax. */
         movq  $FLAT_KERNEL_SS,8(%rsp)
         pushq %r11
@@ -225,6 +277,7 @@ UNLIKELY_END(compat_syscall_gpf)
         jmp   .Lcompat_bounce_exception
 
 ENTRY(compat_sysenter)
+        CR4_PV32_RESTORE
         movq  VCPU_trap_ctxt(%rbx),%rcx
         cmpb  $TRAP_gp_fault,UREGS_entry_vector(%rsp)
         movzwl VCPU_sysenter_sel(%rbx),%eax
@@ -238,6 +291,7 @@ ENTRY(compat_sysenter)
         jmp   compat_test_all_events
 
 ENTRY(compat_int80_direct_trap)
+        CR4_PV32_RESTORE
         call  compat_create_bounce_frame
         jmp   compat_test_all_events
 
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable)
 
 ENTRY(common_interrupt)
         SAVE_ALL CLAC
+        CR4_PV32_RESTORE
         movq %rsp,%rdi
         callq do_IRQ
         jmp ret_from_intr
@@ -454,13 +455,67 @@ ENTRY(page_fault)
 GLOBAL(handle_exception)
         SAVE_ALL CLAC
 handle_exception_saved:
+        GET_CURRENT(%rbx)
         testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
         jz    exception_with_ints_disabled
+
+.Lcr4_pv32_orig:
+        jmp   .Lcr4_pv32_done
+        .skip (.Lcr4_pv32_alt_end - .Lcr4_pv32_alt) - (. - .Lcr4_pv32_orig), 0xcc
+        .pushsection .altinstr_replacement, "ax"
+.Lcr4_pv32_alt:
+        mov   VCPU_domain(%rbx),%rax
+.Lcr4_pv32_alt_end:
+        .section .altinstructions, "a"
+        altinstruction_entry .Lcr4_pv32_orig, .Lcr4_pv32_alt, \
+                             X86_FEATURE_SMEP, \
+                             (.Lcr4_pv32_alt_end - .Lcr4_pv32_alt), \
+                             (.Lcr4_pv32_alt_end - .Lcr4_pv32_alt)
+        altinstruction_entry .Lcr4_pv32_orig, .Lcr4_pv32_alt, \
+                             X86_FEATURE_SMAP, \
+                             (.Lcr4_pv32_alt_end - .Lcr4_pv32_alt), \
+                             (.Lcr4_pv32_alt_end - .Lcr4_pv32_alt)
+        .popsection
+
+        testb $3,UREGS_cs(%rsp)
+        jz    .Lcr4_pv32_done
+        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
+        je    .Lcr4_pv32_done
+        call  cr4_pv32_restore
+        /*
+         * An NMI or #MC may occur between clearing CR4.SMEP / CR4.SMAP in
+         * compat_restore_all_guest and it actually returning to guest
+         * context, in which case the guest would run with the two features
+         * enabled. The only bad that can happen from this is a kernel mode
+         * #PF which the guest doesn't expect. Rather than trying to make the
+         * NMI/#MC exit path honor the intended CR4 setting, simply check
+         * whether the wrong CR4 was in use when the #PF occurred, and exit
+         * back to the guest (which will in turn clear the two CR4 bits) to
+         * re-execute the instruction. If we get back here, the CR4 bits
+         * should then be found clear (unless another NMI/#MC occurred at
+         * exactly the right time), and we'll continue processing the
+         * exception as normal.
+         */
+        test  %rax,%rax
+        jnz   .Lcr4_pv32_done
+        /*
+         * The below effectively is
+         * if ( regs->entry_vector == TRAP_page_fault &&
+         *      (regs->error_code & PFEC_page_present) &&
+         *      !(regs->error_code & ~(PFEC_write_access|PFEC_insn_fetch)) )
+         *     goto compat_test_all_events;
+         */
+        mov   $PFEC_page_present,%al
+        cmpb  $TRAP_page_fault,UREGS_entry_vector(%rsp)
+        jne   .Lcr4_pv32_done
+        xor   UREGS_error_code(%rsp),%eax
+        test  $~(PFEC_write_access|PFEC_insn_fetch),%eax
+        jz    compat_test_all_events
+.Lcr4_pv32_done:
         sti
 1:      movq  %rsp,%rdi
         movzbl UREGS_entry_vector(%rsp),%eax
         leaq  exception_table(%rip),%rdx
-        GET_CURRENT(%rbx)
         PERFC_INCR(exceptions, %rax, %rbx)
         callq *(%rdx,%rax,8)
         testb $3,UREGS_cs(%rsp)
@@ -590,6 +645,7 @@ ENTRY(nmi)
         movl  $TRAP_nmi,4(%rsp)
 handle_ist_exception:
         SAVE_ALL CLAC
+        CR4_PV32_RESTORE
         testb $3,UREGS_cs(%rsp)
         jz    1f
         /* Interrupted guest context. Copy the context to stack bottom. */
--- a/xen/include/asm-x86/asm_defns.h
+++ b/xen/include/asm-x86/asm_defns.h
@@ -209,6 +209,16 @@ void ret_from_intr(void);
 
 #define ASM_STAC ASM_AC(STAC)
 #define ASM_CLAC ASM_AC(CLAC)
+
+#define CR4_PV32_RESTORE                                           \
+        667: ASM_NOP5;                                             \
+        .pushsection .altinstr_replacement, "ax";                  \
+        668: call cr4_pv32_restore;                                \
+        .section .altinstructions, "a";                            \
+        altinstruction_entry 667b, 668b, X86_FEATURE_SMEP, 5, 5;   \
+        altinstruction_entry 667b, 668b, X86_FEATURE_SMAP, 5, 5;   \
+        .popsection
+
 #else
 static always_inline void clac(void)
 {
@@ -308,14 +318,18 @@ static always_inline void stac(void)
  *
  * For the way it is used in RESTORE_ALL, this macro must preserve EFLAGS.ZF.
  */
-.macro LOAD_C_CLOBBERED compat=0
+.macro LOAD_C_CLOBBERED compat=0 ax=1
 .if !\compat
         movq  UREGS_r11(%rsp),%r11
         movq  UREGS_r10(%rsp),%r10
         movq  UREGS_r9(%rsp),%r9
         movq  UREGS_r8(%rsp),%r8
-.endif
+.if \ax
         movq  UREGS_rax(%rsp),%rax
+.endif
+.elseif \ax
+        movl  UREGS_rax(%rsp),%eax
+.endif
         movq  UREGS_rcx(%rsp),%rcx
         movq  UREGS_rdx(%rsp),%rdx
         movq  UREGS_rsi(%rsp),%rsi
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -134,12 +134,12 @@
 #define TF_kernel_mode         (1<<_TF_kernel_mode)
 
 /* #PF error code values. */
-#define PFEC_page_present   (1U<<0)
-#define PFEC_write_access   (1U<<1)
-#define PFEC_user_mode      (1U<<2)
-#define PFEC_reserved_bit   (1U<<3)
-#define PFEC_insn_fetch     (1U<<4)
-#define PFEC_prot_key       (1U<<5)
+#define PFEC_page_present   (_AC(1,U) << 0)
+#define PFEC_write_access   (_AC(1,U) << 1)
+#define PFEC_user_mode      (_AC(1,U) << 2)
+#define PFEC_reserved_bit   (_AC(1,U) << 3)
+#define PFEC_insn_fetch     (_AC(1,U) << 4)
+#define PFEC_prot_key       (_AC(1,U) << 5)
 /* Internally used only flags. */
 #define PFEC_page_paged     (1U<<16)
 #define PFEC_page_shared    (1U<<17)

Comments

Konrad Rzeszutek Wilk March 25, 2016, 6:01 p.m. UTC | #1
> @@ -174,10 +174,61 @@ compat_bad_hypercall:
>  /* %rbx: struct vcpu, interrupts disabled */
>  ENTRY(compat_restore_all_guest)
>          ASSERT_INTERRUPTS_DISABLED
> +.Lcr4_orig:
> +        ASM_NOP8 /* testb $3,UREGS_cs(%rsp) */
> +        ASM_NOP2 /* jpe   .Lcr4_alt_end */
> +        ASM_NOP8 /* mov   CPUINFO_cr4...(%rsp), %rax */
> +        ASM_NOP6 /* and   $..., %rax */
> +        ASM_NOP8 /* mov   %rax, CPUINFO_cr4...(%rsp) */
> +        ASM_NOP3 /* mov   %rax, %cr4 */
> +.Lcr4_orig_end:
> +        .pushsection .altinstr_replacement, "ax"
> +.Lcr4_alt:
> +        testb $3,UREGS_cs(%rsp)
> +        jpe   .Lcr4_alt_end

This would jump if the result of the last operation had an even number
of bits set. And 'testb' is an 'and' operation, which would give us
'011' (for $3).

Why not just depend on the ZF? Other places that test UREGS_cs()
look to be using that?

> +        mov   CPUINFO_cr4-CPUINFO_guest_cpu_user_regs(%rsp), %rax
> +        and   $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax
> +        mov   %rax, CPUINFO_cr4-CPUINFO_guest_cpu_user_regs(%rsp)
> +        mov   %rax, %cr4
> +.Lcr4_alt_end:
> +        .section .altinstructions, "a"
> +        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, \
> +                             (.Lcr4_orig_end - .Lcr4_orig), \
> +                             (.Lcr4_alt_end - .Lcr4_alt)
> +        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, \
> +                             (.Lcr4_orig_end - .Lcr4_orig), \
> +                             (.Lcr4_alt_end - .Lcr4_alt)
> +        .popsection
>          RESTORE_ALL adj=8 compat=1
>  .Lft0:  iretq
>          _ASM_PRE_EXTABLE(.Lft0, handle_exception)
>  
> +/* This mustn't modify registers other than %rax. */
> +ENTRY(cr4_pv32_restore)
> +        push  %rdx
> +        GET_CPUINFO_FIELD(cr4, %rdx)
> +        mov   (%rdx), %rax
> +        test  $X86_CR4_SMEP|X86_CR4_SMAP,%eax
> +        jnz   0f
> +        or    cr4_pv32_mask(%rip), %rax
> +        mov   %rax, %cr4
> +        mov   %rax, (%rdx)

Here you leave %rax with the cr4_pv32_mask value, but:

> +        pop   %rdx
> +        ret
> +0:
> +#ifndef NDEBUG
> +        /* Check that _all_ of the bits intended to be set actually are. */
> +        mov   %cr4, %rax
> +        and   cr4_pv32_mask(%rip), %eax
> +        cmp   cr4_pv32_mask(%rip), %eax
> +        je    1f
> +        BUG
> +1:
> +#endif
> +        pop   %rdx
> +        xor   %eax, %eax

.. Here you clear it. Any particular reason?

> +        ret
> +
>  /* %rdx: trap_bounce, %rbx: struct vcpu */
>  ENTRY(compat_post_handle_exception)
>          testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
.. snip..
> -.macro LOAD_C_CLOBBERED compat=0
> +.macro LOAD_C_CLOBBERED compat=0 ax=1
>  .if !\compat
>          movq  UREGS_r11(%rsp),%r11
>          movq  UREGS_r10(%rsp),%r10
>          movq  UREGS_r9(%rsp),%r9
>          movq  UREGS_r8(%rsp),%r8
> -.endif
> +.if \ax
>          movq  UREGS_rax(%rsp),%rax
> +.endif

Why the .endif here considering you are doing an:

> +.elseif \ax

an else if here?
> +        movl  UREGS_rax(%rsp),%eax
> +.endif

Actually, why two 'if ax' checks?

Or am I reading this incorrectly?
Jan Beulich March 29, 2016, 6:55 a.m. UTC | #2
>>> On 25.03.16 at 19:01, <konrad.wilk@oracle.com> wrote:
>>  @@ -174,10 +174,61 @@ compat_bad_hypercall:
>>  /* %rbx: struct vcpu, interrupts disabled */
>>  ENTRY(compat_restore_all_guest)
>>          ASSERT_INTERRUPTS_DISABLED
>> +.Lcr4_orig:
>> +        ASM_NOP8 /* testb $3,UREGS_cs(%rsp) */
>> +        ASM_NOP2 /* jpe   .Lcr4_alt_end */
>> +        ASM_NOP8 /* mov   CPUINFO_cr4...(%rsp), %rax */
>> +        ASM_NOP6 /* and   $..., %rax */
>> +        ASM_NOP8 /* mov   %rax, CPUINFO_cr4...(%rsp) */
>> +        ASM_NOP3 /* mov   %rax, %cr4 */
>> +.Lcr4_orig_end:
>> +        .pushsection .altinstr_replacement, "ax"
>> +.Lcr4_alt:
>> +        testb $3,UREGS_cs(%rsp)
>> +        jpe   .Lcr4_alt_end
> 
> This would jump if the last operation had even bits set. And the
> 'testb' is 'and' operation which would give us the '011' (for $3).
> 
> Why not just depend on the ZF ? Other places that test UREGS_cs()
> look to be using that?

Because we _want_ to skip the following code when the outer context
is guest ring 3. See also the v3 part of the revision log.
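
Spelled out, the flag behaviour the testb/jpe pair relies on is (stand-alone
C model of the parity flag, purely for illustration; __builtin_parity is the
GCC/Clang builtin):

    #include <stdio.h>

    /* PF is set when the low byte of the result has an even number of 1 bits.
     * On this path the saved CS RPL is 1 (32-bit PV kernel) or 3 (guest user). */
    static int pf_set(unsigned char result)
    {
        return __builtin_parity(result) == 0;
    }

    int main(void)
    {
        unsigned char rpl[] = { 1, 3 };

        for ( unsigned int i = 0; i < sizeof(rpl); i++ )
            printf("cs&3 = %d: PF=%d -> %s\n", rpl[i], pf_set(rpl[i] & 3),
                   pf_set(rpl[i] & 3) ? "jpe taken, SMEP/SMAP left alone (user)"
                                      : "fall through, clear SMEP/SMAP (kernel)");
        return 0;
    }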

>> +/* This mustn't modify registers other than %rax. */
>> +ENTRY(cr4_pv32_restore)
>> +        push  %rdx
>> +        GET_CPUINFO_FIELD(cr4, %rdx)
>> +        mov   (%rdx), %rax
>> +        test  $X86_CR4_SMEP|X86_CR4_SMAP,%eax
>> +        jnz   0f
>> +        or    cr4_pv32_mask(%rip), %rax
>> +        mov   %rax, %cr4
>> +        mov   %rax, (%rdx)
> 
> Here you leave %rax with the cr4_pv32_mask value, but:
> 
>> +        pop   %rdx
>> +        ret
>> +0:
>> +#ifndef NDEBUG
>> +        /* Check that _all_ of the bits intended to be set actually are. */
>> +        mov   %cr4, %rax
>> +        and   cr4_pv32_mask(%rip), %eax
>> +        cmp   cr4_pv32_mask(%rip), %eax
>> +        je    1f
>> +        BUG
>> +1:
>> +#endif
>> +        pop   %rdx
>> +        xor   %eax, %eax
> 
> .. Here you clear it. Any particular reason?
> 
>> +        ret

Of course - see handle_exception, where this return value gets
checked (in the first case above we just care for there to be any
non-zero value in %rax).
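
In C terms the stub behaves roughly like this (illustrative sketch, not the
actual code; the debug-build BUG check is omitted):

    #define X86_CR4_SMEP (1UL << 20)
    #define X86_CR4_SMAP (1UL << 21)

    struct cpu_info { unsigned long cr4; };    /* cached CR4 only */
    extern unsigned long cr4_pv32_mask;
    extern void write_cr4(unsigned long val);

    /* Returns 0 when SMEP/SMAP were already set (nothing restored - the caller
     * in handle_exception then goes on to the #PF re-execution check), and the
     * non-zero new CR4 value otherwise. */
    unsigned long cr4_pv32_restore_sketch(struct cpu_info *ci)
    {
        if ( ci->cr4 & (X86_CR4_SMEP | X86_CR4_SMAP) )
            return 0;

        ci->cr4 |= cr4_pv32_mask;
        write_cr4(ci->cr4);
        return ci->cr4;
    }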

>> -.macro LOAD_C_CLOBBERED compat=0
>> +.macro LOAD_C_CLOBBERED compat=0 ax=1
>>  .if !\compat
>>          movq  UREGS_r11(%rsp),%r11
>>          movq  UREGS_r10(%rsp),%r10
>>          movq  UREGS_r9(%rsp),%r9
>>          movq  UREGS_r8(%rsp),%r8
>> -.endif
>> +.if \ax
>>          movq  UREGS_rax(%rsp),%rax
>> +.endif
> 
> Why the .endif here considering you are doing an:
> 
>> +.elseif \ax
> 
> an else if here?
>> +        movl  UREGS_rax(%rsp),%eax
>> +.endif
> 
> Actually, Why two 'if ax' ? checks?
> 
> Or am I reading this incorrect?

After the change the macro first deals with the "native" case
(which requires looking at \ax) and then the "compat" one
(which likewise requires evaluating \ax).

Jan
Andrew Cooper May 13, 2016, 3:58 p.m. UTC | #3
On 17/03/16 08:03, Jan Beulich wrote:
> Since such guests' kernel code runs in ring 1, their memory accesses,
> at the paging layer, are supervisor mode ones, and hence subject to
> SMAP/SMEP checks. Such guests cannot be expected to be aware of those
> two features though (and so far we also don't expose the respective
> feature flags), and hence may suffer page faults they cannot deal with.
>
> While the placement of the re-enabling slightly weakens the intended
> protection, it was selected such that 64-bit paths would remain
> unaffected where possible. At the expense of a further performance hit
> the re-enabling could be put right next to the CLACs.
>
> Note that this introduces a number of extra TLB flushes - CR4.SMEP
> transitioning from 0 to 1 always causes a flush, and it transitioning
> from 1 to 0 may also do.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Sorry - I reviewed the v3 code, and replied to the v2 email.

For clarity's sake, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>