
[2/4] x86: suppress SMAP and SMEP while running 32-bit PV guest code

Message ID 56D97F4802000078000D9561@prv-mh.provo.novell.com (mailing list archive)
State New, archived

Commit Message

Jan Beulich March 4, 2016, 11:27 a.m. UTC
Since such guests' kernel code runs in ring 1, their memory accesses,
at the paging layer, are supervisor mode ones, and hence subject to
SMAP/SMEP checks. Such guests cannot be expected to be aware of those
two features though (and so far we also don't expose the respective
feature flags), and hence may suffer page faults they cannot deal with.

While the placement of the re-enabling slightly weakens the intended
protection, it was selected such that 64-bit paths would remain
unaffected where possible. At the expense of a further performance hit
the re-enabling could be put right next to the CLACs.

Note that this introduces a number of extra TLB flushes - CR4.SMEP
transitioning from 0 to 1 always causes a flush, and it transitioning
from 1 to 0 may also do.
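
To put the intent in C terms, roughly the following is meant to happen around
running 32-bit PV guest code (a sketch only - the patch implements this via
alternatives-patched code on the assembly entry/exit paths, and
is_pv_32bit_vcpu() is merely an assumed helper name here):

    static void adjust_cr4_for_pv32(const struct vcpu *v)
    {
        unsigned long cr4 = read_cr4();
        unsigned long want = is_pv_32bit_vcpu(v)
                             ? cr4 & ~(X86_CR4_SMEP | X86_CR4_SMAP)
                             : cr4 | (mmu_cr4_features &
                                      (X86_CR4_SMEP | X86_CR4_SMAP));

        /* Setting SMEP always flushes the TLB; clearing it may do so too. */
        if ( want != cr4 )
            write_cr4(want);
    }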

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -67,6 +67,8 @@ boolean_param("smep", opt_smep);
 static bool_t __initdata opt_smap = 1;
 boolean_param("smap", opt_smap);
 
+unsigned long __read_mostly cr4_smep_smap_mask;
+
 /* Boot dom0 in pvh mode */
 static bool_t __initdata opt_dom0pvh;
 boolean_param("dom0pvh", opt_dom0pvh);
@@ -1335,6 +1337,8 @@ void __init noreturn __start_xen(unsigne
     if ( cpu_has_smap )
         set_in_cr4(X86_CR4_SMAP);
 
+    cr4_smep_smap_mask = mmu_cr4_features & (X86_CR4_SMEP | X86_CR4_SMAP);
+
     if ( cpu_has_fsgsbase )
         set_in_cr4(X86_CR4_FSGSBASE);
 
@@ -1471,7 +1475,10 @@ void __init noreturn __start_xen(unsigne
      * copy_from_user().
      */
     if ( cpu_has_smap )
+    {
+        cr4_smep_smap_mask &= ~X86_CR4_SMAP;
         write_cr4(read_cr4() & ~X86_CR4_SMAP);
+    }
 
     printk("%sNX (Execute Disable) protection %sactive\n",
            cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ",
@@ -1488,7 +1495,10 @@ void __init noreturn __start_xen(unsigne
         panic("Could not set up DOM0 guest OS");
 
     if ( cpu_has_smap )
+    {
         write_cr4(read_cr4() | X86_CR4_SMAP);
+        cr4_smep_smap_mask |= X86_CR4_SMAP;
+    }
 
     /* Scrub RAM that is still free and so may go to an unprivileged domain. */
     scrub_heap_pages();
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -16,14 +16,16 @@ ENTRY(compat_hypercall)
         ASM_CLAC
         pushq $0
         SAVE_VOLATILE type=TRAP_syscall compat=1
+        SMEP_SMAP_RESTORE
 
         cmpb  $0,untrusted_msi(%rip)
 UNLIKELY_START(ne, msi_check)
         movl  $HYPERCALL_VECTOR,%edi
         call  check_for_unexpected_msi
-        LOAD_C_CLOBBERED
+        LOAD_C_CLOBBERED compat=1 ax=0
 UNLIKELY_END(msi_check)
 
+        movl  UREGS_rax(%rsp),%eax
         GET_CURRENT(%rbx)
 
         cmpl  $NR_hypercalls,%eax
@@ -33,7 +35,6 @@ UNLIKELY_END(msi_check)
         pushq UREGS_rbx(%rsp); pushq %rcx; pushq %rdx; pushq %rsi; pushq %rdi
         pushq UREGS_rbp+5*8(%rsp)
         leaq  compat_hypercall_args_table(%rip),%r10
-        movl  %eax,%eax
         movl  $6,%ecx
         subb  (%r10,%rax,1),%cl
         movq  %rsp,%rdi
@@ -48,7 +49,6 @@ UNLIKELY_END(msi_check)
 #define SHADOW_BYTES 16 /* Shadow EIP + shadow hypercall # */
 #else
         /* Relocate argument registers and zero-extend to 64 bits. */
-        movl  %eax,%eax              /* Hypercall #  */
         xchgl %ecx,%esi              /* Arg 2, Arg 4 */
         movl  %edx,%edx              /* Arg 3        */
         movl  %edi,%r8d              /* Arg 5        */
@@ -174,10 +174,43 @@ compat_bad_hypercall:
 /* %rbx: struct vcpu, interrupts disabled */
 ENTRY(compat_restore_all_guest)
         ASSERT_INTERRUPTS_DISABLED
+.Lcr4_orig:
+        ASM_NOP3 /* mov   %cr4, %rax */
+        ASM_NOP6 /* and   $..., %rax */
+        ASM_NOP3 /* mov   %rax, %cr4 */
+        .pushsection .altinstr_replacement, "ax"
+.Lcr4_alt:
+        mov   %cr4, %rax
+        and   $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax
+        mov   %rax, %cr4
+.Lcr4_alt_end:
+        .section .altinstructions, "a"
+        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, 12, \
+                             (.Lcr4_alt_end - .Lcr4_alt)
+        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, 12, \
+                             (.Lcr4_alt_end - .Lcr4_alt)
+        .popsection
         RESTORE_ALL adj=8 compat=1
 .Lft0:  iretq
         _ASM_PRE_EXTABLE(.Lft0, handle_exception)
 
+/* This mustn't modify registers other than %rax. */
+ENTRY(cr4_smep_smap_restore)
+        mov   %cr4, %rax
+        test  $X86_CR4_SMEP|X86_CR4_SMAP,%eax
+        jnz   0f
+        or    cr4_smep_smap_mask(%rip), %rax
+        mov   %rax, %cr4
+        ret
+0:
+        and   cr4_smep_smap_mask(%rip), %eax
+        cmp   cr4_smep_smap_mask(%rip), %eax
+        je    1f
+        BUG
+1:
+        xor   %eax, %eax
+        ret
+
 /* %rdx: trap_bounce, %rbx: struct vcpu */
 ENTRY(compat_post_handle_exception)
         testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
@@ -190,6 +223,7 @@ ENTRY(compat_post_handle_exception)
 /* See lstar_enter for entry register state. */
 ENTRY(cstar_enter)
         sti
+        SMEP_SMAP_RESTORE
         movq  8(%rsp),%rax /* Restore %rax. */
         movq  $FLAT_KERNEL_SS,8(%rsp)
         pushq %r11
@@ -225,6 +259,7 @@ UNLIKELY_END(compat_syscall_gpf)
         jmp   .Lcompat_bounce_exception
 
 ENTRY(compat_sysenter)
+        SMEP_SMAP_RESTORE
         movq  VCPU_trap_ctxt(%rbx),%rcx
         cmpb  $TRAP_gp_fault,UREGS_entry_vector(%rsp)
         movzwl VCPU_sysenter_sel(%rbx),%eax
@@ -238,6 +273,7 @@ ENTRY(compat_sysenter)
         jmp   compat_test_all_events
 
 ENTRY(compat_int80_direct_trap)
+        SMEP_SMAP_RESTORE
         call  compat_create_bounce_frame
         jmp   compat_test_all_events
 
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable)
 
 ENTRY(common_interrupt)
         SAVE_ALL CLAC
+        SMEP_SMAP_RESTORE
         movq %rsp,%rdi
         callq do_IRQ
         jmp ret_from_intr
@@ -454,13 +455,64 @@ ENTRY(page_fault)
 GLOBAL(handle_exception)
         SAVE_ALL CLAC
 handle_exception_saved:
+        GET_CURRENT(%rbx)
         testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
         jz    exception_with_ints_disabled
-        sti
+
+.Lsmep_smap_orig:
+        jmp   0f
+        .if 0 // GAS bug (affecting at least 2.22 ... 2.26)
+        .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc
+        .else
+        // worst case: rex + opcode + modrm + 4-byte displacement
+        .skip (1 + 1 + 1 + 4) - 2, 0xcc
+        .endif
+        .pushsection .altinstr_replacement, "ax"
+.Lsmep_smap_alt:
+        mov   VCPU_domain(%rbx),%rax
+.Lsmep_smap_alt_end:
+        .section .altinstructions, "a"
+        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
+                             X86_FEATURE_SMEP, \
+                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
+                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
+        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
+                             X86_FEATURE_SMAP, \
+                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
+                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
+        .popsection
+
+        testb $3,UREGS_cs(%rsp)
+        jz    0f
+        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
+        je    0f
+        call  cr4_smep_smap_restore
+        /*
+         * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in
+         * compat_restore_all_guest and it actually returning to guest
+         * context, in which case the guest would run with the two features
+         * enabled. The only bad that can happen from this is a kernel mode
+         * #PF which the guest doesn't expect. Rather than trying to make the
+         * NMI/#MC exit path honor the intended CR4 setting, simply check
+         * whether the wrong CR4 was in use when the #PF occurred, and exit
+         * back to the guest (which will in turn clear the two CR4 bits) to
+         * re-execute the instruction. If we get back here, the CR4 bits
+         * should then be found clear (unless another NMI/#MC occurred at
+         * exactly the right time), and we'll continue processing the
+         * exception as normal.
+         */
+        test  %rax,%rax
+        jnz   0f
+        mov   $PFEC_page_present,%al
+        cmpb  $TRAP_page_fault,UREGS_entry_vector(%rsp)
+        jne   0f
+        xor   UREGS_error_code(%rsp),%eax
+        test  $~(PFEC_write_access|PFEC_insn_fetch),%eax
+        jz    compat_test_all_events
+0:      sti
 1:      movq  %rsp,%rdi
         movzbl UREGS_entry_vector(%rsp),%eax
         leaq  exception_table(%rip),%rdx
-        GET_CURRENT(%rbx)
         PERFC_INCR(exceptions, %rax, %rbx)
         callq *(%rdx,%rax,8)
         testb $3,UREGS_cs(%rsp)
@@ -592,6 +644,7 @@ handle_ist_exception:
         SAVE_ALL CLAC
         testb $3,UREGS_cs(%rsp)
         jz    1f
+        SMEP_SMAP_RESTORE
         /* Interrupted guest context. Copy the context to stack bottom. */
         GET_CPUINFO_FIELD(guest_cpu_user_regs,%rdi)
         movq  %rsp,%rsi
--- a/xen/include/asm-x86/asm_defns.h
+++ b/xen/include/asm-x86/asm_defns.h
@@ -209,6 +209,16 @@ void ret_from_intr(void);
 
 #define ASM_STAC ASM_AC(STAC)
 #define ASM_CLAC ASM_AC(CLAC)
+
+#define SMEP_SMAP_RESTORE                                              \
+        667: ASM_NOP5;                                                 \
+        .pushsection .altinstr_replacement, "ax";                      \
+        668: call cr4_smep_smap_restore;                               \
+        .section .altinstructions, "a";                                \
+        altinstruction_entry 667b, 668b, X86_FEATURE_SMEP, 5, 5;       \
+        altinstruction_entry 667b, 668b, X86_FEATURE_SMAP, 5, 5;       \
+        .popsection
+
 #else
 static always_inline void clac(void)
 {
@@ -308,14 +318,18 @@ static always_inline void stac(void)
  *
  * For the way it is used in RESTORE_ALL, this macro must preserve EFLAGS.ZF.
  */
-.macro LOAD_C_CLOBBERED compat=0
+.macro LOAD_C_CLOBBERED compat=0 ax=1
 .if !\compat
         movq  UREGS_r11(%rsp),%r11
         movq  UREGS_r10(%rsp),%r10
         movq  UREGS_r9(%rsp),%r9
         movq  UREGS_r8(%rsp),%r8
-.endif
+.if \ax
         movq  UREGS_rax(%rsp),%rax
+.endif
+.elseif \ax
+        movl  UREGS_rax(%rsp),%eax
+.endif
         movq  UREGS_rcx(%rsp),%rcx
         movq  UREGS_rdx(%rsp),%rdx
         movq  UREGS_rsi(%rsp),%rsi
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -134,12 +134,12 @@
 #define TF_kernel_mode         (1<<_TF_kernel_mode)
 
 /* #PF error code values. */
-#define PFEC_page_present   (1U<<0)
-#define PFEC_write_access   (1U<<1)
-#define PFEC_user_mode      (1U<<2)
-#define PFEC_reserved_bit   (1U<<3)
-#define PFEC_insn_fetch     (1U<<4)
-#define PFEC_prot_key       (1U<<5)
+#define PFEC_page_present   (_AC(1,U) << 0)
+#define PFEC_write_access   (_AC(1,U) << 1)
+#define PFEC_user_mode      (_AC(1,U) << 2)
+#define PFEC_reserved_bit   (_AC(1,U) << 3)
+#define PFEC_insn_fetch     (_AC(1,U) << 4)
+#define PFEC_prot_key       (_AC(1,U) << 5)
 /* Internally used only flags. */
 #define PFEC_page_paged     (1U<<16)
 #define PFEC_page_shared    (1U<<17)

Comments

Andrew Cooper March 7, 2016, 4:59 p.m. UTC | #1
On 04/03/16 11:27, Jan Beulich wrote:
> Since such guests' kernel code runs in ring 1, their memory accesses,
> at the paging layer, are supervisor mode ones, and hence subject to
> SMAP/SMEP checks. Such guests cannot be expected to be aware of those
> two features though (and so far we also don't expose the respective
> feature flags), and hence may suffer page faults they cannot deal with.
>
> While the placement of the re-enabling slightly weakens the intended
> protection, it was selected such that 64-bit paths would remain
> unaffected where possible. At the expense of a further performance hit
> the re-enabling could be put right next to the CLACs.
>
> Note that this introduces a number of extra TLB flushes - CR4.SMEP
> transitioning from 0 to 1 always causes a flush, and it transitioning
> from 1 to 0 may also do.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -67,6 +67,8 @@ boolean_param("smep", opt_smep);
>  static bool_t __initdata opt_smap = 1;
>  boolean_param("smap", opt_smap);
>  
> +unsigned long __read_mostly cr4_smep_smap_mask;

Are we liable to gain any other cr4 features which would want to be
included in this?  Might it be wise to choose a slightly more generic
name such as cr4_pv32_mask?

>  #define SHADOW_BYTES 16 /* Shadow EIP + shadow hypercall # */
>  #else
>          /* Relocate argument registers and zero-extend to 64 bits. */
> -        movl  %eax,%eax              /* Hypercall #  */
>          xchgl %ecx,%esi              /* Arg 2, Arg 4 */
>          movl  %edx,%edx              /* Arg 3        */
>          movl  %edi,%r8d              /* Arg 5        */
> @@ -174,10 +174,43 @@ compat_bad_hypercall:
>  /* %rbx: struct vcpu, interrupts disabled */
>  ENTRY(compat_restore_all_guest)
>          ASSERT_INTERRUPTS_DISABLED
> +.Lcr4_orig:
> +        ASM_NOP3 /* mov   %cr4, %rax */
> +        ASM_NOP6 /* and   $..., %rax */
> +        ASM_NOP3 /* mov   %rax, %cr4 */
> +        .pushsection .altinstr_replacement, "ax"
> +.Lcr4_alt:
> +        mov   %cr4, %rax
> +        and   $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax
> +        mov   %rax, %cr4
> +.Lcr4_alt_end:
> +        .section .altinstructions, "a"
> +        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, 12, \
> +                             (.Lcr4_alt_end - .Lcr4_alt)
> +        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, 12, \
> +                             (.Lcr4_alt_end - .Lcr4_alt)

These 12's look as if they should be (.Lcr4_alt - .Lcr4_orig).

> +        .popsection
>          RESTORE_ALL adj=8 compat=1
>  .Lft0:  iretq
>          _ASM_PRE_EXTABLE(.Lft0, handle_exception)
>  
> +/* This mustn't modify registers other than %rax. */
> +ENTRY(cr4_smep_smap_restore)
> +        mov   %cr4, %rax
> +        test  $X86_CR4_SMEP|X86_CR4_SMAP,%eax
> +        jnz   0f
> +        or    cr4_smep_smap_mask(%rip), %rax
> +        mov   %rax, %cr4
> +        ret
> +0:
> +        and   cr4_smep_smap_mask(%rip), %eax
> +        cmp   cr4_smep_smap_mask(%rip), %eax
> +        je    1f
> +        BUG

What is the purpose of this bugcheck? It looks like it is catching a
mismatch of masked options, but I am not completely sure.

For all other ASM level BUG's, I put a short comment on the same line,
to aid people who hit the bug.

> +1:
> +        xor   %eax, %eax
> +        ret
> +
>  /* %rdx: trap_bounce, %rbx: struct vcpu */
>  ENTRY(compat_post_handle_exception)
>          testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
> @@ -190,6 +223,7 @@ ENTRY(compat_post_handle_exception)
>  /* See lstar_enter for entry register state. */
>  ENTRY(cstar_enter)
>          sti
> +        SMEP_SMAP_RESTORE
>          movq  8(%rsp),%rax /* Restore %rax. */
>          movq  $FLAT_KERNEL_SS,8(%rsp)
>          pushq %r11
> @@ -225,6 +259,7 @@ UNLIKELY_END(compat_syscall_gpf)
>          jmp   .Lcompat_bounce_exception
>  
>  ENTRY(compat_sysenter)
> +        SMEP_SMAP_RESTORE
>          movq  VCPU_trap_ctxt(%rbx),%rcx
>          cmpb  $TRAP_gp_fault,UREGS_entry_vector(%rsp)
>          movzwl VCPU_sysenter_sel(%rbx),%eax
> @@ -238,6 +273,7 @@ ENTRY(compat_sysenter)
>          jmp   compat_test_all_events
>  
>  ENTRY(compat_int80_direct_trap)
> +        SMEP_SMAP_RESTORE
>          call  compat_create_bounce_frame
>          jmp   compat_test_all_events
>  
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable)
>  
>  ENTRY(common_interrupt)
>          SAVE_ALL CLAC
> +        SMEP_SMAP_RESTORE
>          movq %rsp,%rdi
>          callq do_IRQ
>          jmp ret_from_intr
> @@ -454,13 +455,64 @@ ENTRY(page_fault)
>  GLOBAL(handle_exception)
>          SAVE_ALL CLAC
>  handle_exception_saved:
> +        GET_CURRENT(%rbx)
>          testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
>          jz    exception_with_ints_disabled
> -        sti
> +
> +.Lsmep_smap_orig:
> +        jmp   0f
> +        .if 0 // GAS bug (affecting at least 2.22 ... 2.26)
> +        .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc
> +        .else
> +        // worst case: rex + opcode + modrm + 4-byte displacement
> +        .skip (1 + 1 + 1 + 4) - 2, 0xcc
> +        .endif

Which bug is this?  How does it manifest?  More generally, what is this
alternative trying to achieve?

> +        .pushsection .altinstr_replacement, "ax"
> +.Lsmep_smap_alt:
> +        mov   VCPU_domain(%rbx),%rax
> +.Lsmep_smap_alt_end:
> +        .section .altinstructions, "a"
> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
> +                             X86_FEATURE_SMEP, \
> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
> +                             X86_FEATURE_SMAP, \
> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
> +        .popsection
> +
> +        testb $3,UREGS_cs(%rsp)
> +        jz    0f
> +        cmpb  $0,DOMAIN_is_32bit_pv(%rax)

This comparison is wrong on hardware lacking SMEP and SMAP, as the "mov
VCPU_domain(%rbx),%rax" won't have happened.

> +        je    0f
> +        call  cr4_smep_smap_restore
> +        /*
> +         * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in
> +         * compat_restore_all_guest and it actually returning to guest
> +         * context, in which case the guest would run with the two features
> +         * enabled. The only bad that can happen from this is a kernel mode
> +         * #PF which the guest doesn't expect. Rather than trying to make the
> +         * NMI/#MC exit path honor the intended CR4 setting, simply check
> +         * whether the wrong CR4 was in use when the #PF occurred, and exit
> +         * back to the guest (which will in turn clear the two CR4 bits) to
> +         * re-execute the instruction. If we get back here, the CR4 bits
> +         * should then be found clear (unless another NMI/#MC occurred at
> +         * exactly the right time), and we'll continue processing the
> +         * exception as normal.
> +         */
> +        test  %rax,%rax
> +        jnz   0f
> +        mov   $PFEC_page_present,%al
> +        cmpb  $TRAP_page_fault,UREGS_entry_vector(%rsp)
> +        jne   0f
> +        xor   UREGS_error_code(%rsp),%eax
> +        test  $~(PFEC_write_access|PFEC_insn_fetch),%eax
> +        jz    compat_test_all_events
> +0:      sti

It's code like this which makes me even more certain that we have far too
much code written in assembly which doesn't need to be.  Maybe not this
specific sample, but it has taken me 15 minutes and a pad of paper to
try and work out how this conditional works, and I am still not certain
it's correct.  In particular, PFEC_prot_key looks like it could fool the test
into believing a non-SMAP/SMEP fault was a SMAP/SMEP fault.

Can you at least provide some C in a comment with the intended
conditional, to aid clarity?

~Andrew
Jan Beulich March 8, 2016, 7:57 a.m. UTC | #2
>>> On 07.03.16 at 17:59, <andrew.cooper3@citrix.com> wrote:
> On 04/03/16 11:27, Jan Beulich wrote:
>> --- a/xen/arch/x86/setup.c
>> +++ b/xen/arch/x86/setup.c
>> @@ -67,6 +67,8 @@ boolean_param("smep", opt_smep);
>>  static bool_t __initdata opt_smap = 1;
>>  boolean_param("smap", opt_smap);
>>  
>> +unsigned long __read_mostly cr4_smep_smap_mask;
> 
> Are we liable to gain any other cr4 features which would want to be
> included in this?  Might it be wise to chose a slightly more generic
> name such as cr4_pv32_mask ?

Ah, that's a good name suggestion - I didn't like the "smep_smap"
thing from the beginning, but couldn't think of a better variant.

>> @@ -174,10 +174,43 @@ compat_bad_hypercall:
>>  /* %rbx: struct vcpu, interrupts disabled */
>>  ENTRY(compat_restore_all_guest)
>>          ASSERT_INTERRUPTS_DISABLED
>> +.Lcr4_orig:
>> +        ASM_NOP3 /* mov   %cr4, %rax */
>> +        ASM_NOP6 /* and   $..., %rax */
>> +        ASM_NOP3 /* mov   %rax, %cr4 */
>> +        .pushsection .altinstr_replacement, "ax"
>> +.Lcr4_alt:
>> +        mov   %cr4, %rax
>> +        and   $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax
>> +        mov   %rax, %cr4
>> +.Lcr4_alt_end:
>> +        .section .altinstructions, "a"
>> +        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, 12, \
>> +                             (.Lcr4_alt_end - .Lcr4_alt)
>> +        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, 12, \
>> +                             (.Lcr4_alt_end - .Lcr4_alt)
> 
> These 12's look as if they should be (.Lcr4_alt - .Lcr4_orig).

Well, the NOPs that get put there make 12 (= 3 + 6 + 3) a
pretty obvious (shorter and hence more readable) option. But
yes, if you're of the strong opinion that we should use the
longer alternative, I can switch these around.

>> +/* This mustn't modify registers other than %rax. */
>> +ENTRY(cr4_smep_smap_restore)
>> +        mov   %cr4, %rax
>> +        test  $X86_CR4_SMEP|X86_CR4_SMAP,%eax
>> +        jnz   0f
>> +        or    cr4_smep_smap_mask(%rip), %rax
>> +        mov   %rax, %cr4
>> +        ret
>> +0:
>> +        and   cr4_smep_smap_mask(%rip), %eax
>> +        cmp   cr4_smep_smap_mask(%rip), %eax
>> +        je    1f
>> +        BUG
> 
> What is the purpose of this bugcheck? It looks like it is catching a
> mismatch of masked options, but I am not completely sure.

This aims at detecting that some of the CR4 bits which are
expected to be set really aren't (other than the case when all
of the ones of interest here are clear).
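
In rough C terms the function is intended to behave like this (a sketch
only, not meant to replace the assembly):

    unsigned long cr4_smep_smap_restore(void)
    {
        unsigned long cr4 = read_cr4();

        if ( !(cr4 & (X86_CR4_SMEP | X86_CR4_SMAP)) )
        {
            /* Neither bit set: turn on whatever the boot-time mask holds. */
            write_cr4(cr4 | cr4_smep_smap_mask);
            return cr4 | cr4_smep_smap_mask; /* non-zero: bits were clear */
        }

        /* Some bit set: then every bit in the mask must also be set. */
        BUG_ON((cr4 & cr4_smep_smap_mask) != cr4_smep_smap_mask);

        return 0;                            /* bits were already in place */
    }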

> For all other ASM level BUG's, I put a short comment on the same line,
> to aid people who hit the bug.

Will do. Question: Should this check perhaps become !NDEBUG
dependent?

>> @@ -454,13 +455,64 @@ ENTRY(page_fault)
>>  GLOBAL(handle_exception)
>>          SAVE_ALL CLAC
>>  handle_exception_saved:
>> +        GET_CURRENT(%rbx)
>>          testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
>>          jz    exception_with_ints_disabled
>> -        sti
>> +
>> +.Lsmep_smap_orig:
>> +        jmp   0f
>> +        .if 0 // GAS bug (affecting at least 2.22 ... 2.26)
>> +        .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc
>> +        .else
>> +        // worst case: rex + opcode + modrm + 4-byte displacement
>> +        .skip (1 + 1 + 1 + 4) - 2, 0xcc
>> +        .endif
> 
> Which bug is this?  How does it manifest.  More generally, what is this
> alternative trying to achieve?

The .org gets a warning (.Lsmep_smap_orig supposedly being
undefined, and hence getting assumed to be zero) followed by
an error (attempt to move the current location backwards). The
fix https://sourceware.org/ml/binutils/2016-03/msg00030.html
is pending approval.

>> +        .pushsection .altinstr_replacement, "ax"
>> +.Lsmep_smap_alt:
>> +        mov   VCPU_domain(%rbx),%rax
>> +.Lsmep_smap_alt_end:
>> +        .section .altinstructions, "a"
>> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
>> +                             X86_FEATURE_SMEP, \
>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
>> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
>> +                             X86_FEATURE_SMAP, \
>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
>> +        .popsection
>> +
>> +        testb $3,UREGS_cs(%rsp)
>> +        jz    0f
>> +        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
> 
> This comparison is wrong on hardware lacking SMEP and SMAP, as the "mov
> VCPU_domain(%rbx),%rax" won't have happened.

That mov indeed won't have happened, but the original instruction
is a branch past all of this code, so the above is correct (and I did
test on older hardware).
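
Once the alternative is applied, the whole block amounts to roughly this C
fragment (again just a sketch; guest_mode() / is_pv_32bit_domain() are used
loosely, and looks_like_spurious_smep_smap_pf() merely stands for the error
code conditional further down in this reply):

    struct domain *currd = current->domain; /* the mov patched in over the jmp */

    if ( guest_mode(regs) && is_pv_32bit_domain(currd) &&
         !cr4_smep_smap_restore() &&  /* zero: SMEP/SMAP were still enabled */
         looks_like_spurious_smep_smap_pf(regs) )
        goto compat_test_all_events;
    /* else: sti and handle the exception as usual */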

>> +        je    0f
>> +        call  cr4_smep_smap_restore
>> +        /*
>> +         * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in
>> +         * compat_restore_all_guest and it actually returning to guest
>> +         * context, in which case the guest would run with the two features
>> +         * enabled. The only bad that can happen from this is a kernel mode
>> +         * #PF which the guest doesn't expect. Rather than trying to make the
>> +         * NMI/#MC exit path honor the intended CR4 setting, simply check
>> +         * whether the wrong CR4 was in use when the #PF occurred, and exit
>> +         * back to the guest (which will in turn clear the two CR4 bits) to
>> +         * re-execute the instruction. If we get back here, the CR4 bits
>> +         * should then be found clear (unless another NMI/#MC occurred at
>> +         * exactly the right time), and we'll continue processing the
>> +         * exception as normal.
>> +         */
>> +        test  %rax,%rax
>> +        jnz   0f
>> +        mov   $PFEC_page_present,%al
>> +        cmpb  $TRAP_page_fault,UREGS_entry_vector(%rsp)
>> +        jne   0f
>> +        xor   UREGS_error_code(%rsp),%eax
>> +        test  $~(PFEC_write_access|PFEC_insn_fetch),%eax
>> +        jz    compat_test_all_events
>> +0:      sti
> 
> Its code like this which makes me even more certain that we have far too
> much code written in assembly which doesn't need to be.  Maybe not this
> specific sample, but it has taken me 15 minutes and a pad of paper to
> try and work out how this conditional works, and I am still not certain
> its correct.  In particular, PFEC_prot_key looks like it fool the test
> into believing a non-smap/smep fault was a smap/smep fault.

Not sure how you come to think of PFEC_prot_key here: That's
a bit which can be set only together with PFEC_user_mode, yet
we care about kernel mode faults only here.

> Can you at least provide some C in a comment with the intended
> conditional, to aid clarity?

Sure, if you think that helps beyond the (I think) pretty extensive
comment:

+        test  %rax,%rax
+        jnz   0f
+        /*
+         * The below effectively is
+         * if ( regs->entry_vector == TRAP_page_fault &&
+         *      (regs->error_code & PFEC_page_present) &&
+         *      !(regs->error_code & ~(PFEC_write_access|PFEC_insn_fetch)) )
+         *     goto compat_test_all_events;
+         */
+        mov   $PFEC_page_present,%al
+        cmpb  $TRAP_page_fault,UREGS_entry_vector(%rsp)
+        jne   0f
+        xor   UREGS_error_code(%rsp),%eax
+        test  $~(PFEC_write_access|PFEC_insn_fetch),%eax
+        jz    compat_test_all_events
+0:

Jan
Wu, Feng March 9, 2016, 8:09 a.m. UTC | #3
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: Friday, March 4, 2016 7:28 PM
> To: xen-devel <xen-devel@lists.xenproject.org>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>; Wu, Feng
> <feng.wu@intel.com>; Keir Fraser <keir@xen.org>
> Subject: [PATCH 2/4] x86: suppress SMAP and SMEP while running 32-bit PV
> guest code
> 
> Since such guests' kernel code runs in ring 1, their memory accesses,
> at the paging layer, are supervisor mode ones, and hence subject to
> SMAP/SMEP checks. Such guests cannot be expected to be aware of those
> two features though (and so far we also don't expose the respective
> feature flags), and hence may suffer page faults they cannot deal with.
> 
> While the placement of the re-enabling slightly weakens the intended
> protection, it was selected such that 64-bit paths would remain
> unaffected where possible. At the expense of a further performance hit
> the re-enabling could be put right next to the CLACs.
> 
> Note that this introduces a number of extra TLB flushes - CR4.SMEP
> transitioning from 0 to 1 always causes a flush, and it transitioning
> from 1 to 0 may also do.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -67,6 +67,8 @@ boolean_param("smep", opt_smep);
>  static bool_t __initdata opt_smap = 1;
>  boolean_param("smap", opt_smap);
> 
> +unsigned long __read_mostly cr4_smep_smap_mask;
> +
>  /* Boot dom0 in pvh mode */
>  static bool_t __initdata opt_dom0pvh;
>  boolean_param("dom0pvh", opt_dom0pvh);
> @@ -1335,6 +1337,8 @@ void __init noreturn __start_xen(unsigne
>      if ( cpu_has_smap )
>          set_in_cr4(X86_CR4_SMAP);
> 
> +    cr4_smep_smap_mask = mmu_cr4_features & (X86_CR4_SMEP |
> X86_CR4_SMAP);
> +
>      if ( cpu_has_fsgsbase )
>          set_in_cr4(X86_CR4_FSGSBASE);
> 
> @@ -1471,7 +1475,10 @@ void __init noreturn __start_xen(unsigne
>       * copy_from_user().
>       */
>      if ( cpu_has_smap )
> +    {
> +        cr4_smep_smap_mask &= ~X86_CR4_SMAP;

You change ' cr4_smep_smap_mask ' here ...

>          write_cr4(read_cr4() & ~X86_CR4_SMAP);
> +    }
> 
>      printk("%sNX (Execute Disable) protection %sactive\n",
>             cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ",
> @@ -1488,7 +1495,10 @@ void __init noreturn __start_xen(unsigne
>          panic("Could not set up DOM0 guest OS");
> 
>      if ( cpu_has_smap )
> +    {
>          write_cr4(read_cr4() | X86_CR4_SMAP);
> +        cr4_smep_smap_mask |= X86_CR4_SMAP;

... and here. I wonder whether it is because Domain 0 can actually start
running between these two places? Otherwise I don't think the changes in the
above two places are needed, right?
> 
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable)
> 
>  ENTRY(common_interrupt)
>          SAVE_ALL CLAC
> +        SMEP_SMAP_RESTORE
>          movq %rsp,%rdi
>          callq do_IRQ
>          jmp ret_from_intr
> @@ -454,13 +455,64 @@ ENTRY(page_fault)
>  GLOBAL(handle_exception)
>          SAVE_ALL CLAC
>  handle_exception_saved:
> +        GET_CURRENT(%rbx)
>          testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
>          jz    exception_with_ints_disabled
> -        sti
> +
> +.Lsmep_smap_orig:
> +        jmp   0f
> +        .if 0 // GAS bug (affecting at least 2.22 ... 2.26)
> +        .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc
> +        .else
> +        // worst case: rex + opcode + modrm + 4-byte displacement
> +        .skip (1 + 1 + 1 + 4) - 2, 0xcc
> +        .endif
> +        .pushsection .altinstr_replacement, "ax"
> +.Lsmep_smap_alt:
> +        mov   VCPU_domain(%rbx),%rax
> +.Lsmep_smap_alt_end:
> +        .section .altinstructions, "a"
> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
> +                             X86_FEATURE_SMEP, \
> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
> +                             X86_FEATURE_SMAP, \
> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
> +        .popsection
> +
> +        testb $3,UREGS_cs(%rsp)
> +        jz    0f
> +        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
> +        je    0f
> +        call  cr4_smep_smap_restore
> +        /*
> +         * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in

Do you mean "before" when you typed "between" above?

> +         * compat_restore_all_guest and it actually returning to guest
> +         * context, in which case the guest would run with the two features
> +         * enabled. The only bad that can happen from this is a kernel mode
> +         * #PF which the guest doesn't expect. Rather than trying to make the
> +         * NMI/#MC exit path honor the intended CR4 setting, simply check
> +         * whether the wrong CR4 was in use when the #PF occurred, and exit
> +         * back to the guest (which will in turn clear the two CR4 bits) to
> +         * re-execute the instruction. If we get back here, the CR4 bits
> +         * should then be found clear (unless another NMI/#MC occurred at
> +         * exactly the right time), and we'll continue processing the
> +         * exception as normal.
> +         */

As Andrew mentioned in another mail, this scenario is a little tricky. Could you
please give a more detailed description of how the NMI/#MC affects the
execution flow, maybe with some code in the explanation? Thanks a lot!

Thanks,
Feng
Wu, Feng March 9, 2016, 8:09 a.m. UTC | #4
> >> +/* This mustn't modify registers other than %rax. */
> >> +ENTRY(cr4_smep_smap_restore)
> >> +        mov   %cr4, %rax
> >> +        test  $X86_CR4_SMEP|X86_CR4_SMAP,%eax
> >> +        jnz   0f

If we clear these bits every place we go back to the 32-bit PV guest,
the X86_CR4_SMEP and X86_CR4_SMAP bits should be clear
in CR4, right?  If that is the case, we cannot jump to 0f.
However, as you mentioned in the comment below:

+        /*
+         * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in
+         * compat_restore_all_guest and it actually returning to guest
+         * context, in which case the guest would run with the two features
+         * enabled. The only bad that can happen from this is a kernel mode
+         * #PF which the guest doesn't expect. Rather than trying to make the
+         * NMI/#MC exit path honor the intended CR4 setting, simply check
+         * whether the wrong CR4 was in use when the #PF occurred, and exit
+         * back to the guest (which will in turn clear the two CR4 bits) to
+         * re-execute the instruction. If we get back here, the CR4 bits
+         * should then be found clear (unless another NMI/#MC occurred at
+         * exactly the right time), and we'll continue processing the
+         * exception as normal.
+         */

That means, if an NMI or #MC happens at just the right time, the guest can be
running with SMAP/SMEP set in CR4, and only in this case can we reach '0f'.
Is my understanding correct? Thanks!


> >> +        or    cr4_smep_smap_mask(%rip), %rax
> >> +        mov   %rax, %cr4
> >> +        ret
> >> +0:
> >> +        and   cr4_smep_smap_mask(%rip), %eax
> >> +        cmp   cr4_smep_smap_mask(%rip), %eax
> >> +        je    1f
> >> +        BUG
> >
> > What is the purpose of this bugcheck? It looks like it is catching a
> > mismatch of masked options, but I am not completely sure.
> 
> This aims at detecting that some of the CR4 bits which are
> expected to be set really aren't (other than the case when all
> of the ones of interest here are clear).

The code in the 0f section considers the following case as not
a bug - do you think this can really happen?

Let's suppose SMAP is bit 0 and SMEP is bit 1; then
cr4_smep_smap_mask is 01 or 10 while %eax is 11. In this case it
means Xen has both SMAP and SMEP set, but when the guest is running
only one feature is set. In your patch, SMAP and SMEP are set/cleared
at the same time, so I am not sure this can happen.

BTW, I think it is worth adding some comments for the 0f section.

Thanks,
Feng
Andrew Cooper March 9, 2016, 10:45 a.m. UTC | #5
On 09/03/16 08:09, Wu, Feng wrote:

>> --- a/xen/arch/x86/x86_64/entry.S
>> +++ b/xen/arch/x86/x86_64/entry.S
>> @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable)
>>
>>  ENTRY(common_interrupt)
>>          SAVE_ALL CLAC
>> +        SMEP_SMAP_RESTORE
>>          movq %rsp,%rdi
>>          callq do_IRQ
>>          jmp ret_from_intr
>> @@ -454,13 +455,64 @@ ENTRY(page_fault)
>>  GLOBAL(handle_exception)
>>          SAVE_ALL CLAC
>>  handle_exception_saved:
>> +        GET_CURRENT(%rbx)
>>          testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
>>          jz    exception_with_ints_disabled
>> -        sti
>> +
>> +.Lsmep_smap_orig:
>> +        jmp   0f
>> +        .if 0 // GAS bug (affecting at least 2.22 ... 2.26)
>> +        .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc
>> +        .else
>> +        // worst case: rex + opcode + modrm + 4-byte displacement
>> +        .skip (1 + 1 + 1 + 4) - 2, 0xcc
>> +        .endif
>> +        .pushsection .altinstr_replacement, "ax"
>> +.Lsmep_smap_alt:
>> +        mov   VCPU_domain(%rbx),%rax
>> +.Lsmep_smap_alt_end:
>> +        .section .altinstructions, "a"
>> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
>> +                             X86_FEATURE_SMEP, \
>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
>> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
>> +                             X86_FEATURE_SMAP, \
>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
>> +        .popsection
>> +
>> +        testb $3,UREGS_cs(%rsp)
>> +        jz    0f
>> +        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
>> +        je    0f
>> +        call  cr4_smep_smap_restore
>> +        /*
>> +         * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in
> Do you mean "before" when you typed "between" above?

The meaning is "between (clearing CR4.SMEP and CR4.SMAP in
compat_restore_all_guest) and (it actually returning to guest)"

Nested lists in English are a source of confusion, even to native speakers.

~Andrew

>> +         * compat_restore_all_guest and it actually returning to guest
>> +         * context, in which case the guest would run with the two features
>> +         * enabled. The only bad that can happen from this is a kernel mode
>> +         * #PF which the guest doesn't expect. Rather than trying to make the
>> +         * NMI/#MC exit path honor the intended CR4 setting, simply check
>> +         * whether the wrong CR4 was in use when the #PF occurred, and exit
>> +         * back to the guest (which will in turn clear the two CR4 bits) to
>> +         * re-execute the instruction. If we get back here, the CR4 bits
>> +         * should then be found clear (unless another NMI/#MC occurred at
>> +         * exactly the right time), and we'll continue processing the
>> +         * exception as normal.
>> +         */
Andrew Cooper March 9, 2016, 11:19 a.m. UTC | #6
On 08/03/16 07:57, Jan Beulich wrote:
>
>>> @@ -174,10 +174,43 @@ compat_bad_hypercall:
>>>  /* %rbx: struct vcpu, interrupts disabled */
>>>  ENTRY(compat_restore_all_guest)
>>>          ASSERT_INTERRUPTS_DISABLED
>>> +.Lcr4_orig:
>>> +        ASM_NOP3 /* mov   %cr4, %rax */
>>> +        ASM_NOP6 /* and   $..., %rax */
>>> +        ASM_NOP3 /* mov   %rax, %cr4 */
>>> +        .pushsection .altinstr_replacement, "ax"
>>> +.Lcr4_alt:
>>> +        mov   %cr4, %rax
>>> +        and   $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax
>>> +        mov   %rax, %cr4
>>> +.Lcr4_alt_end:
>>> +        .section .altinstructions, "a"
>>> +        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, 12, \
>>> +                             (.Lcr4_alt_end - .Lcr4_alt)
>>> +        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, 12, \
>>> +                             (.Lcr4_alt_end - .Lcr4_alt)
>> These 12's look as if they should be (.Lcr4_alt - .Lcr4_orig).
> Well, the NOPs that get put there make 12 (= 3 + 6 + 3) a
> pretty obvious (shorter and hence more readable) option. But
> yes, if you're of the strong opinion that we should use the
> longer alternative, I can switch these around.

I have to admit that I prefer the Linux ALTERNATIVE macro for assembly,
which takes care of the calculations like this.  It is slightly
unfortunate that it generally requires its assembly blocks in strings,
and is unsuitable for larger blocks.  Perhaps we can see about a
variant in due course.

>
>>> +/* This mustn't modify registers other than %rax. */
>>> +ENTRY(cr4_smep_smap_restore)
>>> +        mov   %cr4, %rax
>>> +        test  $X86_CR4_SMEP|X86_CR4_SMAP,%eax
>>> +        jnz   0f
>>> +        or    cr4_smep_smap_mask(%rip), %rax
>>> +        mov   %rax, %cr4
>>> +        ret
>>> +0:
>>> +        and   cr4_smep_smap_mask(%rip), %eax
>>> +        cmp   cr4_smep_smap_mask(%rip), %eax
>>> +        je    1f
>>> +        BUG
>> What is the purpose of this bugcheck? It looks like it is catching a
>> mismatch of masked options, but I am not completely sure.
> This aims at detecting that some of the CR4 bits which are
> expected to be set really aren't (other than the case when all
> of the ones of interest here are clear).
>
>> For all other ASM level BUG's, I put a short comment on the same line,
>> to aid people who hit the bug.
> Will do. Question: Should this check perhaps become !NDEBUG
> dependent?

It probably should do.

>
>>> @@ -454,13 +455,64 @@ ENTRY(page_fault)
>>>  GLOBAL(handle_exception)
>>>          SAVE_ALL CLAC
>>>  handle_exception_saved:
>>> +        GET_CURRENT(%rbx)
>>>          testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
>>>          jz    exception_with_ints_disabled
>>> -        sti
>>> +
>>> +.Lsmep_smap_orig:
>>> +        jmp   0f
>>> +        .if 0 // GAS bug (affecting at least 2.22 ... 2.26)
>>> +        .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc
>>> +        .else
>>> +        // worst case: rex + opcode + modrm + 4-byte displacement
>>> +        .skip (1 + 1 + 1 + 4) - 2, 0xcc
>>> +        .endif
>> Which bug is this?  How does it manifest.  More generally, what is this
>> alternative trying to achieve?
> The .org gets a warning (.Lsmep_smap_orig supposedly being
> undefined, and hence getting assumed to be zero) followed by
> an error (attempt to move the current location backwards). The
> fix https://sourceware.org/ml/binutils/2016-03/msg00030.html
> is pending approval.

I presume this is down to the documented restriction about crossing
sections.  i.e. there is no .Lsmep_smap_orig in .altinstr_replacement,
where it found the first two symbols.

>>> +        .pushsection .altinstr_replacement, "ax"
>>> +.Lsmep_smap_alt:
>>> +        mov   VCPU_domain(%rbx),%rax
>>> +.Lsmep_smap_alt_end:
>>> +        .section .altinstructions, "a"
>>> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
>>> +                             X86_FEATURE_SMEP, \
>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
>>> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
>>> +                             X86_FEATURE_SMAP, \
>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
>>> +        .popsection
>>> +
>>> +        testb $3,UREGS_cs(%rsp)
>>> +        jz    0f
>>> +        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
>> This comparison is wrong on hardware lacking SMEP and SMAP, as the "mov
>> VCPU_domain(%rbx),%rax" won't have happened.
> That mov indeed won't have happened, but the original instruction
> is a branch past all of this code, so the above is correct (and I did
> test on older hardware).

Oh so it won't.  It is moderately subtle that this entire code block is
logically contained in the alternative.

It would be far clearer, and would work around your .org bug, if this were a
single alternative which patched the jump into a nop.

At the very least, a label of .Lcr3_pv32_fixup_done would be an
improvement over 0.

>
>>> +        je    0f
>>> +        call  cr4_smep_smap_restore
>>> +        /*
>>> +         * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in
>>> +         * compat_restore_all_guest and it actually returning to guest
>>> +         * context, in which case the guest would run with the two features
>>> +         * enabled. The only bad that can happen from this is a kernel mode
>>> +         * #PF which the guest doesn't expect. Rather than trying to make the
>>> +         * NMI/#MC exit path honor the intended CR4 setting, simply check
>>> +         * whether the wrong CR4 was in use when the #PF occurred, and exit
>>> +         * back to the guest (which will in turn clear the two CR4 bits) to
>>> +         * re-execute the instruction. If we get back here, the CR4 bits
>>> +         * should then be found clear (unless another NMI/#MC occurred at
>>> +         * exactly the right time), and we'll continue processing the
>>> +         * exception as normal.
>>> +         */
>>> +        test  %rax,%rax
>>> +        jnz   0f
>>> +        mov   $PFEC_page_present,%al
>>> +        cmpb  $TRAP_page_fault,UREGS_entry_vector(%rsp)
>>> +        jne   0f
>>> +        xor   UREGS_error_code(%rsp),%eax
>>> +        test  $~(PFEC_write_access|PFEC_insn_fetch),%eax
>>> +        jz    compat_test_all_events
>>> +0:      sti
>> Its code like this which makes me even more certain that we have far too
>> much code written in assembly which doesn't need to be.  Maybe not this
>> specific sample, but it has taken me 15 minutes and a pad of paper to
>> try and work out how this conditional works, and I am still not certain
>> its correct.  In particular, PFEC_prot_key looks like it fool the test
>> into believing a non-smap/smep fault was a smap/smep fault.
> Not sure how you come to think of PFEC_prot_key here: That's
> a bit which can be set only together with PFEC_user_mode, yet
> we care about kernel mode faults only here.

I would not make that assumption.  Assumptions about the valid set of
#PF flags are precisely the reason that older Linux falls into an
infinite loop when encountering a SMAP pagefault, rather than a clean crash.

~Andrew

>
>> Can you at least provide some C in a comment with the intended
>> conditional, to aid clarity?
> Sure, if you think that helps beyond the (I think) pretty extensive
> comment:
>
> +        test  %rax,%rax
> +        jnz   0f
> +        /*
> +         * The below effectively is
> +         * if ( regs->entry_vector == TRAP_page_fault &&
> +         *      (regs->error_code & PFEC_page_present) &&
> +         *      !(regs->error_code & ~(PFEC_write_access|PFEC_insn_fetch)) )
> +         *     goto compat_test_all_events;
> +         */
> +        mov   $PFEC_page_present,%al
> +        cmpb  $TRAP_page_fault,UREGS_entry_vector(%rsp)
> +        jne   0f
> +        xor   UREGS_error_code(%rsp),%eax
> +        test  $~(PFEC_write_access|PFEC_insn_fetch),%eax
> +        jz    compat_test_all_events
> +0:
>
> Jan
Wu, Feng March 9, 2016, 12:27 p.m. UTC | #7
> -----Original Message-----
> From: Andrew Cooper [mailto:andrew.cooper3@citrix.com]
> Sent: Wednesday, March 9, 2016 6:46 PM
> To: Wu, Feng <feng.wu@intel.com>; Jan Beulich <JBeulich@suse.com>; xen-
> devel <xen-devel@lists.xenproject.org>
> Cc: Keir Fraser <keir@xen.org>
> Subject: Re: [PATCH 2/4] x86: suppress SMAP and SMEP while running 32-bit PV
> guest code
> 
> On 09/03/16 08:09, Wu, Feng wrote:
> 
> >> --- a/xen/arch/x86/x86_64/entry.S
> >> +++ b/xen/arch/x86/x86_64/entry.S
> >> @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable)
> >>
> >>  ENTRY(common_interrupt)
> >>          SAVE_ALL CLAC
> >> +        SMEP_SMAP_RESTORE
> >>          movq %rsp,%rdi
> >>          callq do_IRQ
> >>          jmp ret_from_intr
> >> @@ -454,13 +455,64 @@ ENTRY(page_fault)
> >>  GLOBAL(handle_exception)
> >>          SAVE_ALL CLAC
> >>  handle_exception_saved:
> >> +        GET_CURRENT(%rbx)
> >>          testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
> >>          jz    exception_with_ints_disabled
> >> -        sti
> >> +
> >> +.Lsmep_smap_orig:
> >> +        jmp   0f
> >> +        .if 0 // GAS bug (affecting at least 2.22 ... 2.26)
> >> +        .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt),
> 0xcc
> >> +        .else
> >> +        // worst case: rex + opcode + modrm + 4-byte displacement
> >> +        .skip (1 + 1 + 1 + 4) - 2, 0xcc
> >> +        .endif
> >> +        .pushsection .altinstr_replacement, "ax"
> >> +.Lsmep_smap_alt:
> >> +        mov   VCPU_domain(%rbx),%rax
> >> +.Lsmep_smap_alt_end:
> >> +        .section .altinstructions, "a"
> >> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
> >> +                             X86_FEATURE_SMEP, \
> >> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
> >> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
> >> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
> >> +                             X86_FEATURE_SMAP, \
> >> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
> >> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
> >> +        .popsection
> >> +
> >> +        testb $3,UREGS_cs(%rsp)
> >> +        jz    0f
> >> +        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
> >> +        je    0f
> >> +        call  cr4_smep_smap_restore
> >> +        /*
> >> +         * An NMI or #MC may occur between clearing CR4.SMEP and
> CR4.SMAP in
> > Do you mean "before" when you typed "between" above?
> 
> The meaning is "between (clearing CR4.SMEP and CR4.SMAP in
> compat_restore_all_guest) and (it actually returning to guest)"
> 
> Nested lists in English are a source of confusion, even to native speakers.

Oh, thanks for the clarification! Do you know how "An NMI or #MC may occur
between clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest and
it actually returning to guest context, in which case the guest would run with
the two features enabled. " can happen? Especially how the guest can run
with the two features enabled? 

Thanks,
Feng

> 
> ~Andrew
> 
> >> +         * compat_restore_all_guest and it actually returning to guest
> >> +         * context, in which case the guest would run with the two features
> >> +         * enabled. The only bad that can happen from this is a kernel mode
> >> +         * #PF which the guest doesn't expect. Rather than trying to make the
> >> +         * NMI/#MC exit path honor the intended CR4 setting, simply check
> >> +         * whether the wrong CR4 was in use when the #PF occurred, and exit
> >> +         * back to the guest (which will in turn clear the two CR4 bits) to
> >> +         * re-execute the instruction. If we get back here, the CR4 bits
> >> +         * should then be found clear (unless another NMI/#MC occurred at
> >> +         * exactly the right time), and we'll continue processing the
> >> +         * exception as normal.
> >> +         */
>
Andrew Cooper March 9, 2016, 12:33 p.m. UTC | #8
On 09/03/16 12:27, Wu, Feng wrote:
>
>> -----Original Message-----
>> From: Andrew Cooper [mailto:andrew.cooper3@citrix.com]
>> Sent: Wednesday, March 9, 2016 6:46 PM
>> To: Wu, Feng <feng.wu@intel.com>; Jan Beulich <JBeulich@suse.com>; xen-
>> devel <xen-devel@lists.xenproject.org>
>> Cc: Keir Fraser <keir@xen.org>
>> Subject: Re: [PATCH 2/4] x86: suppress SMAP and SMEP while running 32-bit PV
>> guest code
>>
>> On 09/03/16 08:09, Wu, Feng wrote:
>>
>>>> --- a/xen/arch/x86/x86_64/entry.S
>>>> +++ b/xen/arch/x86/x86_64/entry.S
>>>> @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable)
>>>>
>>>>  ENTRY(common_interrupt)
>>>>          SAVE_ALL CLAC
>>>> +        SMEP_SMAP_RESTORE
>>>>          movq %rsp,%rdi
>>>>          callq do_IRQ
>>>>          jmp ret_from_intr
>>>> @@ -454,13 +455,64 @@ ENTRY(page_fault)
>>>>  GLOBAL(handle_exception)
>>>>          SAVE_ALL CLAC
>>>>  handle_exception_saved:
>>>> +        GET_CURRENT(%rbx)
>>>>          testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
>>>>          jz    exception_with_ints_disabled
>>>> -        sti
>>>> +
>>>> +.Lsmep_smap_orig:
>>>> +        jmp   0f
>>>> +        .if 0 // GAS bug (affecting at least 2.22 ... 2.26)
>>>> +        .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt),
>> 0xcc
>>>> +        .else
>>>> +        // worst case: rex + opcode + modrm + 4-byte displacement
>>>> +        .skip (1 + 1 + 1 + 4) - 2, 0xcc
>>>> +        .endif
>>>> +        .pushsection .altinstr_replacement, "ax"
>>>> +.Lsmep_smap_alt:
>>>> +        mov   VCPU_domain(%rbx),%rax
>>>> +.Lsmep_smap_alt_end:
>>>> +        .section .altinstructions, "a"
>>>> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
>>>> +                             X86_FEATURE_SMEP, \
>>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
>>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
>>>> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
>>>> +                             X86_FEATURE_SMAP, \
>>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
>>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
>>>> +        .popsection
>>>> +
>>>> +        testb $3,UREGS_cs(%rsp)
>>>> +        jz    0f
>>>> +        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
>>>> +        je    0f
>>>> +        call  cr4_smep_smap_restore
>>>> +        /*
>>>> +         * An NMI or #MC may occur between clearing CR4.SMEP and
>> CR4.SMAP in
>>> Do you mean "before" when you typed "between" above?
>> The meaning is "between (clearing CR4.SMEP and CR4.SMAP in
>> compat_restore_all_guest) and (it actually returning to guest)"
>>
>> Nested lists in English are a source of confusion, even to native speakers.
> Oh, thanks for the clarification! Do you know how "An NMI or #MC may occur
> between clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest and
> it actually returning to guest context, in which case the guest would run with
> the two features enabled. " can happen? Especially how the guest can run
> with the two features enabled?

NMIs and MCEs can occur at any point, even if interrupts are disabled.

The bad situation is this sequence:

* Xen is returning to the guest and disables CR4.SMEP/SMAP
* NMI occurs while still in Xen
* NMI exit path sees it is returning to Xen and re-enables CR4.SMEP/SMAP
* Xen ends up returning to guest with CR4.SMEP/SMAP enabled.

~Andrew
Jan Beulich March 9, 2016, 12:36 p.m. UTC | #9
>>>> Andrew Cooper <andrew.cooper3@citrix.com> 03/09/16 1:33 PM >>>
>On 09/03/16 12:27, Wu, Feng wrote:
>> Oh, thanks for the clarification! Do you know how "An NMI or #MC may occur
>> between clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest and
>> it actually returning to guest context, in which case the guest would run with
>> the two features enabled. " can happen? Especially how the guest can run
>> with the two features enabled?
>
>NMIs and MCEs can occur at any point, even if interrupts are disabled.
>
>The bad situation is this sequence:
>
>* Xen is returning to the guest and disables CR4.SMEP/SMAP
>* NMI occurs while still in Xen
>* NMI exit path sees it is returning to Xen and re-enabled CR4.SMEP/SMAP

Well, almost: Re-enabling happens on the NMI entry path. The NMI exit
path would, seeing it's returning to Xen context, simply not disable them
again.

Jan

>* Xen ends up returning to guest with CR4.SMEP/SMAP enabled.
>
>~Andrew
Wu, Feng March 9, 2016, 12:54 p.m. UTC | #10
> -----Original Message-----
> From: Jan Beulich [mailto:jbeulich@suse.com]
> Sent: Wednesday, March 9, 2016 8:37 PM
> To: andrew.cooper3@citrix.com; Wu, Feng <feng.wu@intel.com>; xen-
> devel@lists.xenproject.org
> Cc: keir@xen.org
> Subject: Re: [PATCH 2/4] x86: suppress SMAP and SMEP while running 32-bit PV
> guest code
> 
> >>>> Andrew Cooper <andrew.cooper3@citrix.com> 03/09/16 1:33 PM >>>
> >On 09/03/16 12:27, Wu, Feng wrote:
> >> Oh, thanks for the clarification! Do you know how "An NMI or #MC may occur
> >> between clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest and
> >> it actually returning to guest context, in which case the guest would run with
> >> the two features enabled. " can happen? Especially how the guest can run
> >> with the two features enabled?
> >
> >NMIs and MCEs can occur at any point, even if interrupts are disabled.
> >
> >The bad situation is this sequence:
> >
> >* Xen is returning to the guest and disables CR4.SMEP/SMAP
> >* NMI occurs while still in Xen
> >* NMI exit path sees it is returning to Xen and re-enabled CR4.SMEP/SMAP
> 
> Well, almost: Re-enabling happens on the NMI entry path. The NMI exit
> path would, seeing it's returning to Xen context, simply not disable them
> again.

Oh, this is the point I missed. Thanks for the clarification.

Thanks,
Feng

> 
> Jan
> 
> >* Xen ends up returning to guest with CR4.SMEP/SMAP enabled.
> >
> >~Andrew
>
Wu, Feng March 9, 2016, 1:35 p.m. UTC | #11
> -----Original Message-----
> From: Jan Beulich [mailto:jbeulich@suse.com]
> Sent: Wednesday, March 9, 2016 8:37 PM
> To: andrew.cooper3@citrix.com; Wu, Feng <feng.wu@intel.com>; xen-
> devel@lists.xenproject.org
> Cc: keir@xen.org
> Subject: Re: [PATCH 2/4] x86: suppress SMAP and SMEP while running 32-bit PV
> guest code
> 
> >>>> Andrew Cooper <andrew.cooper3@citrix.com> 03/09/16 1:33 PM >>>
> >On 09/03/16 12:27, Wu, Feng wrote:
> >> Oh, thanks for the clarification! Do you know how "An NMI or #MC may occur
> >> between clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest and
> >> it actually returning to guest context, in which case the guest would run with
> >> the two features enabled. " can happen? Especially how the guest can run
> >> with the two features enabled?
> >
> >NMIs and MCEs can occur at any point, even if interrupts are disabled.
> >
> >The bad situation is this sequence:
> >
> >* Xen is returning to the guest and disables CR4.SMEP/SMAP
> >* NMI occurs while still in Xen
> >* NMI exit path sees it is returning to Xen and re-enabled CR4.SMEP/SMAP
> 
> Well, almost: Re-enabling happens on the NMI entry path. The NMI exit
> path would, seeing it's returning to Xen context, simply not disable them
> again.

Thinking about this again: in this case, when the NMI happens, we are in
Xen context (the CPL in CS is 0), so the CPL of the saved CS on the stack is 0, right?
Why do we re-enable CR4.SMEP/SMAP in this case? I mean, do we only
need to enable SMEP/SMAP when coming from a 32-bit PV guest (CPL of CS is 1)?

Thanks,
Feng

> 
> Jan
> 
> >* Xen ends up returning to guest with CR4.SMEP/SMAP enabled.
> >
> >~Andrew
>
Andrew Cooper March 9, 2016, 1:42 p.m. UTC | #12
On 09/03/16 13:35, Wu, Feng wrote:
>
>> -----Original Message-----
>> From: Jan Beulich [mailto:jbeulich@suse.com]
>> Sent: Wednesday, March 9, 2016 8:37 PM
>> To: andrew.cooper3@citrix.com; Wu, Feng <feng.wu@intel.com>; xen-
>> devel@lists.xenproject.org
>> Cc: keir@xen.org
>> Subject: Re: [PATCH 2/4] x86: suppress SMAP and SMEP while running 32-bit PV
>> guest code
>>
>>>>>> Andrew Cooper <andrew.cooper3@citrix.com> 03/09/16 1:33 PM >>>
>>> On 09/03/16 12:27, Wu, Feng wrote:
>>>> Oh, thanks for the clarification! Do you know how "An NMI or #MC may occur
>>>> between clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest and
>>>> it actually returning to guest context, in which case the guest would run with
>>>> the two features enabled. " can happen? Especially how the guest can run
>>>> with the two features enabled?
>>> NMIs and MCEs can occur at any point, even if interrupts are disabled.
>>>
>>> The bad situation is this sequence:
>>>
>>> * Xen is returning to the guest and disables CR4.SMEP/SMAP
>>> * NMI occurs while still in Xen
>>> * NMI exit path sees it is returning to Xen and re-enabled CR4.SMEP/SMAP
>> Well, almost: Re-enabling happens on the NMI entry path. The NMI exit
>> path would, seeing it's returning to Xen context, simply not disable them
>> again.
> Thinking about this again, in this case, when the NMI happens, we are in
> Xen context (CPL in cs is 0), so the CPL of the saved cs in stack is 0,right?
> why do we re-enable CR4.SMEP/SMAP in this case? I mean do we only
> need to enable SMEP/SMAP when coming from 32bit pv guest (CPL of cs is 1) ?

We always want Xen to be running with SMEP/SMAP enabled.  Therefore the
safer and simpler option is to always enable them if we observe them
disabled.

Interrupting a 32bit PV guest might end up seeing a cpl of 1 or 3, and
peeking into the active struct domain to check is_pv32_domain would be a
larger overhead on the entry paths.

~Andrew
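
As a rough C rendering of that rule (sketch only: this is what the
SMEP_SMAP_RESTORE hook amounts to, while the actual code is patched-in
assembly that must not clobber registers other than %rax):

    /* On entry paths that may be reached from 32-bit PV context: no CPL
     * or domain-type check, simply repair CR4 if the features were left
     * suppressed. */
    unsigned long cr4 = read_cr4();

    if ( !(cr4 & (X86_CR4_SMEP | X86_CR4_SMAP)) )
        write_cr4(cr4 | cr4_smep_smap_mask);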
Jan Beulich March 9, 2016, 2:03 p.m. UTC | #13
>>> On 09.03.16 at 11:45, <andrew.cooper3@citrix.com> wrote:
> On 09/03/16 08:09, Wu, Feng wrote:
> 
>>> --- a/xen/arch/x86/x86_64/entry.S
>>> +++ b/xen/arch/x86/x86_64/entry.S
>>> @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable)
>>>
>>>  ENTRY(common_interrupt)
>>>          SAVE_ALL CLAC
>>> +        SMEP_SMAP_RESTORE
>>>          movq %rsp,%rdi
>>>          callq do_IRQ
>>>          jmp ret_from_intr
>>> @@ -454,13 +455,64 @@ ENTRY(page_fault)
>>>  GLOBAL(handle_exception)
>>>          SAVE_ALL CLAC
>>>  handle_exception_saved:
>>> +        GET_CURRENT(%rbx)
>>>          testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
>>>          jz    exception_with_ints_disabled
>>> -        sti
>>> +
>>> +.Lsmep_smap_orig:
>>> +        jmp   0f
>>> +        .if 0 // GAS bug (affecting at least 2.22 ... 2.26)
>>> +        .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc
>>> +        .else
>>> +        // worst case: rex + opcode + modrm + 4-byte displacement
>>> +        .skip (1 + 1 + 1 + 4) - 2, 0xcc
>>> +        .endif
>>> +        .pushsection .altinstr_replacement, "ax"
>>> +.Lsmep_smap_alt:
>>> +        mov   VCPU_domain(%rbx),%rax
>>> +.Lsmep_smap_alt_end:
>>> +        .section .altinstructions, "a"
>>> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
>>> +                             X86_FEATURE_SMEP, \
>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
>>> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
>>> +                             X86_FEATURE_SMAP, \
>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
>>> +        .popsection
>>> +
>>> +        testb $3,UREGS_cs(%rsp)
>>> +        jz    0f
>>> +        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
>>> +        je    0f
>>> +        call  cr4_smep_smap_restore
>>> +        /*
>>> +         * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in
>> Do you mean "before" when you typed "between" above?
> 
> The meaning is "between (clearing CR4.SMEP and CR4.SMAP in
> compat_restore_all_guest) and (it actually returning to guest)"
> 
> Nested lists in English are a source of confusion, even to native speakers.

I've switched the first "and" to a "/" to avoid the ambiguity.

Jan
Jan Beulich March 9, 2016, 2:07 p.m. UTC | #14
>>> On 09.03.16 at 09:09, <feng.wu@intel.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: Friday, March 4, 2016 7:28 PM
>> @@ -1471,7 +1475,10 @@ void __init noreturn __start_xen(unsigne
>>       * copy_from_user().
>>       */
>>      if ( cpu_has_smap )
>> +    {
>> +        cr4_smep_smap_mask &= ~X86_CR4_SMAP;
> 
> You change ' cr4_smep_smap_mask ' here ...
> 
>>          write_cr4(read_cr4() & ~X86_CR4_SMAP);
>> +    }
>> 
>>      printk("%sNX (Execute Disable) protection %sactive\n",
>>             cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ",
>> @@ -1488,7 +1495,10 @@ void __init noreturn __start_xen(unsigne
>>          panic("Could not set up DOM0 guest OS");
>> 
>>      if ( cpu_has_smap )
>> +    {
>>          write_cr4(read_cr4() | X86_CR4_SMAP);
>> +        cr4_smep_smap_mask |= X86_CR4_SMAP;
> 
> ... and here. I wonder whether it is because Domain 0 can actually start
> running between the two places? Otherwise I don't think the changes in the
> above two places are needed, right?

They are very definitely needed, to avoid hitting the BUG in
cr4_pv32_restore (cr4_smep_smap_restore in this patch
version) on every interrupt that occurs while Dom0 is being
constructed.

Jan
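
Put differently (illustrative C only, mirroring the BUG check in the
assembly routine): whenever either bit is found set, the routine insists
that everything in cr4_smep_smap_mask is set as well, so the mask has to
follow the temporary clearing of CR4.SMAP around Dom0 construction.

    unsigned long cr4 = read_cr4();

    if ( cr4 & (X86_CR4_SMEP | X86_CR4_SMAP) )
        /* Would trigger on every interrupt during Dom0 construction if
         * the mask still included X86_CR4_SMAP while CR4.SMAP is clear. */
        BUG_ON((cr4 & cr4_smep_smap_mask) != cr4_smep_smap_mask);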
Jan Beulich March 9, 2016, 2:09 p.m. UTC | #15
>>> On 09.03.16 at 09:09, <feng.wu@intel.com> wrote:
>> >> +/* This mustn't modify registers other than %rax. */
>> >> +ENTRY(cr4_smep_smap_restore)
>> >> +        mov   %cr4, %rax
>> >> +        test  $X86_CR4_SMEP|X86_CR4_SMAP,%eax
>> >> +        jnz   0f
> 
> If we clear the bits at every place where we return to a 32-bit PV guest,
> the X86_CR4_SMEP and X86_CR4_SMAP bits should be clear
> in CR4, right?  If that is the case, we can never take the jump to 0f.

Andrew's reply to (I think) a later mail of yours already
answered this, but just in case: We unconditionally come here
on paths that _may_ be used when entering Xen out of 32-bit
PV guest context. I.e. we do not know which state the two
flags are in.

Jan
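
For reference, the logic of that routine expressed in C (a sketch only: the
"_c" name is made up, and the real routine additionally has to preserve all
registers other than %rax):

    /* Returns non-zero (the new CR4 value) if the two features had been
     * suppressed and were re-enabled here, 0 if they were already on. */
    static unsigned long cr4_smep_smap_restore_c(void)
    {
        unsigned long cr4 = read_cr4();

        if ( !(cr4 & (X86_CR4_SMEP | X86_CR4_SMAP)) )
        {
            /* Entered with the features suppressed, e.g. out of 32-bit
             * PV guest context: turn them back on for Xen's own use. */
            cr4 |= cr4_smep_smap_mask;
            write_cr4(cr4);
            return cr4;
        }

        /* At least one bit set: then every bit the mask asks for must be
         * set, otherwise the state tracking went wrong somewhere. */
        BUG_ON((cr4 & cr4_smep_smap_mask) != cr4_smep_smap_mask);
        return 0;
    }

Hence the jnz: both states of the two bits are legitimate on entry, and the
return value tells the caller which one was found.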
Jan Beulich March 9, 2016, 2:28 p.m. UTC | #16
>>> On 09.03.16 at 12:19, <andrew.cooper3@citrix.com> wrote:
> On 08/03/16 07:57, Jan Beulich wrote:
>>>> @@ -174,10 +174,43 @@ compat_bad_hypercall:
>>>>  /* %rbx: struct vcpu, interrupts disabled */
>>>>  ENTRY(compat_restore_all_guest)
>>>>          ASSERT_INTERRUPTS_DISABLED
>>>> +.Lcr4_orig:
>>>> +        ASM_NOP3 /* mov   %cr4, %rax */
>>>> +        ASM_NOP6 /* and   $..., %rax */
>>>> +        ASM_NOP3 /* mov   %rax, %cr4 */
>>>> +        .pushsection .altinstr_replacement, "ax"
>>>> +.Lcr4_alt:
>>>> +        mov   %cr4, %rax
>>>> +        and   $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax
>>>> +        mov   %rax, %cr4
>>>> +.Lcr4_alt_end:
>>>> +        .section .altinstructions, "a"
>>>> +        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, 12, \
>>>> +                             (.Lcr4_alt_end - .Lcr4_alt)
>>>> +        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, 12, \
>>>> +                             (.Lcr4_alt_end - .Lcr4_alt)
>>> These 12's look as if they should be (.Lcr4_alt - .Lcr4_orig).
>> Well, the NOPs that get put there make 12 (= 3 + 6 + 3) a
>> pretty obvious (shorter and hence more readable) option. But
>> yes, if you're of the strong opinion that we should use the
>> longer alternative, I can switch these around.
> 
> I have to admit that I prefer the Linux ALTERNATIVE macro for assembly,
> which takes care of calculations like this.  It is slightly
> unfortunate that it generally requires its assembly blocks in strings,
> and is unsuitable for larger blocks.  Perhaps we can see about a
> variant in due course.

"In due course" to me means "subsequently" - is that the meaning you
imply here too?

But what's interesting about this suggestion: Their macro uses
.skip instead of .org, which means I should be able to replace the
ugly gas bug workaround by simply using .skip. I'll give that a try.

>>>> +        .pushsection .altinstr_replacement, "ax"
>>>> +.Lsmep_smap_alt:
>>>> +        mov   VCPU_domain(%rbx),%rax
>>>> +.Lsmep_smap_alt_end:
>>>> +        .section .altinstructions, "a"
>>>> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
>>>> +                             X86_FEATURE_SMEP, \
>>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
>>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
>>>> +        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
>>>> +                             X86_FEATURE_SMAP, \
>>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
>>>> +                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
>>>> +        .popsection
>>>> +
>>>> +        testb $3,UREGS_cs(%rsp)
>>>> +        jz    0f
>>>> +        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
>>> This comparison is wrong on hardware lacking SMEP and SMAP, as the "mov
>>> VCPU_domain(%rbx),%rax" won't have happened.
>> That mov indeed won't have happened, but the original instruction
>> is a branch past all of this code, so the above is correct (and I did
>> test on older hardware).
> 
> Oh, so it won't.  It is moderately subtle that this entire code block is
> logically contained in the alternative.
> 
> It would be far clearer, and would work around your .org bug, if this were
> a single alternative which patched the jump into a nop.

I specifically wanted to avoid needlessly patching a NOP in there
when, going forward, we expect the majority of systems to have
that patching done.

> At the very least, a label of .Lcr3_pv32_fixup_done would be an
> improvement over 0.

Agreed (albeit I prefer it to be named .Lcr4_pv32_done).

>>>> +        je    0f
>>>> +        call  cr4_smep_smap_restore
>>>> +        /*
>>>> +         * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in
>>>> +         * compat_restore_all_guest and it actually returning to guest
>>>> +         * context, in which case the guest would run with the two features
>>>> +         * enabled. The only bad that can happen from this is a kernel mode
>>>> +         * #PF which the guest doesn't expect. Rather than trying to make the
>>>> +         * NMI/#MC exit path honor the intended CR4 setting, simply check
>>>> +         * whether the wrong CR4 was in use when the #PF occurred, and exit
>>>> +         * back to the guest (which will in turn clear the two CR4 bits) to
>>>> +         * re-execute the instruction. If we get back here, the CR4 bits
>>>> +         * should then be found clear (unless another NMI/#MC occurred at
>>>> +         * exactly the right time), and we'll continue processing the
>>>> +         * exception as normal.
>>>> +         */
>>>> +        test  %rax,%rax
>>>> +        jnz   0f
>>>> +        mov   $PFEC_page_present,%al
>>>> +        cmpb  $TRAP_page_fault,UREGS_entry_vector(%rsp)
>>>> +        jne   0f
>>>> +        xor   UREGS_error_code(%rsp),%eax
>>>> +        test  $~(PFEC_write_access|PFEC_insn_fetch),%eax
>>>> +        jz    compat_test_all_events
>>>> +0:      sti
>>> It's code like this which makes me even more certain that we have far too
>>> much code written in assembly which doesn't need to be.  Maybe not this
>>> specific sample, but it has taken me 15 minutes and a pad of paper to
>>> try and work out how this conditional works, and I am still not certain
>>> it's correct.  In particular, PFEC_prot_key looks like it could fool the
>>> test into believing a non-SMAP/SMEP fault was a SMAP/SMEP fault.
>> Not sure how you come to think of PFEC_prot_key here: That's
>> a bit which can be set only together with PFEC_user_mode, yet
>> we care about kernel mode faults only here.
> 
> I would not make that assumption.  Assumptions about the valid set of
> #PF flags are precisely the reason that older Linux falls into an
> infinite loop when encountering a SMAP pagefault, rather than a clean crash.

We have to make _some_ assumption here about the bits which
so far have no meaning. Whichever route we go, we can't
exclude that, as new features get added, we may need to
adjust the mask for things to remain correct. As to the example
you give for comparison - that's apples and oranges as long as
Xen isn't meant to run as a PV guest on some (other) hypervisor.

Jan
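
Decoded into C, the check being debated amounts to the predicate below
(sketch only; the function name is invented and the actual code stays in
assembly). It is only evaluated when cr4_smep_smap_restore returned 0, i.e.
when the interrupted 32-bit PV guest was found still running with SMEP/SMAP
enabled, and a "yes" sends us back to the guest, whose exit path clears the
two CR4 bits before the faulting instruction is re-executed.

    static bool_t is_spurious_smep_smap_fault(const struct cpu_user_regs *regs)
    {
        if ( regs->entry_vector != TRAP_page_fault )
            return 0;

        /* Kernel-mode fault on a present mapping; only the write and
         * instruction-fetch bits may differ.  PFEC_user_mode (and hence
         * PFEC_prot_key, which implies it) being set makes this false. */
        return !((regs->error_code ^ PFEC_page_present) &
                 ~(PFEC_write_access | PFEC_insn_fetch));
    }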
diff mbox

Patch

--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -67,6 +67,8 @@  boolean_param("smep", opt_smep);
 static bool_t __initdata opt_smap = 1;
 boolean_param("smap", opt_smap);
 
+unsigned long __read_mostly cr4_smep_smap_mask;
+
 /* Boot dom0 in pvh mode */
 static bool_t __initdata opt_dom0pvh;
 boolean_param("dom0pvh", opt_dom0pvh);
@@ -1335,6 +1337,8 @@  void __init noreturn __start_xen(unsigne
     if ( cpu_has_smap )
         set_in_cr4(X86_CR4_SMAP);
 
+    cr4_smep_smap_mask = mmu_cr4_features & (X86_CR4_SMEP | X86_CR4_SMAP);
+
     if ( cpu_has_fsgsbase )
         set_in_cr4(X86_CR4_FSGSBASE);
 
@@ -1471,7 +1475,10 @@  void __init noreturn __start_xen(unsigne
      * copy_from_user().
      */
     if ( cpu_has_smap )
+    {
+        cr4_smep_smap_mask &= ~X86_CR4_SMAP;
         write_cr4(read_cr4() & ~X86_CR4_SMAP);
+    }
 
     printk("%sNX (Execute Disable) protection %sactive\n",
            cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ",
@@ -1488,7 +1495,10 @@  void __init noreturn __start_xen(unsigne
         panic("Could not set up DOM0 guest OS");
 
     if ( cpu_has_smap )
+    {
         write_cr4(read_cr4() | X86_CR4_SMAP);
+        cr4_smep_smap_mask |= X86_CR4_SMAP;
+    }
 
     /* Scrub RAM that is still free and so may go to an unprivileged domain. */
     scrub_heap_pages();
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -16,14 +16,16 @@  ENTRY(compat_hypercall)
         ASM_CLAC
         pushq $0
         SAVE_VOLATILE type=TRAP_syscall compat=1
+        SMEP_SMAP_RESTORE
 
         cmpb  $0,untrusted_msi(%rip)
 UNLIKELY_START(ne, msi_check)
         movl  $HYPERCALL_VECTOR,%edi
         call  check_for_unexpected_msi
-        LOAD_C_CLOBBERED
+        LOAD_C_CLOBBERED compat=1 ax=0
 UNLIKELY_END(msi_check)
 
+        movl  UREGS_rax(%rsp),%eax
         GET_CURRENT(%rbx)
 
         cmpl  $NR_hypercalls,%eax
@@ -33,7 +35,6 @@  UNLIKELY_END(msi_check)
         pushq UREGS_rbx(%rsp); pushq %rcx; pushq %rdx; pushq %rsi; pushq %rdi
         pushq UREGS_rbp+5*8(%rsp)
         leaq  compat_hypercall_args_table(%rip),%r10
-        movl  %eax,%eax
         movl  $6,%ecx
         subb  (%r10,%rax,1),%cl
         movq  %rsp,%rdi
@@ -48,7 +49,6 @@  UNLIKELY_END(msi_check)
 #define SHADOW_BYTES 16 /* Shadow EIP + shadow hypercall # */
 #else
         /* Relocate argument registers and zero-extend to 64 bits. */
-        movl  %eax,%eax              /* Hypercall #  */
         xchgl %ecx,%esi              /* Arg 2, Arg 4 */
         movl  %edx,%edx              /* Arg 3        */
         movl  %edi,%r8d              /* Arg 5        */
@@ -174,10 +174,43 @@  compat_bad_hypercall:
 /* %rbx: struct vcpu, interrupts disabled */
 ENTRY(compat_restore_all_guest)
         ASSERT_INTERRUPTS_DISABLED
+.Lcr4_orig:
+        ASM_NOP3 /* mov   %cr4, %rax */
+        ASM_NOP6 /* and   $..., %rax */
+        ASM_NOP3 /* mov   %rax, %cr4 */
+        .pushsection .altinstr_replacement, "ax"
+.Lcr4_alt:
+        mov   %cr4, %rax
+        and   $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax
+        mov   %rax, %cr4
+.Lcr4_alt_end:
+        .section .altinstructions, "a"
+        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, 12, \
+                             (.Lcr4_alt_end - .Lcr4_alt)
+        altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, 12, \
+                             (.Lcr4_alt_end - .Lcr4_alt)
+        .popsection
         RESTORE_ALL adj=8 compat=1
 .Lft0:  iretq
         _ASM_PRE_EXTABLE(.Lft0, handle_exception)
 
+/* This mustn't modify registers other than %rax. */
+ENTRY(cr4_smep_smap_restore)
+        mov   %cr4, %rax
+        test  $X86_CR4_SMEP|X86_CR4_SMAP,%eax
+        jnz   0f
+        or    cr4_smep_smap_mask(%rip), %rax
+        mov   %rax, %cr4
+        ret
+0:
+        and   cr4_smep_smap_mask(%rip), %eax
+        cmp   cr4_smep_smap_mask(%rip), %eax
+        je    1f
+        BUG
+1:
+        xor   %eax, %eax
+        ret
+
 /* %rdx: trap_bounce, %rbx: struct vcpu */
 ENTRY(compat_post_handle_exception)
         testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
@@ -190,6 +223,7 @@  ENTRY(compat_post_handle_exception)
 /* See lstar_enter for entry register state. */
 ENTRY(cstar_enter)
         sti
+        SMEP_SMAP_RESTORE
         movq  8(%rsp),%rax /* Restore %rax. */
         movq  $FLAT_KERNEL_SS,8(%rsp)
         pushq %r11
@@ -225,6 +259,7 @@  UNLIKELY_END(compat_syscall_gpf)
         jmp   .Lcompat_bounce_exception
 
 ENTRY(compat_sysenter)
+        SMEP_SMAP_RESTORE
         movq  VCPU_trap_ctxt(%rbx),%rcx
         cmpb  $TRAP_gp_fault,UREGS_entry_vector(%rsp)
         movzwl VCPU_sysenter_sel(%rbx),%eax
@@ -238,6 +273,7 @@  ENTRY(compat_sysenter)
         jmp   compat_test_all_events
 
 ENTRY(compat_int80_direct_trap)
+        SMEP_SMAP_RESTORE
         call  compat_create_bounce_frame
         jmp   compat_test_all_events
 
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -434,6 +434,7 @@  ENTRY(dom_crash_sync_extable)
 
 ENTRY(common_interrupt)
         SAVE_ALL CLAC
+        SMEP_SMAP_RESTORE
         movq %rsp,%rdi
         callq do_IRQ
         jmp ret_from_intr
@@ -454,13 +455,64 @@  ENTRY(page_fault)
 GLOBAL(handle_exception)
         SAVE_ALL CLAC
 handle_exception_saved:
+        GET_CURRENT(%rbx)
         testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
         jz    exception_with_ints_disabled
-        sti
+
+.Lsmep_smap_orig:
+        jmp   0f
+        .if 0 // GAS bug (affecting at least 2.22 ... 2.26)
+        .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc
+        .else
+        // worst case: rex + opcode + modrm + 4-byte displacement
+        .skip (1 + 1 + 1 + 4) - 2, 0xcc
+        .endif
+        .pushsection .altinstr_replacement, "ax"
+.Lsmep_smap_alt:
+        mov   VCPU_domain(%rbx),%rax
+.Lsmep_smap_alt_end:
+        .section .altinstructions, "a"
+        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
+                             X86_FEATURE_SMEP, \
+                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
+                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
+        altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \
+                             X86_FEATURE_SMAP, \
+                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \
+                             (.Lsmep_smap_alt_end - .Lsmep_smap_alt)
+        .popsection
+
+        testb $3,UREGS_cs(%rsp)
+        jz    0f
+        cmpb  $0,DOMAIN_is_32bit_pv(%rax)
+        je    0f
+        call  cr4_smep_smap_restore
+        /*
+         * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in
+         * compat_restore_all_guest and it actually returning to guest
+         * context, in which case the guest would run with the two features
+         * enabled. The only bad thing that can happen from this is a kernel-mode
+         * #PF which the guest doesn't expect. Rather than trying to make the
+         * NMI/#MC exit path honor the intended CR4 setting, simply check
+         * whether the wrong CR4 was in use when the #PF occurred, and exit
+         * back to the guest (which will in turn clear the two CR4 bits) to
+         * re-execute the instruction. If we get back here, the CR4 bits
+         * should then be found clear (unless another NMI/#MC occurred at
+         * exactly the right time), and we'll continue processing the
+         * exception as normal.
+         */
+        test  %rax,%rax
+        jnz   0f
+        mov   $PFEC_page_present,%al
+        cmpb  $TRAP_page_fault,UREGS_entry_vector(%rsp)
+        jne   0f
+        xor   UREGS_error_code(%rsp),%eax
+        test  $~(PFEC_write_access|PFEC_insn_fetch),%eax
+        jz    compat_test_all_events
+0:      sti
 1:      movq  %rsp,%rdi
         movzbl UREGS_entry_vector(%rsp),%eax
         leaq  exception_table(%rip),%rdx
-        GET_CURRENT(%rbx)
         PERFC_INCR(exceptions, %rax, %rbx)
         callq *(%rdx,%rax,8)
         testb $3,UREGS_cs(%rsp)
@@ -592,6 +644,7 @@  handle_ist_exception:
         SAVE_ALL CLAC
         testb $3,UREGS_cs(%rsp)
         jz    1f
+        SMEP_SMAP_RESTORE
         /* Interrupted guest context. Copy the context to stack bottom. */
         GET_CPUINFO_FIELD(guest_cpu_user_regs,%rdi)
         movq  %rsp,%rsi
--- a/xen/include/asm-x86/asm_defns.h
+++ b/xen/include/asm-x86/asm_defns.h
@@ -209,6 +209,16 @@  void ret_from_intr(void);
 
 #define ASM_STAC ASM_AC(STAC)
 #define ASM_CLAC ASM_AC(CLAC)
+
+#define SMEP_SMAP_RESTORE                                              \
+        667: ASM_NOP5;                                                 \
+        .pushsection .altinstr_replacement, "ax";                      \
+        668: call cr4_smep_smap_restore;                               \
+        .section .altinstructions, "a";                                \
+        altinstruction_entry 667b, 668b, X86_FEATURE_SMEP, 5, 5;       \
+        altinstruction_entry 667b, 668b, X86_FEATURE_SMAP, 5, 5;       \
+        .popsection
+
 #else
 static always_inline void clac(void)
 {
@@ -308,14 +318,18 @@  static always_inline void stac(void)
  *
  * For the way it is used in RESTORE_ALL, this macro must preserve EFLAGS.ZF.
  */
-.macro LOAD_C_CLOBBERED compat=0
+.macro LOAD_C_CLOBBERED compat=0 ax=1
 .if !\compat
         movq  UREGS_r11(%rsp),%r11
         movq  UREGS_r10(%rsp),%r10
         movq  UREGS_r9(%rsp),%r9
         movq  UREGS_r8(%rsp),%r8
-.endif
+.if \ax
         movq  UREGS_rax(%rsp),%rax
+.endif
+.elseif \ax
+        movl  UREGS_rax(%rsp),%eax
+.endif
         movq  UREGS_rcx(%rsp),%rcx
         movq  UREGS_rdx(%rsp),%rdx
         movq  UREGS_rsi(%rsp),%rsi
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -134,12 +134,12 @@ 
 #define TF_kernel_mode         (1<<_TF_kernel_mode)
 
 /* #PF error code values. */
-#define PFEC_page_present   (1U<<0)
-#define PFEC_write_access   (1U<<1)
-#define PFEC_user_mode      (1U<<2)
-#define PFEC_reserved_bit   (1U<<3)
-#define PFEC_insn_fetch     (1U<<4)
-#define PFEC_prot_key       (1U<<5)
+#define PFEC_page_present   (_AC(1,U) << 0)
+#define PFEC_write_access   (_AC(1,U) << 1)
+#define PFEC_user_mode      (_AC(1,U) << 2)
+#define PFEC_reserved_bit   (_AC(1,U) << 3)
+#define PFEC_insn_fetch     (_AC(1,U) << 4)
+#define PFEC_prot_key       (_AC(1,U) << 5)
 /* Internally used only flags. */
 #define PFEC_page_paged     (1U<<16)
 #define PFEC_page_shared    (1U<<17)