Message ID: 56D97F4802000078000D9561@prv-mh.provo.novell.com
On 04/03/16 11:27, Jan Beulich wrote: > Since such guests' kernel code runs in ring 1, their memory accesses, > at the paging layer, are supervisor mode ones, and hence subject to > SMAP/SMEP checks. Such guests cannot be expected to be aware of those > two features though (and so far we also don't expose the respective > feature flags), and hence may suffer page faults they cannot deal with. > > While the placement of the re-enabling slightly weakens the intended > protection, it was selected such that 64-bit paths would remain > unaffected where possible. At the expense of a further performance hit > the re-enabling could be put right next to the CLACs. > > Note that this introduces a number of extra TLB flushes - CR4.SMEP > transitioning from 0 to 1 always causes a flush, and it transitioning > from 1 to 0 may also do. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- a/xen/arch/x86/setup.c > +++ b/xen/arch/x86/setup.c > @@ -67,6 +67,8 @@ boolean_param("smep", opt_smep); > static bool_t __initdata opt_smap = 1; > boolean_param("smap", opt_smap); > > +unsigned long __read_mostly cr4_smep_smap_mask; Are we liable to gain any other cr4 features which would want to be included in this? Might it be wise to chose a slightly more generic name such as cr4_pv32_mask ? > #define SHADOW_BYTES 16 /* Shadow EIP + shadow hypercall # */ > #else > /* Relocate argument registers and zero-extend to 64 bits. */ > - movl %eax,%eax /* Hypercall # */ > xchgl %ecx,%esi /* Arg 2, Arg 4 */ > movl %edx,%edx /* Arg 3 */ > movl %edi,%r8d /* Arg 5 */ > @@ -174,10 +174,43 @@ compat_bad_hypercall: > /* %rbx: struct vcpu, interrupts disabled */ > ENTRY(compat_restore_all_guest) > ASSERT_INTERRUPTS_DISABLED > +.Lcr4_orig: > + ASM_NOP3 /* mov %cr4, %rax */ > + ASM_NOP6 /* and $..., %rax */ > + ASM_NOP3 /* mov %rax, %cr4 */ > + .pushsection .altinstr_replacement, "ax" > +.Lcr4_alt: > + mov %cr4, %rax > + and $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax > + mov %rax, %cr4 > +.Lcr4_alt_end: > + .section .altinstructions, "a" > + altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, 12, \ > + (.Lcr4_alt_end - .Lcr4_alt) > + altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, 12, \ > + (.Lcr4_alt_end - .Lcr4_alt) These 12's look as if they should be (.Lcr4_alt - .Lcr4_orig). > + .popsection > RESTORE_ALL adj=8 compat=1 > .Lft0: iretq > _ASM_PRE_EXTABLE(.Lft0, handle_exception) > > +/* This mustn't modify registers other than %rax. */ > +ENTRY(cr4_smep_smap_restore) > + mov %cr4, %rax > + test $X86_CR4_SMEP|X86_CR4_SMAP,%eax > + jnz 0f > + or cr4_smep_smap_mask(%rip), %rax > + mov %rax, %cr4 > + ret > +0: > + and cr4_smep_smap_mask(%rip), %eax > + cmp cr4_smep_smap_mask(%rip), %eax > + je 1f > + BUG What is the purpose of this bugcheck? It looks like it is catching a mismatch of masked options, but I am not completely sure. For all other ASM level BUG's, I put a short comment on the same line, to aid people who hit the bug. > +1: > + xor %eax, %eax > + ret > + > /* %rdx: trap_bounce, %rbx: struct vcpu */ > ENTRY(compat_post_handle_exception) > testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) > @@ -190,6 +223,7 @@ ENTRY(compat_post_handle_exception) > /* See lstar_enter for entry register state. */ > ENTRY(cstar_enter) > sti > + SMEP_SMAP_RESTORE > movq 8(%rsp),%rax /* Restore %rax. 
*/ > movq $FLAT_KERNEL_SS,8(%rsp) > pushq %r11 > @@ -225,6 +259,7 @@ UNLIKELY_END(compat_syscall_gpf) > jmp .Lcompat_bounce_exception > > ENTRY(compat_sysenter) > + SMEP_SMAP_RESTORE > movq VCPU_trap_ctxt(%rbx),%rcx > cmpb $TRAP_gp_fault,UREGS_entry_vector(%rsp) > movzwl VCPU_sysenter_sel(%rbx),%eax > @@ -238,6 +273,7 @@ ENTRY(compat_sysenter) > jmp compat_test_all_events > > ENTRY(compat_int80_direct_trap) > + SMEP_SMAP_RESTORE > call compat_create_bounce_frame > jmp compat_test_all_events > > --- a/xen/arch/x86/x86_64/entry.S > +++ b/xen/arch/x86/x86_64/entry.S > @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable) > > ENTRY(common_interrupt) > SAVE_ALL CLAC > + SMEP_SMAP_RESTORE > movq %rsp,%rdi > callq do_IRQ > jmp ret_from_intr > @@ -454,13 +455,64 @@ ENTRY(page_fault) > GLOBAL(handle_exception) > SAVE_ALL CLAC > handle_exception_saved: > + GET_CURRENT(%rbx) > testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) > jz exception_with_ints_disabled > - sti > + > +.Lsmep_smap_orig: > + jmp 0f > + .if 0 // GAS bug (affecting at least 2.22 ... 2.26) > + .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc > + .else > + // worst case: rex + opcode + modrm + 4-byte displacement > + .skip (1 + 1 + 1 + 4) - 2, 0xcc > + .endif Which bug is this? How does it manifest. More generally, what is this alternative trying to achieve? > + .pushsection .altinstr_replacement, "ax" > +.Lsmep_smap_alt: > + mov VCPU_domain(%rbx),%rax > +.Lsmep_smap_alt_end: > + .section .altinstructions, "a" > + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ > + X86_FEATURE_SMEP, \ > + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ > + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) > + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ > + X86_FEATURE_SMAP, \ > + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ > + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) > + .popsection > + > + testb $3,UREGS_cs(%rsp) > + jz 0f > + cmpb $0,DOMAIN_is_32bit_pv(%rax) This comparison is wrong on hardware lacking SMEP and SMAP, as the "mov VCPU_domain(%rbx),%rax" won't have happened. > + je 0f > + call cr4_smep_smap_restore > + /* > + * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in > + * compat_restore_all_guest and it actually returning to guest > + * context, in which case the guest would run with the two features > + * enabled. The only bad that can happen from this is a kernel mode > + * #PF which the guest doesn't expect. Rather than trying to make the > + * NMI/#MC exit path honor the intended CR4 setting, simply check > + * whether the wrong CR4 was in use when the #PF occurred, and exit > + * back to the guest (which will in turn clear the two CR4 bits) to > + * re-execute the instruction. If we get back here, the CR4 bits > + * should then be found clear (unless another NMI/#MC occurred at > + * exactly the right time), and we'll continue processing the > + * exception as normal. > + */ > + test %rax,%rax > + jnz 0f > + mov $PFEC_page_present,%al > + cmpb $TRAP_page_fault,UREGS_entry_vector(%rsp) > + jne 0f > + xor UREGS_error_code(%rsp),%eax > + test $~(PFEC_write_access|PFEC_insn_fetch),%eax > + jz compat_test_all_events > +0: sti Its code like this which makes me even more certain that we have far too much code written in assembly which doesn't need to be. Maybe not this specific sample, but it has taken me 15 minutes and a pad of paper to try and work out how this conditional works, and I am still not certain its correct. 
In particular, PFEC_prot_key looks like it could fool the test into believing a non-smap/smep fault was a smap/smep fault. Can you at least provide some C in a comment with the intended conditional, to aid clarity? ~Andrew
>>> On 07.03.16 at 17:59, <andrew.cooper3@citrix.com> wrote: > On 04/03/16 11:27, Jan Beulich wrote: >> --- a/xen/arch/x86/setup.c >> +++ b/xen/arch/x86/setup.c >> @@ -67,6 +67,8 @@ boolean_param("smep", opt_smep); >> static bool_t __initdata opt_smap = 1; >> boolean_param("smap", opt_smap); >> >> +unsigned long __read_mostly cr4_smep_smap_mask; > > Are we liable to gain any other cr4 features which would want to be > included in this? Might it be wise to chose a slightly more generic > name such as cr4_pv32_mask ? Ah, that's a good name suggestion - I didn't like the "smep_smap" thing from the beginning, but couldn't think of a better variant. >> @@ -174,10 +174,43 @@ compat_bad_hypercall: >> /* %rbx: struct vcpu, interrupts disabled */ >> ENTRY(compat_restore_all_guest) >> ASSERT_INTERRUPTS_DISABLED >> +.Lcr4_orig: >> + ASM_NOP3 /* mov %cr4, %rax */ >> + ASM_NOP6 /* and $..., %rax */ >> + ASM_NOP3 /* mov %rax, %cr4 */ >> + .pushsection .altinstr_replacement, "ax" >> +.Lcr4_alt: >> + mov %cr4, %rax >> + and $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax >> + mov %rax, %cr4 >> +.Lcr4_alt_end: >> + .section .altinstructions, "a" >> + altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, 12, \ >> + (.Lcr4_alt_end - .Lcr4_alt) >> + altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, 12, \ >> + (.Lcr4_alt_end - .Lcr4_alt) > > These 12's look as if they should be (.Lcr4_alt - .Lcr4_orig). Well, the NOPs that get put there make 12 (= 3 + 6 + 3) a pretty obvious (shorter and hence more readable) option. But yes, if you're of the strong opinion that we should use the longer alternative, I can switch these around. >> +/* This mustn't modify registers other than %rax. */ >> +ENTRY(cr4_smep_smap_restore) >> + mov %cr4, %rax >> + test $X86_CR4_SMEP|X86_CR4_SMAP,%eax >> + jnz 0f >> + or cr4_smep_smap_mask(%rip), %rax >> + mov %rax, %cr4 >> + ret >> +0: >> + and cr4_smep_smap_mask(%rip), %eax >> + cmp cr4_smep_smap_mask(%rip), %eax >> + je 1f >> + BUG > > What is the purpose of this bugcheck? It looks like it is catching a > mismatch of masked options, but I am not completely sure. This aims at detecting that some of the CR4 bits which are expected to be set really aren't (other than the case when all of the ones of interest here are clear). > For all other ASM level BUG's, I put a short comment on the same line, > to aid people who hit the bug. Will do. Question: Should this check perhaps become !NDEBUG dependent? >> @@ -454,13 +455,64 @@ ENTRY(page_fault) >> GLOBAL(handle_exception) >> SAVE_ALL CLAC >> handle_exception_saved: >> + GET_CURRENT(%rbx) >> testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) >> jz exception_with_ints_disabled >> - sti >> + >> +.Lsmep_smap_orig: >> + jmp 0f >> + .if 0 // GAS bug (affecting at least 2.22 ... 2.26) >> + .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc >> + .else >> + // worst case: rex + opcode + modrm + 4-byte displacement >> + .skip (1 + 1 + 1 + 4) - 2, 0xcc >> + .endif > > Which bug is this? How does it manifest. More generally, what is this > alternative trying to achieve? The .org gets a warning (.Lsmep_smap_orig supposedly being undefined, and hence getting assumed to be zero) followed by an error (attempt to move the current location backwards). The fix https://sourceware.org/ml/binutils/2016-03/msg00030.html is pending approval. 
>> + .pushsection .altinstr_replacement, "ax" >> +.Lsmep_smap_alt: >> + mov VCPU_domain(%rbx),%rax >> +.Lsmep_smap_alt_end: >> + .section .altinstructions, "a" >> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ >> + X86_FEATURE_SMEP, \ >> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ >> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) >> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ >> + X86_FEATURE_SMAP, \ >> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ >> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) >> + .popsection >> + >> + testb $3,UREGS_cs(%rsp) >> + jz 0f >> + cmpb $0,DOMAIN_is_32bit_pv(%rax) > > This comparison is wrong on hardware lacking SMEP and SMAP, as the "mov > VCPU_domain(%rbx),%rax" won't have happened. That mov indeed won't have happened, but the original instruction is a branch past all of this code, so the above is correct (and I did test on older hardware). >> + je 0f >> + call cr4_smep_smap_restore >> + /* >> + * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in >> + * compat_restore_all_guest and it actually returning to guest >> + * context, in which case the guest would run with the two features >> + * enabled. The only bad that can happen from this is a kernel mode >> + * #PF which the guest doesn't expect. Rather than trying to make the >> + * NMI/#MC exit path honor the intended CR4 setting, simply check >> + * whether the wrong CR4 was in use when the #PF occurred, and exit >> + * back to the guest (which will in turn clear the two CR4 bits) to >> + * re-execute the instruction. If we get back here, the CR4 bits >> + * should then be found clear (unless another NMI/#MC occurred at >> + * exactly the right time), and we'll continue processing the >> + * exception as normal. >> + */ >> + test %rax,%rax >> + jnz 0f >> + mov $PFEC_page_present,%al >> + cmpb $TRAP_page_fault,UREGS_entry_vector(%rsp) >> + jne 0f >> + xor UREGS_error_code(%rsp),%eax >> + test $~(PFEC_write_access|PFEC_insn_fetch),%eax >> + jz compat_test_all_events >> +0: sti > > Its code like this which makes me even more certain that we have far too > much code written in assembly which doesn't need to be. Maybe not this > specific sample, but it has taken me 15 minutes and a pad of paper to > try and work out how this conditional works, and I am still not certain > its correct. In particular, PFEC_prot_key looks like it fool the test > into believing a non-smap/smep fault was a smap/smep fault. Not sure how you come to think of PFEC_prot_key here: That's a bit which can be set only together with PFEC_user_mode, yet we care about kernel mode faults only here. > Can you at least provide some C in a comment with the intended > conditional, to aid clarity? Sure, if you think that helps beyond the (I think) pretty extensive comment: + test %rax,%rax + jnz 0f + /* + * The below effectively is + * if ( regs->entry_vector == TRAP_page_fault && + * (regs->error_code & PFEC_page_present) && + * !(regs->error_code & ~(PFEC_write_access|PFEC_insn_fetch)) ) + * goto compat_test_all_events; + */ + mov $PFEC_page_present,%al + cmpb $TRAP_page_fault,UREGS_entry_vector(%rsp) + jne 0f + xor UREGS_error_code(%rsp),%eax + test $~(PFEC_write_access|PFEC_insn_fetch),%eax + jz compat_test_all_events +0: Jan
> -----Original Message----- > From: Jan Beulich [mailto:JBeulich@suse.com] > Sent: Friday, March 4, 2016 7:28 PM > To: xen-devel <xen-devel@lists.xenproject.org> > Cc: Andrew Cooper <andrew.cooper3@citrix.com>; Wu, Feng > <feng.wu@intel.com>; Keir Fraser <keir@xen.org> > Subject: [PATCH 2/4] x86: suppress SMAP and SMEP while running 32-bit PV > guest code > > Since such guests' kernel code runs in ring 1, their memory accesses, > at the paging layer, are supervisor mode ones, and hence subject to > SMAP/SMEP checks. Such guests cannot be expected to be aware of those > two features though (and so far we also don't expose the respective > feature flags), and hence may suffer page faults they cannot deal with. > > While the placement of the re-enabling slightly weakens the intended > protection, it was selected such that 64-bit paths would remain > unaffected where possible. At the expense of a further performance hit > the re-enabling could be put right next to the CLACs. > > Note that this introduces a number of extra TLB flushes - CR4.SMEP > transitioning from 0 to 1 always causes a flush, and it transitioning > from 1 to 0 may also do. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > > --- a/xen/arch/x86/setup.c > +++ b/xen/arch/x86/setup.c > @@ -67,6 +67,8 @@ boolean_param("smep", opt_smep); > static bool_t __initdata opt_smap = 1; > boolean_param("smap", opt_smap); > > +unsigned long __read_mostly cr4_smep_smap_mask; > + > /* Boot dom0 in pvh mode */ > static bool_t __initdata opt_dom0pvh; > boolean_param("dom0pvh", opt_dom0pvh); > @@ -1335,6 +1337,8 @@ void __init noreturn __start_xen(unsigne > if ( cpu_has_smap ) > set_in_cr4(X86_CR4_SMAP); > > + cr4_smep_smap_mask = mmu_cr4_features & (X86_CR4_SMEP | > X86_CR4_SMAP); > + > if ( cpu_has_fsgsbase ) > set_in_cr4(X86_CR4_FSGSBASE); > > @@ -1471,7 +1475,10 @@ void __init noreturn __start_xen(unsigne > * copy_from_user(). > */ > if ( cpu_has_smap ) > + { > + cr4_smep_smap_mask &= ~X86_CR4_SMAP; You change ' cr4_smep_smap_mask ' here ... > write_cr4(read_cr4() & ~X86_CR4_SMAP); > + } > > printk("%sNX (Execute Disable) protection %sactive\n", > cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ", > @@ -1488,7 +1495,10 @@ void __init noreturn __start_xen(unsigne > panic("Could not set up DOM0 guest OS"); > > if ( cpu_has_smap ) > + { > write_cr4(read_cr4() | X86_CR4_SMAP); > + cr4_smep_smap_mask |= X86_CR4_SMAP; ... and here. I am wonder whether it is because Domain 0 can actually start running between the two place? Or I don't think the changes in the above two places is needed. right? > > --- a/xen/arch/x86/x86_64/entry.S > +++ b/xen/arch/x86/x86_64/entry.S > @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable) > > ENTRY(common_interrupt) > SAVE_ALL CLAC > + SMEP_SMAP_RESTORE > movq %rsp,%rdi > callq do_IRQ > jmp ret_from_intr > @@ -454,13 +455,64 @@ ENTRY(page_fault) > GLOBAL(handle_exception) > SAVE_ALL CLAC > handle_exception_saved: > + GET_CURRENT(%rbx) > testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) > jz exception_with_ints_disabled > - sti > + > +.Lsmep_smap_orig: > + jmp 0f > + .if 0 // GAS bug (affecting at least 2.22 ... 
2.26) > + .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc > + .else > + // worst case: rex + opcode + modrm + 4-byte displacement > + .skip (1 + 1 + 1 + 4) - 2, 0xcc > + .endif > + .pushsection .altinstr_replacement, "ax" > +.Lsmep_smap_alt: > + mov VCPU_domain(%rbx),%rax > +.Lsmep_smap_alt_end: > + .section .altinstructions, "a" > + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ > + X86_FEATURE_SMEP, \ > + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ > + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) > + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ > + X86_FEATURE_SMAP, \ > + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ > + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) > + .popsection > + > + testb $3,UREGS_cs(%rsp) > + jz 0f > + cmpb $0,DOMAIN_is_32bit_pv(%rax) > + je 0f > + call cr4_smep_smap_restore > + /* > + * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in Do you mean "before" when you typed "between" above? > + * compat_restore_all_guest and it actually returning to guest > + * context, in which case the guest would run with the two features > + * enabled. The only bad that can happen from this is a kernel mode > + * #PF which the guest doesn't expect. Rather than trying to make the > + * NMI/#MC exit path honor the intended CR4 setting, simply check > + * whether the wrong CR4 was in use when the #PF occurred, and exit > + * back to the guest (which will in turn clear the two CR4 bits) to > + * re-execute the instruction. If we get back here, the CR4 bits > + * should then be found clear (unless another NMI/#MC occurred at > + * exactly the right time), and we'll continue processing the > + * exception as normal. > + */ As Andrew mentioned in another mail, this scenario is a little tricky, could you please give a more detailed description about how the MNI/#MC affect the execution flow, maybe with some code in the explanation? Thanks a lot! Thanks, Feng
> >> +/* This mustn't modify registers other than %rax. */ > >> +ENTRY(cr4_smep_smap_restore) > >> + mov %cr4, %rax > >> + test $X86_CR4_SMEP|X86_CR4_SMAP,%eax > >> + jnz 0f If we clear the bits at every place where we come back from a 32-bit PV guest, the X86_CR4_SMEP and X86_CR4_SMAP bits should be clear in CR4, right? If that is the case, we cannot jump to 0f. However, as you mentioned in the comment below: + /* + * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in + * compat_restore_all_guest and it actually returning to guest + * context, in which case the guest would run with the two features + * enabled. The only bad that can happen from this is a kernel mode + * #PF which the guest doesn't expect. Rather than trying to make the + * NMI/#MC exit path honor the intended CR4 setting, simply check + * whether the wrong CR4 was in use when the #PF occurred, and exit + * back to the guest (which will in turn clear the two CR4 bits) to + * re-execute the instruction. If we get back here, the CR4 bits + * should then be found clear (unless another NMI/#MC occurred at + * exactly the right time), and we'll continue processing the + * exception as normal. + */ That means that if an NMI or #MC happens at just the right time, the guest can be running with SMAP/SMEP set in CR4, and only in this case can we reach '0f'. Is my understanding correct? Thanks! > >> + or cr4_smep_smap_mask(%rip), %rax > >> + mov %rax, %cr4 > >> + ret > >> +0: > >> + and cr4_smep_smap_mask(%rip), %eax > >> + cmp cr4_smep_smap_mask(%rip), %eax > >> + je 1f > >> + BUG > > > > What is the purpose of this bugcheck? It looks like it is catching a > > mismatch of masked options, but I am not completely sure. > > This aims at detecting that some of the CR4 bits which are > expected to be set really aren't (other than the case when all > of the ones of interest here are clear). The code in the 0f section considers the following case as not a bug; do you think this can really happen? Let's suppose SMAP is bit 0 and SMEP is bit 1; then cr4_smep_smap_mask is 01 or 10 and %eax is 11. In this case it means Xen sets both SMAP and SMEP, but when the guest is running only one feature is set. In your patch, SMAP and SMEP are set/cleared at the same time, so I am not sure this can happen. BTW, I think it is worth adding some comments for the 0f section. Thanks, Feng
On 09/03/16 08:09, Wu, Feng wrote: >> --- a/xen/arch/x86/x86_64/entry.S >> +++ b/xen/arch/x86/x86_64/entry.S >> @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable) >> >> ENTRY(common_interrupt) >> SAVE_ALL CLAC >> + SMEP_SMAP_RESTORE >> movq %rsp,%rdi >> callq do_IRQ >> jmp ret_from_intr >> @@ -454,13 +455,64 @@ ENTRY(page_fault) >> GLOBAL(handle_exception) >> SAVE_ALL CLAC >> handle_exception_saved: >> + GET_CURRENT(%rbx) >> testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) >> jz exception_with_ints_disabled >> - sti >> + >> +.Lsmep_smap_orig: >> + jmp 0f >> + .if 0 // GAS bug (affecting at least 2.22 ... 2.26) >> + .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc >> + .else >> + // worst case: rex + opcode + modrm + 4-byte displacement >> + .skip (1 + 1 + 1 + 4) - 2, 0xcc >> + .endif >> + .pushsection .altinstr_replacement, "ax" >> +.Lsmep_smap_alt: >> + mov VCPU_domain(%rbx),%rax >> +.Lsmep_smap_alt_end: >> + .section .altinstructions, "a" >> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ >> + X86_FEATURE_SMEP, \ >> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ >> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) >> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ >> + X86_FEATURE_SMAP, \ >> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ >> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) >> + .popsection >> + >> + testb $3,UREGS_cs(%rsp) >> + jz 0f >> + cmpb $0,DOMAIN_is_32bit_pv(%rax) >> + je 0f >> + call cr4_smep_smap_restore >> + /* >> + * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in > Do you mean "before" when you typed "between" above? The meaning is "between (clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest) and (it actually returning to guest)" Nested lists in English are a source of confusion, even to native speakers. ~Andrew >> + * compat_restore_all_guest and it actually returning to guest >> + * context, in which case the guest would run with the two features >> + * enabled. The only bad that can happen from this is a kernel mode >> + * #PF which the guest doesn't expect. Rather than trying to make the >> + * NMI/#MC exit path honor the intended CR4 setting, simply check >> + * whether the wrong CR4 was in use when the #PF occurred, and exit >> + * back to the guest (which will in turn clear the two CR4 bits) to >> + * re-execute the instruction. If we get back here, the CR4 bits >> + * should then be found clear (unless another NMI/#MC occurred at >> + * exactly the right time), and we'll continue processing the >> + * exception as normal. >> + */
On 08/03/16 07:57, Jan Beulich wrote: > >>> @@ -174,10 +174,43 @@ compat_bad_hypercall: >>> /* %rbx: struct vcpu, interrupts disabled */ >>> ENTRY(compat_restore_all_guest) >>> ASSERT_INTERRUPTS_DISABLED >>> +.Lcr4_orig: >>> + ASM_NOP3 /* mov %cr4, %rax */ >>> + ASM_NOP6 /* and $..., %rax */ >>> + ASM_NOP3 /* mov %rax, %cr4 */ >>> + .pushsection .altinstr_replacement, "ax" >>> +.Lcr4_alt: >>> + mov %cr4, %rax >>> + and $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax >>> + mov %rax, %cr4 >>> +.Lcr4_alt_end: >>> + .section .altinstructions, "a" >>> + altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, 12, \ >>> + (.Lcr4_alt_end - .Lcr4_alt) >>> + altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, 12, \ >>> + (.Lcr4_alt_end - .Lcr4_alt) >> These 12's look as if they should be (.Lcr4_alt - .Lcr4_orig). > Well, the NOPs that get put there make 12 (= 3 + 6 + 3) a > pretty obvious (shorter and hence more readable) option. But > yes, if you're of the strong opinion that we should use the > longer alternative, I can switch these around. I have to admit that I prefer the Linux ALTERNATIVE macro for assembly, which takes care of the calculations like this. It is slightly unfortunate that it generally requires its assembly blocks in strings, and is unsuitable for larger blocks. Perhaps we can see about an variant in due course. > >>> +/* This mustn't modify registers other than %rax. */ >>> +ENTRY(cr4_smep_smap_restore) >>> + mov %cr4, %rax >>> + test $X86_CR4_SMEP|X86_CR4_SMAP,%eax >>> + jnz 0f >>> + or cr4_smep_smap_mask(%rip), %rax >>> + mov %rax, %cr4 >>> + ret >>> +0: >>> + and cr4_smep_smap_mask(%rip), %eax >>> + cmp cr4_smep_smap_mask(%rip), %eax >>> + je 1f >>> + BUG >> What is the purpose of this bugcheck? It looks like it is catching a >> mismatch of masked options, but I am not completely sure. > This aims at detecting that some of the CR4 bits which are > expected to be set really aren't (other than the case when all > of the ones of interest here are clear). > >> For all other ASM level BUG's, I put a short comment on the same line, >> to aid people who hit the bug. > Will do. Question: Should this check perhaps become !NDEBUG > dependent? It probably should do. > >>> @@ -454,13 +455,64 @@ ENTRY(page_fault) >>> GLOBAL(handle_exception) >>> SAVE_ALL CLAC >>> handle_exception_saved: >>> + GET_CURRENT(%rbx) >>> testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) >>> jz exception_with_ints_disabled >>> - sti >>> + >>> +.Lsmep_smap_orig: >>> + jmp 0f >>> + .if 0 // GAS bug (affecting at least 2.22 ... 2.26) >>> + .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc >>> + .else >>> + // worst case: rex + opcode + modrm + 4-byte displacement >>> + .skip (1 + 1 + 1 + 4) - 2, 0xcc >>> + .endif >> Which bug is this? How does it manifest. More generally, what is this >> alternative trying to achieve? > The .org gets a warning (.Lsmep_smap_orig supposedly being > undefined, and hence getting assumed to be zero) followed by > an error (attempt to move the current location backwards). The > fix https://sourceware.org/ml/binutils/2016-03/msg00030.html > is pending approval. I presume this is down to the documented restriction about crossing sections. i.e. there is no .Lsmep_smap_orig in .altinstr_replacement, where it found the first two symbols. 
>>> + .pushsection .altinstr_replacement, "ax" >>> +.Lsmep_smap_alt: >>> + mov VCPU_domain(%rbx),%rax >>> +.Lsmep_smap_alt_end: >>> + .section .altinstructions, "a" >>> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ >>> + X86_FEATURE_SMEP, \ >>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ >>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) >>> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ >>> + X86_FEATURE_SMAP, \ >>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ >>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) >>> + .popsection >>> + >>> + testb $3,UREGS_cs(%rsp) >>> + jz 0f >>> + cmpb $0,DOMAIN_is_32bit_pv(%rax) >> This comparison is wrong on hardware lacking SMEP and SMAP, as the "mov >> VCPU_domain(%rbx),%rax" won't have happened. > That mov indeed won't have happened, but the original instruction > is a branch past all of this code, so the above is correct (and I did > test on older hardware). Oh so it wont. It is moderately subtle that this entire codeblock is logically contained in the alternative. It would be far clearer, and work around your org bug, if this was a single alternative which patched jump into a nop. At the very least, a label of .Lcr3_pv32_fixup_done would be an improvement over 0. > >>> + je 0f >>> + call cr4_smep_smap_restore >>> + /* >>> + * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in >>> + * compat_restore_all_guest and it actually returning to guest >>> + * context, in which case the guest would run with the two features >>> + * enabled. The only bad that can happen from this is a kernel mode >>> + * #PF which the guest doesn't expect. Rather than trying to make the >>> + * NMI/#MC exit path honor the intended CR4 setting, simply check >>> + * whether the wrong CR4 was in use when the #PF occurred, and exit >>> + * back to the guest (which will in turn clear the two CR4 bits) to >>> + * re-execute the instruction. If we get back here, the CR4 bits >>> + * should then be found clear (unless another NMI/#MC occurred at >>> + * exactly the right time), and we'll continue processing the >>> + * exception as normal. >>> + */ >>> + test %rax,%rax >>> + jnz 0f >>> + mov $PFEC_page_present,%al >>> + cmpb $TRAP_page_fault,UREGS_entry_vector(%rsp) >>> + jne 0f >>> + xor UREGS_error_code(%rsp),%eax >>> + test $~(PFEC_write_access|PFEC_insn_fetch),%eax >>> + jz compat_test_all_events >>> +0: sti >> Its code like this which makes me even more certain that we have far too >> much code written in assembly which doesn't need to be. Maybe not this >> specific sample, but it has taken me 15 minutes and a pad of paper to >> try and work out how this conditional works, and I am still not certain >> its correct. In particular, PFEC_prot_key looks like it fool the test >> into believing a non-smap/smep fault was a smap/smep fault. > Not sure how you come to think of PFEC_prot_key here: That's > a bit which can be set only together with PFEC_user_mode, yet > we care about kernel mode faults only here. I would not make that assumption. Assumptions about the valid set of #PF flags are precisely the reason that older Linux falls into an infinite loop when encountering a SMAP pagefault, rather than a clean crash. ~Andrew > >> Can you at least provide some C in a comment with the intended >> conditional, to aid clarity? 
> Sure, if you think that helps beyond the (I think) pretty extensive > comment: > > + test %rax,%rax > + jnz 0f > + /* > + * The below effectively is > + * if ( regs->entry_vector == TRAP_page_fault && > + * (regs->error_code & PFEC_page_present) && > + * !(regs->error_code & ~(PFEC_write_access|PFEC_insn_fetch)) ) > + * goto compat_test_all_events; > + */ > + mov $PFEC_page_present,%al > + cmpb $TRAP_page_fault,UREGS_entry_vector(%rsp) > + jne 0f > + xor UREGS_error_code(%rsp),%eax > + test $~(PFEC_write_access|PFEC_insn_fetch),%eax > + jz compat_test_all_events > +0: > > Jan
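For reference, the conditional being discussed can be restated in C as follows (a sketch mirroring the XOR trick used in the assembly; it is illustrative only, not part of the patch, and assumes Xen's struct cpu_user_regs, TRAP_page_fault and PFEC_* definitions from the surrounding code):

```c
/*
 * Sketch only: does this kernel-mode #PF look like one that SMEP/SMAP
 * could have caused, i.e. a page-present fault whose only other
 * error-code bits are write-access and/or instruction-fetch?
 */
static bool spurious_pv32_smep_smap_fault(const struct cpu_user_regs *regs)
{
    if ( regs->entry_vector != TRAP_page_fault )
        return false;

    /*
     * XORing with PFEC_page_present clears the present bit if it was set
     * (and sets it if it was clear), so the single mask test below checks
     * both that the present bit was set and that no bits other than
     * write-access / instruction-fetch accompany it.
     */
    return !((regs->error_code ^ PFEC_page_present) &
             ~(PFEC_write_access | PFEC_insn_fetch));
}
```

In the patch this test is only reached when cr4_smep_smap_restore reported that the CR4 bits were (wrongly) still set while the 32-bit PV guest was running; a match sends execution straight back to the guest so the faulting instruction is retried with SMEP/SMAP cleared.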
> -----Original Message----- > From: Andrew Cooper [mailto:andrew.cooper3@citrix.com] > Sent: Wednesday, March 9, 2016 6:46 PM > To: Wu, Feng <feng.wu@intel.com>; Jan Beulich <JBeulich@suse.com>; xen- > devel <xen-devel@lists.xenproject.org> > Cc: Keir Fraser <keir@xen.org> > Subject: Re: [PATCH 2/4] x86: suppress SMAP and SMEP while running 32-bit PV > guest code > > On 09/03/16 08:09, Wu, Feng wrote: > > >> --- a/xen/arch/x86/x86_64/entry.S > >> +++ b/xen/arch/x86/x86_64/entry.S > >> @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable) > >> > >> ENTRY(common_interrupt) > >> SAVE_ALL CLAC > >> + SMEP_SMAP_RESTORE > >> movq %rsp,%rdi > >> callq do_IRQ > >> jmp ret_from_intr > >> @@ -454,13 +455,64 @@ ENTRY(page_fault) > >> GLOBAL(handle_exception) > >> SAVE_ALL CLAC > >> handle_exception_saved: > >> + GET_CURRENT(%rbx) > >> testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) > >> jz exception_with_ints_disabled > >> - sti > >> + > >> +.Lsmep_smap_orig: > >> + jmp 0f > >> + .if 0 // GAS bug (affecting at least 2.22 ... 2.26) > >> + .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), > 0xcc > >> + .else > >> + // worst case: rex + opcode + modrm + 4-byte displacement > >> + .skip (1 + 1 + 1 + 4) - 2, 0xcc > >> + .endif > >> + .pushsection .altinstr_replacement, "ax" > >> +.Lsmep_smap_alt: > >> + mov VCPU_domain(%rbx),%rax > >> +.Lsmep_smap_alt_end: > >> + .section .altinstructions, "a" > >> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ > >> + X86_FEATURE_SMEP, \ > >> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ > >> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) > >> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ > >> + X86_FEATURE_SMAP, \ > >> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ > >> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) > >> + .popsection > >> + > >> + testb $3,UREGS_cs(%rsp) > >> + jz 0f > >> + cmpb $0,DOMAIN_is_32bit_pv(%rax) > >> + je 0f > >> + call cr4_smep_smap_restore > >> + /* > >> + * An NMI or #MC may occur between clearing CR4.SMEP and > CR4.SMAP in > > Do you mean "before" when you typed "between" above? > > The meaning is "between (clearing CR4.SMEP and CR4.SMAP in > compat_restore_all_guest) and (it actually returning to guest)" > > Nested lists in English are a source of confusion, even to native speakers. Oh, thanks for the clarification! Do you know how "An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest and it actually returning to guest context, in which case the guest would run with the two features enabled. " can happen? Especially how the guest can run with the two features enabled? Thanks, Feng > > ~Andrew > > >> + * compat_restore_all_guest and it actually returning to guest > >> + * context, in which case the guest would run with the two features > >> + * enabled. The only bad that can happen from this is a kernel mode > >> + * #PF which the guest doesn't expect. Rather than trying to make the > >> + * NMI/#MC exit path honor the intended CR4 setting, simply check > >> + * whether the wrong CR4 was in use when the #PF occurred, and exit > >> + * back to the guest (which will in turn clear the two CR4 bits) to > >> + * re-execute the instruction. If we get back here, the CR4 bits > >> + * should then be found clear (unless another NMI/#MC occurred at > >> + * exactly the right time), and we'll continue processing the > >> + * exception as normal. > >> + */ >
On 09/03/16 12:27, Wu, Feng wrote: > >> -----Original Message----- >> From: Andrew Cooper [mailto:andrew.cooper3@citrix.com] >> Sent: Wednesday, March 9, 2016 6:46 PM >> To: Wu, Feng <feng.wu@intel.com>; Jan Beulich <JBeulich@suse.com>; xen- >> devel <xen-devel@lists.xenproject.org> >> Cc: Keir Fraser <keir@xen.org> >> Subject: Re: [PATCH 2/4] x86: suppress SMAP and SMEP while running 32-bit PV >> guest code >> >> On 09/03/16 08:09, Wu, Feng wrote: >> >>>> --- a/xen/arch/x86/x86_64/entry.S >>>> +++ b/xen/arch/x86/x86_64/entry.S >>>> @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable) >>>> >>>> ENTRY(common_interrupt) >>>> SAVE_ALL CLAC >>>> + SMEP_SMAP_RESTORE >>>> movq %rsp,%rdi >>>> callq do_IRQ >>>> jmp ret_from_intr >>>> @@ -454,13 +455,64 @@ ENTRY(page_fault) >>>> GLOBAL(handle_exception) >>>> SAVE_ALL CLAC >>>> handle_exception_saved: >>>> + GET_CURRENT(%rbx) >>>> testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) >>>> jz exception_with_ints_disabled >>>> - sti >>>> + >>>> +.Lsmep_smap_orig: >>>> + jmp 0f >>>> + .if 0 // GAS bug (affecting at least 2.22 ... 2.26) >>>> + .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), >> 0xcc >>>> + .else >>>> + // worst case: rex + opcode + modrm + 4-byte displacement >>>> + .skip (1 + 1 + 1 + 4) - 2, 0xcc >>>> + .endif >>>> + .pushsection .altinstr_replacement, "ax" >>>> +.Lsmep_smap_alt: >>>> + mov VCPU_domain(%rbx),%rax >>>> +.Lsmep_smap_alt_end: >>>> + .section .altinstructions, "a" >>>> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ >>>> + X86_FEATURE_SMEP, \ >>>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ >>>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) >>>> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ >>>> + X86_FEATURE_SMAP, \ >>>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ >>>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) >>>> + .popsection >>>> + >>>> + testb $3,UREGS_cs(%rsp) >>>> + jz 0f >>>> + cmpb $0,DOMAIN_is_32bit_pv(%rax) >>>> + je 0f >>>> + call cr4_smep_smap_restore >>>> + /* >>>> + * An NMI or #MC may occur between clearing CR4.SMEP and >> CR4.SMAP in >>> Do you mean "before" when you typed "between" above? >> The meaning is "between (clearing CR4.SMEP and CR4.SMAP in >> compat_restore_all_guest) and (it actually returning to guest)" >> >> Nested lists in English are a source of confusion, even to native speakers. > Oh, thanks for the clarification! Do you know how "An NMI or #MC may occur > between clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest and > it actually returning to guest context, in which case the guest would run with > the two features enabled. " can happen? Especially how the guest can run > with the two features enabled? NMIs and MCEs can occur at any point, even if interrupts are disabled. The bad situation is this sequence: * Xen is returning to the guest and disables CR4.SMEP/SMAP * NMI occurs while still in Xen * NMI exit path sees it is returning to Xen and re-enabled CR4.SMEP/SMAP * Xen ends up returning to guest with CR4.SMEP/SMAP enabled. ~Andrew
>>>> Andrew Cooper <andrew.cooper3@citrix.com> 03/09/16 1:33 PM >>> >On 09/03/16 12:27, Wu, Feng wrote: >> Oh, thanks for the clarification! Do you know how "An NMI or #MC may occur >> between clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest and >> it actually returning to guest context, in which case the guest would run with >> the two features enabled. " can happen? Especially how the guest can run >> with the two features enabled? > >NMIs and MCEs can occur at any point, even if interrupts are disabled. > >The bad situation is this sequence: > >* Xen is returning to the guest and disables CR4.SMEP/SMAP >* NMI occurs while still in Xen >* NMI exit path sees it is returning to Xen and re-enabled CR4.SMEP/SMAP Well, almost: Re-enabling happens on the NMI entry path. The NMI exit path would, seeing it's returning to Xen context, simply not disable them again. Jan >* Xen ends up returning to guest with CR4.SMEP/SMAP enabled. > >~Andrew
> -----Original Message----- > From: Jan Beulich [mailto:jbeulich@suse.com] > Sent: Wednesday, March 9, 2016 8:37 PM > To: andrew.cooper3@citrix.com; Wu, Feng <feng.wu@intel.com>; xen- > devel@lists.xenproject.org > Cc: keir@xen.org > Subject: Re: [PATCH 2/4] x86: suppress SMAP and SMEP while running 32-bit PV > guest code > > >>>> Andrew Cooper <andrew.cooper3@citrix.com> 03/09/16 1:33 PM >>> > >On 09/03/16 12:27, Wu, Feng wrote: > >> Oh, thanks for the clarification! Do you know how "An NMI or #MC may > occur > >> between clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest and > >> it actually returning to guest context, in which case the guest would run with > >> the two features enabled. " can happen? Especially how the guest can run > >> with the two features enabled? > > > >NMIs and MCEs can occur at any point, even if interrupts are disabled. > > > >The bad situation is this sequence: > > > >* Xen is returning to the guest and disables CR4.SMEP/SMAP > >* NMI occurs while still in Xen > >* NMI exit path sees it is returning to Xen and re-enabled CR4.SMEP/SMAP > > Well, almost: Re-enabling happens on the NMI entry path. The NMI exit > path would, seeing it's returning to Xen context, simply not disable them > again. Oh, this is the point I ignored. Thanks for the clarification. Thanks, Feng > > Jan > > >* Xen ends up returning to guest with CR4.SMEP/SMAP enabled. > > > >~Andrew >
> -----Original Message----- > From: Jan Beulich [mailto:jbeulich@suse.com] > Sent: Wednesday, March 9, 2016 8:37 PM > To: andrew.cooper3@citrix.com; Wu, Feng <feng.wu@intel.com>; xen- > devel@lists.xenproject.org > Cc: keir@xen.org > Subject: Re: [PATCH 2/4] x86: suppress SMAP and SMEP while running 32-bit PV > guest code > > >>>> Andrew Cooper <andrew.cooper3@citrix.com> 03/09/16 1:33 PM >>> > >On 09/03/16 12:27, Wu, Feng wrote: > >> Oh, thanks for the clarification! Do you know how "An NMI or #MC may > occur > >> between clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest and > >> it actually returning to guest context, in which case the guest would run with > >> the two features enabled. " can happen? Especially how the guest can run > >> with the two features enabled? > > > >NMIs and MCEs can occur at any point, even if interrupts are disabled. > > > >The bad situation is this sequence: > > > >* Xen is returning to the guest and disables CR4.SMEP/SMAP > >* NMI occurs while still in Xen > >* NMI exit path sees it is returning to Xen and re-enabled CR4.SMEP/SMAP > > Well, almost: Re-enabling happens on the NMI entry path. The NMI exit > path would, seeing it's returning to Xen context, simply not disable them > again. Thinking about this again, in this case, when the NMI happens, we are in Xen context (CPL in cs is 0), so the CPL of the saved cs on the stack is 0, right? Why do we re-enable CR4.SMEP/SMAP in this case? I mean, do we only need to enable SMEP/SMAP when coming from a 32bit pv guest (CPL of cs is 1)? Thanks, Feng > > Jan > > >* Xen ends up returning to guest with CR4.SMEP/SMAP enabled. > > > >~Andrew >
On 09/03/16 13:35, Wu, Feng wrote: > >> -----Original Message----- >> From: Jan Beulich [mailto:jbeulich@suse.com] >> Sent: Wednesday, March 9, 2016 8:37 PM >> To: andrew.cooper3@citrix.com; Wu, Feng <feng.wu@intel.com>; xen- >> devel@lists.xenproject.org >> Cc: keir@xen.org >> Subject: Re: [PATCH 2/4] x86: suppress SMAP and SMEP while running 32-bit PV >> guest code >> >>>>>> Andrew Cooper <andrew.cooper3@citrix.com> 03/09/16 1:33 PM >>> >>> On 09/03/16 12:27, Wu, Feng wrote: >>>> Oh, thanks for the clarification! Do you know how "An NMI or #MC may >> occur >>>> between clearing CR4.SMEP and CR4.SMAP in compat_restore_all_guest and >>>> it actually returning to guest context, in which case the guest would run with >>>> the two features enabled. " can happen? Especially how the guest can run >>>> with the two features enabled? >>> NMIs and MCEs can occur at any point, even if interrupts are disabled. >>> >>> The bad situation is this sequence: >>> >>> * Xen is returning to the guest and disables CR4.SMEP/SMAP >>> * NMI occurs while still in Xen >>> * NMI exit path sees it is returning to Xen and re-enabled CR4.SMEP/SMAP >> Well, almost: Re-enabling happens on the NMI entry path. The NMI exit >> path would, seeing it's returning to Xen context, simply not disable them >> again. > Thinking about this again, in this case, when the NMI happens, we are in > Xen context (CPL in cs is 0), so the CPL of the saved cs in stack is 0,right? > why do we re-enable CR4.SMEP/SMAP in this case? I mean do we only > need to enable SMEP/SMAP when coming from 32bit pv guest (CPL of cs is 1) ? We always want Xen to be running with SMEP/SMAP enabled. Therefore the safer and simpler option is to always enable them if we observe them disabled. Interrupting a 32bit PV guest might end up seeing a cpl of 1 or 3, and peeking into the active struct domain to check is_pv32_domain would be a larger overhead on the entry paths. ~Andrew
>>> On 09.03.16 at 11:45, <andrew.cooper3@citrix.com> wrote: > On 09/03/16 08:09, Wu, Feng wrote: > >>> --- a/xen/arch/x86/x86_64/entry.S >>> +++ b/xen/arch/x86/x86_64/entry.S >>> @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable) >>> >>> ENTRY(common_interrupt) >>> SAVE_ALL CLAC >>> + SMEP_SMAP_RESTORE >>> movq %rsp,%rdi >>> callq do_IRQ >>> jmp ret_from_intr >>> @@ -454,13 +455,64 @@ ENTRY(page_fault) >>> GLOBAL(handle_exception) >>> SAVE_ALL CLAC >>> handle_exception_saved: >>> + GET_CURRENT(%rbx) >>> testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) >>> jz exception_with_ints_disabled >>> - sti >>> + >>> +.Lsmep_smap_orig: >>> + jmp 0f >>> + .if 0 // GAS bug (affecting at least 2.22 ... 2.26) >>> + .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), > 0xcc >>> + .else >>> + // worst case: rex + opcode + modrm + 4-byte displacement >>> + .skip (1 + 1 + 1 + 4) - 2, 0xcc >>> + .endif >>> + .pushsection .altinstr_replacement, "ax" >>> +.Lsmep_smap_alt: >>> + mov VCPU_domain(%rbx),%rax >>> +.Lsmep_smap_alt_end: >>> + .section .altinstructions, "a" >>> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ >>> + X86_FEATURE_SMEP, \ >>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ >>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) >>> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ >>> + X86_FEATURE_SMAP, \ >>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ >>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) >>> + .popsection >>> + >>> + testb $3,UREGS_cs(%rsp) >>> + jz 0f >>> + cmpb $0,DOMAIN_is_32bit_pv(%rax) >>> + je 0f >>> + call cr4_smep_smap_restore >>> + /* >>> + * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP > in >> Do you mean "before" when you typed "between" above? > > The meaning is "between (clearing CR4.SMEP and CR4.SMAP in > compat_restore_all_guest) and (it actually returning to guest)" > > Nested lists in English are a source of confusion, even to native speakers. I've switched the first "and" to / to avoid the ambiguity. Jan
>>> On 09.03.16 at 09:09, <feng.wu@intel.com> wrote: >> From: Jan Beulich [mailto:JBeulich@suse.com] >> Sent: Friday, March 4, 2016 7:28 PM >> @@ -1471,7 +1475,10 @@ void __init noreturn __start_xen(unsigne >> * copy_from_user(). >> */ >> if ( cpu_has_smap ) >> + { >> + cr4_smep_smap_mask &= ~X86_CR4_SMAP; > > You change ' cr4_smep_smap_mask ' here ... > >> write_cr4(read_cr4() & ~X86_CR4_SMAP); >> + } >> >> printk("%sNX (Execute Disable) protection %sactive\n", >> cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ", >> @@ -1488,7 +1495,10 @@ void __init noreturn __start_xen(unsigne >> panic("Could not set up DOM0 guest OS"); >> >> if ( cpu_has_smap ) >> + { >> write_cr4(read_cr4() | X86_CR4_SMAP); >> + cr4_smep_smap_mask |= X86_CR4_SMAP; > > ... and here. I am wonder whether it is because Domain 0 can actually start > running between the two place? Or I don't think the changes in the above > two places is needed. right? They are very definitely needed, to avoid hitting the BUG in cr4_pv32_restore (cr4_smep_smap_restore in this patch version) on every interrupt that occurs while Dom0 is being constructed. Jan
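For readers following the exchange, here are the two setup.c hunks in question with Jan's rationale folded in as comments (annotation only; the code is the quoted patch, the comments restate the explanation above):

```c
if ( cpu_has_smap )
{
    /*
     * SMAP is temporarily dropped so copy_from_user() can reach the dom0
     * image while it is being built.  The mask is narrowed as well, so an
     * interrupt taken during dom0 construction, which ends up in
     * cr4_smep_smap_restore, does not find the live CR4 disagreeing with
     * the mask and hit the BUG there.
     */
    cr4_smep_smap_mask &= ~X86_CR4_SMAP;
    write_cr4(read_cr4() & ~X86_CR4_SMAP);
}

/* ... dom0 is constructed here; interrupts can and do occur ... */

if ( cpu_has_smap )
{
    /* Re-enable SMAP and reflect it in the mask again. */
    write_cr4(read_cr4() | X86_CR4_SMAP);
    cr4_smep_smap_mask |= X86_CR4_SMAP;
}
```

Note the ordering: the mask is narrowed before CR4 is changed on the way down and widened after CR4 is changed on the way up, which keeps the BUG check in cr4_smep_smap_restore satisfied at every intermediate point.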
>>> On 09.03.16 at 09:09, <feng.wu@intel.com> wrote: >> >> +/* This mustn't modify registers other than %rax. */ >> >> +ENTRY(cr4_smep_smap_restore) >> >> + mov %cr4, %rax >> >> + test $X86_CR4_SMEP|X86_CR4_SMAP,%eax >> >> + jnz 0f > > If we clear every place where we are back to 32bit pv guest, > X86_CR4_SMEP and X86_CR4_SMAP bit should be clear > in CR4, right? If that is the case, we cannot jump to 0f. I think Andrew's reply to (I think) a later mail of yours already answered this, but just in case: We unconditionally come here on paths that _may_ be used when entering Xen out of 32-bit PV guest context. I.e. we do not know which state the two flags are in. Jan
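To make the two cases concrete, here is a C-level sketch of what cr4_smep_smap_restore does (illustrative only; the real routine is the assembly in the patch and must preserve every register other than %rax):

```c
/*
 * Sketch of cr4_smep_smap_restore.  Returns non-zero (the resulting CR4)
 * when SMEP/SMAP had to be switched back on - the normal case when Xen is
 * entered out of 32-bit PV guest context - and 0 when they were already
 * set, which for such an entry means the guest was running with the
 * "wrong" CR4 (the NMI/#MC race discussed elsewhere in the thread).
 */
static unsigned long cr4_smep_smap_restore_sketch(void)
{
    unsigned long cr4 = read_cr4();

    if ( !(cr4 & (X86_CR4_SMEP | X86_CR4_SMAP)) )
    {
        cr4 |= cr4_smep_smap_mask;
        write_cr4(cr4);
        return cr4;
    }

    /*
     * At least one of the two bits is set, so everything the mask expects
     * must be set; anything else is a state the code never deliberately
     * produces (hence the BUG).
     */
    BUG_ON((cr4 & cr4_smep_smap_mask) != cr4_smep_smap_mask);

    return 0;
}
```

The return value is what the caller in handle_exception examines with "test %rax,%rax" to decide whether the spurious-#PF check is needed.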
>>> On 09.03.16 at 12:19, <andrew.cooper3@citrix.com> wrote: > On 08/03/16 07:57, Jan Beulich wrote: >>>> @@ -174,10 +174,43 @@ compat_bad_hypercall: >>>> /* %rbx: struct vcpu, interrupts disabled */ >>>> ENTRY(compat_restore_all_guest) >>>> ASSERT_INTERRUPTS_DISABLED >>>> +.Lcr4_orig: >>>> + ASM_NOP3 /* mov %cr4, %rax */ >>>> + ASM_NOP6 /* and $..., %rax */ >>>> + ASM_NOP3 /* mov %rax, %cr4 */ >>>> + .pushsection .altinstr_replacement, "ax" >>>> +.Lcr4_alt: >>>> + mov %cr4, %rax >>>> + and $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax >>>> + mov %rax, %cr4 >>>> +.Lcr4_alt_end: >>>> + .section .altinstructions, "a" >>>> + altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, 12, \ >>>> + (.Lcr4_alt_end - .Lcr4_alt) >>>> + altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, 12, \ >>>> + (.Lcr4_alt_end - .Lcr4_alt) >>> These 12's look as if they should be (.Lcr4_alt - .Lcr4_orig). >> Well, the NOPs that get put there make 12 (= 3 + 6 + 3) a >> pretty obvious (shorter and hence more readable) option. But >> yes, if you're of the strong opinion that we should use the >> longer alternative, I can switch these around. > > I have to admit that I prefer the Linux ALTERNATIVE macro for assembly, > which takes care of the calculations like this. It is slightly > unfortunate that it generally requires its assembly blocks in strings, > and is unsuitable for larger blocks. Perhaps we can see about an > variant in due course. I due course to me means subsequently - is that the meaning you imply here too? But what's interesting about this suggestion: Their macro uses .skip instead of .org, which means I should be able to replace the ugly gas bug workaround by simply using .skip. I'll give that a try. >>>> + .pushsection .altinstr_replacement, "ax" >>>> +.Lsmep_smap_alt: >>>> + mov VCPU_domain(%rbx),%rax >>>> +.Lsmep_smap_alt_end: >>>> + .section .altinstructions, "a" >>>> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ >>>> + X86_FEATURE_SMEP, \ >>>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ >>>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) >>>> + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ >>>> + X86_FEATURE_SMAP, \ >>>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ >>>> + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) >>>> + .popsection >>>> + >>>> + testb $3,UREGS_cs(%rsp) >>>> + jz 0f >>>> + cmpb $0,DOMAIN_is_32bit_pv(%rax) >>> This comparison is wrong on hardware lacking SMEP and SMAP, as the "mov >>> VCPU_domain(%rbx),%rax" won't have happened. >> That mov indeed won't have happened, but the original instruction >> is a branch past all of this code, so the above is correct (and I did >> test on older hardware). > > Oh so it wont. It is moderately subtle that this entire codeblock is > logically contained in the alternative. > > It would be far clearer, and work around your org bug, if this was a > single alternative which patched jump into a nop. I specifically wanted to avoid needlessly patching a NOP in there when going forward we expect the majority of systems to have that patching done. > At the very least, a label of .Lcr3_pv32_fixup_done would be an > improvement over 0. Agreed (albeit I prefer it to be named .Lcr4_pv32_done). >>>> + je 0f >>>> + call cr4_smep_smap_restore >>>> + /* >>>> + * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in >>>> + * compat_restore_all_guest and it actually returning to guest >>>> + * context, in which case the guest would run with the two features >>>> + * enabled. 
The only bad that can happen from this is a kernel mode >>>> + * #PF which the guest doesn't expect. Rather than trying to make the >>>> + * NMI/#MC exit path honor the intended CR4 setting, simply check >>>> + * whether the wrong CR4 was in use when the #PF occurred, and exit >>>> + * back to the guest (which will in turn clear the two CR4 bits) to >>>> + * re-execute the instruction. If we get back here, the CR4 bits >>>> + * should then be found clear (unless another NMI/#MC occurred at >>>> + * exactly the right time), and we'll continue processing the >>>> + * exception as normal. >>>> + */ >>>> + test %rax,%rax >>>> + jnz 0f >>>> + mov $PFEC_page_present,%al >>>> + cmpb $TRAP_page_fault,UREGS_entry_vector(%rsp) >>>> + jne 0f >>>> + xor UREGS_error_code(%rsp),%eax >>>> + test $~(PFEC_write_access|PFEC_insn_fetch),%eax >>>> + jz compat_test_all_events >>>> +0: sti >>> Its code like this which makes me even more certain that we have far too >>> much code written in assembly which doesn't need to be. Maybe not this >>> specific sample, but it has taken me 15 minutes and a pad of paper to >>> try and work out how this conditional works, and I am still not certain >>> its correct. In particular, PFEC_prot_key looks like it fool the test >>> into believing a non-smap/smep fault was a smap/smep fault. >> Not sure how you come to think of PFEC_prot_key here: That's >> a bit which can be set only together with PFEC_user_mode, yet >> we care about kernel mode faults only here. > > I would not make that assumption. Assumptions about the valid set of > #PF flags are precisely the reason that older Linux falls into an > infinite loop when encountering a SMAP pagefault, rather than a clean crash. We have to make _some_ assumption here on the bits which so far have no meaning. Whichever route we go, we can't exclude that for things to be right with new features getting added, we may need to adjust the mask. As to the example you give for comparison - that's apples and oranges as long as Xen isn't meant to run as PV guest on some (other) hypervisor. Jan
--- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -67,6 +67,8 @@ boolean_param("smep", opt_smep); static bool_t __initdata opt_smap = 1; boolean_param("smap", opt_smap); +unsigned long __read_mostly cr4_smep_smap_mask; + /* Boot dom0 in pvh mode */ static bool_t __initdata opt_dom0pvh; boolean_param("dom0pvh", opt_dom0pvh); @@ -1335,6 +1337,8 @@ void __init noreturn __start_xen(unsigne if ( cpu_has_smap ) set_in_cr4(X86_CR4_SMAP); + cr4_smep_smap_mask = mmu_cr4_features & (X86_CR4_SMEP | X86_CR4_SMAP); + if ( cpu_has_fsgsbase ) set_in_cr4(X86_CR4_FSGSBASE); @@ -1471,7 +1475,10 @@ void __init noreturn __start_xen(unsigne * copy_from_user(). */ if ( cpu_has_smap ) + { + cr4_smep_smap_mask &= ~X86_CR4_SMAP; write_cr4(read_cr4() & ~X86_CR4_SMAP); + } printk("%sNX (Execute Disable) protection %sactive\n", cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ", @@ -1488,7 +1495,10 @@ void __init noreturn __start_xen(unsigne panic("Could not set up DOM0 guest OS"); if ( cpu_has_smap ) + { write_cr4(read_cr4() | X86_CR4_SMAP); + cr4_smep_smap_mask |= X86_CR4_SMAP; + } /* Scrub RAM that is still free and so may go to an unprivileged domain. */ scrub_heap_pages(); --- a/xen/arch/x86/x86_64/compat/entry.S +++ b/xen/arch/x86/x86_64/compat/entry.S @@ -16,14 +16,16 @@ ENTRY(compat_hypercall) ASM_CLAC pushq $0 SAVE_VOLATILE type=TRAP_syscall compat=1 + SMEP_SMAP_RESTORE cmpb $0,untrusted_msi(%rip) UNLIKELY_START(ne, msi_check) movl $HYPERCALL_VECTOR,%edi call check_for_unexpected_msi - LOAD_C_CLOBBERED + LOAD_C_CLOBBERED compat=1 ax=0 UNLIKELY_END(msi_check) + movl UREGS_rax(%rsp),%eax GET_CURRENT(%rbx) cmpl $NR_hypercalls,%eax @@ -33,7 +35,6 @@ UNLIKELY_END(msi_check) pushq UREGS_rbx(%rsp); pushq %rcx; pushq %rdx; pushq %rsi; pushq %rdi pushq UREGS_rbp+5*8(%rsp) leaq compat_hypercall_args_table(%rip),%r10 - movl %eax,%eax movl $6,%ecx subb (%r10,%rax,1),%cl movq %rsp,%rdi @@ -48,7 +49,6 @@ UNLIKELY_END(msi_check) #define SHADOW_BYTES 16 /* Shadow EIP + shadow hypercall # */ #else /* Relocate argument registers and zero-extend to 64 bits. */ - movl %eax,%eax /* Hypercall # */ xchgl %ecx,%esi /* Arg 2, Arg 4 */ movl %edx,%edx /* Arg 3 */ movl %edi,%r8d /* Arg 5 */ @@ -174,10 +174,43 @@ compat_bad_hypercall: /* %rbx: struct vcpu, interrupts disabled */ ENTRY(compat_restore_all_guest) ASSERT_INTERRUPTS_DISABLED +.Lcr4_orig: + ASM_NOP3 /* mov %cr4, %rax */ + ASM_NOP6 /* and $..., %rax */ + ASM_NOP3 /* mov %rax, %cr4 */ + .pushsection .altinstr_replacement, "ax" +.Lcr4_alt: + mov %cr4, %rax + and $~(X86_CR4_SMEP|X86_CR4_SMAP), %rax + mov %rax, %cr4 +.Lcr4_alt_end: + .section .altinstructions, "a" + altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMEP, 12, \ + (.Lcr4_alt_end - .Lcr4_alt) + altinstruction_entry .Lcr4_orig, .Lcr4_alt, X86_FEATURE_SMAP, 12, \ + (.Lcr4_alt_end - .Lcr4_alt) + .popsection RESTORE_ALL adj=8 compat=1 .Lft0: iretq _ASM_PRE_EXTABLE(.Lft0, handle_exception) +/* This mustn't modify registers other than %rax. */ +ENTRY(cr4_smep_smap_restore) + mov %cr4, %rax + test $X86_CR4_SMEP|X86_CR4_SMAP,%eax + jnz 0f + or cr4_smep_smap_mask(%rip), %rax + mov %rax, %cr4 + ret +0: + and cr4_smep_smap_mask(%rip), %eax + cmp cr4_smep_smap_mask(%rip), %eax + je 1f + BUG +1: + xor %eax, %eax + ret + /* %rdx: trap_bounce, %rbx: struct vcpu */ ENTRY(compat_post_handle_exception) testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) @@ -190,6 +223,7 @@ ENTRY(compat_post_handle_exception) /* See lstar_enter for entry register state. 
*/ ENTRY(cstar_enter) sti + SMEP_SMAP_RESTORE movq 8(%rsp),%rax /* Restore %rax. */ movq $FLAT_KERNEL_SS,8(%rsp) pushq %r11 @@ -225,6 +259,7 @@ UNLIKELY_END(compat_syscall_gpf) jmp .Lcompat_bounce_exception ENTRY(compat_sysenter) + SMEP_SMAP_RESTORE movq VCPU_trap_ctxt(%rbx),%rcx cmpb $TRAP_gp_fault,UREGS_entry_vector(%rsp) movzwl VCPU_sysenter_sel(%rbx),%eax @@ -238,6 +273,7 @@ ENTRY(compat_sysenter) jmp compat_test_all_events ENTRY(compat_int80_direct_trap) + SMEP_SMAP_RESTORE call compat_create_bounce_frame jmp compat_test_all_events --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -434,6 +434,7 @@ ENTRY(dom_crash_sync_extable) ENTRY(common_interrupt) SAVE_ALL CLAC + SMEP_SMAP_RESTORE movq %rsp,%rdi callq do_IRQ jmp ret_from_intr @@ -454,13 +455,64 @@ ENTRY(page_fault) GLOBAL(handle_exception) SAVE_ALL CLAC handle_exception_saved: + GET_CURRENT(%rbx) testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) jz exception_with_ints_disabled - sti + +.Lsmep_smap_orig: + jmp 0f + .if 0 // GAS bug (affecting at least 2.22 ... 2.26) + .org .Lsmep_smap_orig + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), 0xcc + .else + // worst case: rex + opcode + modrm + 4-byte displacement + .skip (1 + 1 + 1 + 4) - 2, 0xcc + .endif + .pushsection .altinstr_replacement, "ax" +.Lsmep_smap_alt: + mov VCPU_domain(%rbx),%rax +.Lsmep_smap_alt_end: + .section .altinstructions, "a" + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ + X86_FEATURE_SMEP, \ + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) + altinstruction_entry .Lsmep_smap_orig, .Lsmep_smap_alt, \ + X86_FEATURE_SMAP, \ + (.Lsmep_smap_alt_end - .Lsmep_smap_alt), \ + (.Lsmep_smap_alt_end - .Lsmep_smap_alt) + .popsection + + testb $3,UREGS_cs(%rsp) + jz 0f + cmpb $0,DOMAIN_is_32bit_pv(%rax) + je 0f + call cr4_smep_smap_restore + /* + * An NMI or #MC may occur between clearing CR4.SMEP and CR4.SMAP in + * compat_restore_all_guest and it actually returning to guest + * context, in which case the guest would run with the two features + * enabled. The only bad that can happen from this is a kernel mode + * #PF which the guest doesn't expect. Rather than trying to make the + * NMI/#MC exit path honor the intended CR4 setting, simply check + * whether the wrong CR4 was in use when the #PF occurred, and exit + * back to the guest (which will in turn clear the two CR4 bits) to + * re-execute the instruction. If we get back here, the CR4 bits + * should then be found clear (unless another NMI/#MC occurred at + * exactly the right time), and we'll continue processing the + * exception as normal. + */ + test %rax,%rax + jnz 0f + mov $PFEC_page_present,%al + cmpb $TRAP_page_fault,UREGS_entry_vector(%rsp) + jne 0f + xor UREGS_error_code(%rsp),%eax + test $~(PFEC_write_access|PFEC_insn_fetch),%eax + jz compat_test_all_events +0: sti 1: movq %rsp,%rdi movzbl UREGS_entry_vector(%rsp),%eax leaq exception_table(%rip),%rdx - GET_CURRENT(%rbx) PERFC_INCR(exceptions, %rax, %rbx) callq *(%rdx,%rax,8) testb $3,UREGS_cs(%rsp) @@ -592,6 +644,7 @@ handle_ist_exception: SAVE_ALL CLAC testb $3,UREGS_cs(%rsp) jz 1f + SMEP_SMAP_RESTORE /* Interrupted guest context. Copy the context to stack bottom. 
*/ GET_CPUINFO_FIELD(guest_cpu_user_regs,%rdi) movq %rsp,%rsi --- a/xen/include/asm-x86/asm_defns.h +++ b/xen/include/asm-x86/asm_defns.h @@ -209,6 +209,16 @@ void ret_from_intr(void); #define ASM_STAC ASM_AC(STAC) #define ASM_CLAC ASM_AC(CLAC) + +#define SMEP_SMAP_RESTORE \ + 667: ASM_NOP5; \ + .pushsection .altinstr_replacement, "ax"; \ + 668: call cr4_smep_smap_restore; \ + .section .altinstructions, "a"; \ + altinstruction_entry 667b, 668b, X86_FEATURE_SMEP, 5, 5; \ + altinstruction_entry 667b, 668b, X86_FEATURE_SMAP, 5, 5; \ + .popsection + #else static always_inline void clac(void) { @@ -308,14 +318,18 @@ static always_inline void stac(void) * * For the way it is used in RESTORE_ALL, this macro must preserve EFLAGS.ZF. */ -.macro LOAD_C_CLOBBERED compat=0 +.macro LOAD_C_CLOBBERED compat=0 ax=1 .if !\compat movq UREGS_r11(%rsp),%r11 movq UREGS_r10(%rsp),%r10 movq UREGS_r9(%rsp),%r9 movq UREGS_r8(%rsp),%r8 -.endif +.if \ax movq UREGS_rax(%rsp),%rax +.endif +.elseif \ax + movl UREGS_rax(%rsp),%eax +.endif movq UREGS_rcx(%rsp),%rcx movq UREGS_rdx(%rsp),%rdx movq UREGS_rsi(%rsp),%rsi --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -134,12 +134,12 @@ #define TF_kernel_mode (1<<_TF_kernel_mode) /* #PF error code values. */ -#define PFEC_page_present (1U<<0) -#define PFEC_write_access (1U<<1) -#define PFEC_user_mode (1U<<2) -#define PFEC_reserved_bit (1U<<3) -#define PFEC_insn_fetch (1U<<4) -#define PFEC_prot_key (1U<<5) +#define PFEC_page_present (_AC(1,U) << 0) +#define PFEC_write_access (_AC(1,U) << 1) +#define PFEC_user_mode (_AC(1,U) << 2) +#define PFEC_reserved_bit (_AC(1,U) << 3) +#define PFEC_insn_fetch (_AC(1,U) << 4) +#define PFEC_prot_key (_AC(1,U) << 5) /* Internally used only flags. */ #define PFEC_page_paged (1U<<16) #define PFEC_page_shared (1U<<17)