@@ -71,6 +71,7 @@
#include <public/memory.h>
#include <public/vm_event.h>
#include <public/arch-x86/cpuid.h>
+#include <asm/cpuid.h>
bool_t __read_mostly hvm_enabled;
@@ -4678,62 +4679,55 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
/* Fix up VLAPIC details. */
*ebx &= 0x00FFFFFFu;
*ebx |= (v->vcpu_id * 2) << 24;
+
+ *ecx &= hvm_featureset[FEATURESET_1c];
+ *edx &= hvm_featureset[FEATURESET_1d];
+
+ /* APIC exposed to guests, but fast-forward MSR_APIC_BASE.EN back in. */
if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
- __clear_bit(X86_FEATURE_APIC & 31, edx);
+ *edx &= ~cpufeat_mask(X86_FEATURE_APIC);
- /* Fix up OSXSAVE. */
- if ( *ecx & cpufeat_mask(X86_FEATURE_XSAVE) &&
- (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE) )
+ /* OSXSAVE cleared by hvm_featureset. Fast-forward CR4 back in. */
+ if ( v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE )
*ecx |= cpufeat_mask(X86_FEATURE_OSXSAVE);
- else
- *ecx &= ~cpufeat_mask(X86_FEATURE_OSXSAVE);
- /* Don't expose PCID to non-hap hvm. */
+ /* Don't expose HAP-only features to non-hap guests. */
if ( !hap_enabled(d) )
+ {
*ecx &= ~cpufeat_mask(X86_FEATURE_PCID);
-
- /* Only provide PSE36 when guest runs in 32bit PAE or in long mode */
- if ( !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) )
*edx &= ~cpufeat_mask(X86_FEATURE_PSE36);
+ }
break;
+
case 0x7:
if ( count == 0 )
{
- if ( !cpu_has_smep )
- *ebx &= ~cpufeat_mask(X86_FEATURE_SMEP);
-
- if ( !cpu_has_smap )
- *ebx &= ~cpufeat_mask(X86_FEATURE_SMAP);
+ /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
+ *ebx &= (hvm_featureset[FEATURESET_7b0] &
+ ~special_features[FEATURESET_7b0]);
+ *ebx |= (host_featureset[FEATURESET_7b0] &
+ special_features[FEATURESET_7b0]);
- /* Don't expose MPX to hvm when VMX support is not available. */
- if ( !(vmx_vmexit_control & VM_EXIT_CLEAR_BNDCFGS) ||
- !(vmx_vmentry_control & VM_ENTRY_LOAD_BNDCFGS) )
- *ebx &= ~cpufeat_mask(X86_FEATURE_MPX);
+ *ecx &= hvm_featureset[FEATURESET_7c0];
+ /* Don't expose HAP-only features to non-hap guests. */
if ( !hap_enabled(d) )
{
- /* Don't expose INVPCID to non-hap hvm. */
*ebx &= ~cpufeat_mask(X86_FEATURE_INVPCID);
- /* X86_FEATURE_PKU is not yet implemented for shadow paging. */
*ecx &= ~cpufeat_mask(X86_FEATURE_PKU);
}
- if ( (*ecx & cpufeat_mask(X86_FEATURE_PKU)) &&
- (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PKE) )
+ /* OSPKE cleared by hvm_featureset. Fast-forward CR4 back in. */
+ if ( v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PKE )
*ecx |= cpufeat_mask(X86_FEATURE_OSPKE);
- else
- *ecx &= ~cpufeat_mask(X86_FEATURE_OSPKE);
-
- /* Don't expose PCOMMIT to hvm when VMX support is not available. */
- if ( !cpu_has_vmx_pcommit )
- *ebx &= ~cpufeat_mask(X86_FEATURE_PCOMMIT);
}
-
break;
+
case 0xb:
/* Fix the x2APIC identifier. */
*edx = v->vcpu_id * 2;
break;
+
case 0xd:
/* EBX value of main leaf 0 depends on enabled xsave features */
if ( count == 0 && v->arch.xcr0 )
@@ -4750,9 +4744,12 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
*ebx = _eax + _ebx;
}
}
+
if ( count == 1 )
{
- if ( cpu_has_xsaves && cpu_has_vmx_xsaves )
+ *eax &= hvm_featureset[FEATURESET_Da1];
+
+ if ( *eax & cpufeat_mask(X86_FEATURE_XSAVES) )
{
*ebx = XSTATE_AREA_MIN_SIZE;
if ( v->arch.xcr0 | v->arch.hvm_vcpu.msr_xss )
@@ -4767,20 +4764,24 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
break;
case 0x80000001:
- /* We expose RDTSCP feature to guest only when
- tsc_mode == TSC_MODE_DEFAULT and host_tsc_is_safe() returns 1 */
- if ( d->arch.tsc_mode != TSC_MODE_DEFAULT ||
- !host_tsc_is_safe() )
- *edx &= ~cpufeat_mask(X86_FEATURE_RDTSCP);
- /* Hide 1GB-superpage feature if we can't emulate it. */
- if (!hvm_pse1gb_supported(d))
- *edx &= ~cpufeat_mask(X86_FEATURE_PAGE1GB);
- /* Only provide PSE36 when guest runs in 32bit PAE or in long mode */
- if ( !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) )
- *edx &= ~cpufeat_mask(X86_FEATURE_PSE36);
- /* Hide data breakpoint extensions if the hardware has no support. */
- if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
- *ecx &= ~cpufeat_mask(X86_FEATURE_DBEXT);
+ *ecx &= hvm_featureset[FEATURESET_e1c];
+ *edx &= hvm_featureset[FEATURESET_e1d];
+
+ /* If emulating Intel, clear the duplicated features in e1d. */
+ if ( d->arch.x86_vendor == X86_VENDOR_INTEL )
+ *edx &= ~CPUID_COMMON_1D_FEATURES;
+ /* Fast-forward MSR_APIC_BASE.EN if it hasn't already been clobbered. */
+ else if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
+ *edx &= ~cpufeat_mask(X86_FEATURE_APIC);
+
+ /* Don't expose HAP-only features to non-hap guests. */
+ if ( !hap_enabled(d) )
+ *edx &= ~(cpufeat_mask(X86_FEATURE_PAGE1GB) |
+ cpufeat_mask(X86_FEATURE_PSE36));
+ break;
+
+ case 0x80000007:
+ *edx &= hvm_featureset[FEATURESET_e7d];
break;
case 0x80000008:
@@ -4798,6 +4799,8 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
hvm_cpuid(0x80000001, NULL, NULL, NULL, &_edx);
*eax = (*eax & ~0xffff00) | (_edx & cpufeat_mask(X86_FEATURE_LM)
? 0x3000 : 0x2000);
+
+ *ebx &= hvm_featureset[FEATURESET_e8b];
break;
}
}
@@ -73,6 +73,7 @@
#include <asm/hpet.h>
#include <asm/vpmu.h>
#include <public/arch-x86/cpuid.h>
+#include <asm/cpuid.h>
#include <xsm/xsm.h>
/*
@@ -932,69 +933,86 @@ void pv_cpuid(struct cpu_user_regs *regs)
else
cpuid_count(leaf, subleaf, &a, &b, &c, &d);
- if ( (leaf & 0x7fffffff) == 0x00000001 )
- {
- /* Modify Feature Information. */
- if ( !cpu_has_apic )
- __clear_bit(X86_FEATURE_APIC, &d);
-
- if ( !is_pvh_domain(currd) )
- {
- __clear_bit(X86_FEATURE_PSE, &d);
- __clear_bit(X86_FEATURE_PGE, &d);
- __clear_bit(X86_FEATURE_PSE36, &d);
- __clear_bit(X86_FEATURE_VME, &d);
- }
- }
-
switch ( leaf )
{
case 0x00000001:
- /* Modify Feature Information. */
- if ( !cpu_has_sep )
- __clear_bit(X86_FEATURE_SEP, &d);
- __clear_bit(X86_FEATURE_DS, &d);
- __clear_bit(X86_FEATURE_TM1, &d);
- __clear_bit(X86_FEATURE_PBE, &d);
- if ( is_pvh_domain(currd) )
- __clear_bit(X86_FEATURE_MTRR, &d);
-
- __clear_bit(X86_FEATURE_DTES64 % 32, &c);
- __clear_bit(X86_FEATURE_MONITOR % 32, &c);
- __clear_bit(X86_FEATURE_DSCPL % 32, &c);
- __clear_bit(X86_FEATURE_VMX % 32, &c);
- __clear_bit(X86_FEATURE_SMX % 32, &c);
- __clear_bit(X86_FEATURE_TM2 % 32, &c);
+ c &= pv_featureset[FEATURESET_1c];
+ d &= pv_featureset[FEATURESET_1d];
+
if ( is_pv_32bit_domain(currd) )
- __clear_bit(X86_FEATURE_CX16 % 32, &c);
- __clear_bit(X86_FEATURE_XTPR % 32, &c);
- __clear_bit(X86_FEATURE_PDCM % 32, &c);
- __clear_bit(X86_FEATURE_PCID % 32, &c);
- __clear_bit(X86_FEATURE_DCA % 32, &c);
- if ( !cpu_has_xsave )
- {
- __clear_bit(X86_FEATURE_XSAVE % 32, &c);
- __clear_bit(X86_FEATURE_AVX % 32, &c);
- }
- if ( !cpu_has_apic )
- __clear_bit(X86_FEATURE_X2APIC % 32, &c);
- __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
+ c &= ~cpufeat_mask(X86_FEATURE_CX16);
+
+ /*
+ * !!! Warning - OSXSAVE handling for PV guests is non-architectural !!!
+ *
+ * Architecturally, the correct code here is simply:
+ *
+ * if ( curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE )
+ * c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
+ *
+ * However, because of bugs in Xen (before c/s bd19080b, Nov 2010, the
+ * XSAVE cpuid flag leaked into guests despite the feature not being
+ * available for use), buggy workarounds were introduced into Linux (c/s
+ * 947ccf9c, also Nov 2010) which relied on the fact that Xen also
+ * incorrectly leaked OSXSAVE into the guest.
+ *
+ * Furthermore, providing architectural OSXSAVE behaviour to many
+ * Linux PV guests triggered a further kernel bug when the fpu code
+ * observes that XSAVEOPT is available, assumes that xsave state had
+ * been set up for the task, and follows a wild pointer.
+ *
+ * Older Linux PVOPS kernels, however, do require architectural
+ * behaviour. They observe Xen's leaked OSXSAVE and assume they can
+ * already use XSETBV, dying with a #UD because the shadowed
+ * CR4.OSXSAVE is clear. This behaviour has been adjusted in all
+ * observed cases via stable backports of the above changeset.
+ *
+ * Therefore, the leaking of Xen's OSXSAVE setting has become a
+ * de facto part of the PV ABI and can't reasonably be corrected.
+ *
+ * The following situations and logic now apply:
+ *
+ * - Hardware without CPUID faulting support and native CPUID:
+ * There is nothing Xen can do here. The host's XSAVE flag will
+ * leak through and Xen's OSXSAVE choice will leak through.
+ *
+ * In the case that the guest kernel has not set up OSXSAVE, only
+ * SSE will be set in xcr0, and guest userspace can't do too much
+ * damage itself.
+ *
+ * - Enlightened CPUID or CPUID faulting available:
+ * Xen can fully control what is seen here. Guest kernels need to
+ * see the leaked OSXSAVE, but guest userspace is given
+ * architectural behaviour, to reflect the guest kernel's
+ * intentions.
+ */
+ /* OSXSAVE cleared by pv_featureset. Fast-forward CR4 back in. */
+ if ( (is_pv_domain(currd) && guest_kernel_mode(curr, regs) &&
+ (this_cpu(cr4) & X86_CR4_OSXSAVE)) ||
+ (curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
+ c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
+
+ c |= cpufeat_mask(X86_FEATURE_HYPERVISOR);
break;
case 0x00000007:
if ( subleaf == 0 )
- b &= (cpufeat_mask(X86_FEATURE_BMI1) |
- cpufeat_mask(X86_FEATURE_HLE) |
- cpufeat_mask(X86_FEATURE_AVX2) |
- cpufeat_mask(X86_FEATURE_BMI2) |
- cpufeat_mask(X86_FEATURE_ERMS) |
- cpufeat_mask(X86_FEATURE_RTM) |
- cpufeat_mask(X86_FEATURE_RDSEED) |
- cpufeat_mask(X86_FEATURE_ADX) |
- cpufeat_mask(X86_FEATURE_FSGSBASE));
+ {
+ /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
+ b &= (pv_featureset[FEATURESET_7b0] &
+ ~special_features[FEATURESET_7b0]);
+ b |= (host_featureset[FEATURESET_7b0] &
+ special_features[FEATURESET_7b0]);
+
+ c &= pv_featureset[FEATURESET_7c0];
+
+ /* OSPKE cleared by pv_featureset. Fast-forward CR4 back in. */
+ if ( curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_PKE )
+ c |= cpufeat_mask(X86_FEATURE_OSPKE);
+ }
else
- b = 0;
- a = c = d = 0;
+ b = c = 0;
+ a = d = 0;
break;
case XSTATE_CPUID:
@@ -1017,37 +1035,36 @@ void pv_cpuid(struct cpu_user_regs *regs)
}
case 1:
- a &= (boot_cpu_data.x86_capability[cpufeat_word(X86_FEATURE_XSAVEOPT)] &
- ~cpufeat_mask(X86_FEATURE_XSAVES));
+ a &= pv_featureset[FEATURESET_Da1];
b = c = d = 0;
break;
}
break;
case 0x80000001:
- /* Modify Feature Information. */
+ c &= pv_featureset[FEATURESET_e1c];
+ d &= pv_featureset[FEATURESET_e1d];
+
+ /* If emulating Intel, clear the duplicated features in e1d. */
+ if ( currd->arch.x86_vendor == X86_VENDOR_INTEL )
+ d &= ~CPUID_COMMON_1D_FEATURES;
+
if ( is_pv_32bit_domain(currd) )
{
- __clear_bit(X86_FEATURE_LM % 32, &d);
- __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
+ d &= ~cpufeat_mask(X86_FEATURE_LM);
+ c &= ~cpufeat_mask(X86_FEATURE_LAHF_LM);
+
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+ d &= ~cpufeat_mask(X86_FEATURE_SYSCALL);
}
- if ( is_pv_32bit_domain(currd) &&
- boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
- __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
- __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
- __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
-
- __clear_bit(X86_FEATURE_SVM % 32, &c);
- if ( !cpu_has_apic )
- __clear_bit(X86_FEATURE_EXTAPIC % 32, &c);
- __clear_bit(X86_FEATURE_OSVW % 32, &c);
- __clear_bit(X86_FEATURE_IBS % 32, &c);
- __clear_bit(X86_FEATURE_SKINIT % 32, &c);
- __clear_bit(X86_FEATURE_WDT % 32, &c);
- __clear_bit(X86_FEATURE_LWP % 32, &c);
- __clear_bit(X86_FEATURE_NODEID_MSR % 32, &c);
- __clear_bit(X86_FEATURE_TOPOEXT % 32, &c);
- __clear_bit(X86_FEATURE_MONITORX % 32, &c);
+ break;
+
+ case 0x80000007:
+ d &= pv_featureset[FEATURESET_e7d];
+ break;
+
+ case 0x80000008:
+ b &= pv_featureset[FEATURESET_e8b];
break;
case 0x0000000a: /* Architectural Performance Monitor Features (Intel) */
Currently, {pv,hvm}_cpuid() has a large quantity of essentially-static logic
for modifying the features visible to a guest.  A lot of this can be subsumed
by {pv,hvm}_featuremask, which identify the features available on this
hardware that could be given to a PV or HVM guest.

This is a step in the direction of full per-domain cpuid policies, but lots
more development is needed for that.  As a result, the static checks are
simplified, but the dynamic checks need to remain for now.

As a side effect, some of the logic for special features can be improved.
OSXSAVE and OSPKE will be automatically cleared because of being absent in
the featuremask.  This allows the fast-forward logic to be simpler.

In addition, there are some corrections to the existing logic:

 * Hiding PSE36 out of PAE mode is architecturally wrong.  It turns out that
   it was a bugfix for running HyperV under Xen, which wanted to see PSE36
   even after choosing to use PAE paging.  PSE36 is not supported by shadow
   paging, so is hidden from non-HAP guests, but is still visible for HAP
   guests.
 * Changing the visibility of RDTSCP based on host TSC stability or virtual
   TSC mode is bogus, so this is dropped.
 * When emulating Intel to a guest, the common features in e1d should be
   cleared.
 * The APIC bit in e1d (on non-Intel) is also a fast-forward from the
   APIC_BASE MSR.

As a small improvement, use compiler-visible &'s and |'s, rather than
{clear,set}_bit().

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>

v3:
 * Better handling of the cross-vendor case.
 * Improvements to the handling of special features.
 * Correct PSE36 to be a HAP-only feature.
 * Yet more OSXSAVE fixes for PV guests.

v2:
 * Reinstate some of the dynamic checks for now.  Future development work
   will instate a complete per-domain policy.
 * Fix OSXSAVE handling for PV guests.
---
 xen/arch/x86/hvm/hvm.c |  93 ++++++++++++++-------------
 xen/arch/x86/traps.c   | 169 +++++++++++++++++++++++++++----------------------
 2 files changed, 141 insertions(+), 121 deletions(-)
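For illustration, here is a minimal standalone sketch of the per-leaf shape
this patch introduces: one compiler-visible AND against a precomputed
featureset for the static filtering, then fast-forwarding the OS-controlled
bit back in from control-register state.  The cpufeat_*() macros mirror
Xen's asm/cpufeature.h definitions; the featureset array, CR4 value and
main() harness are hypothetical stand-ins for illustration, not Xen code.

/*
 * Standalone sketch (not Xen code) of the featureset + fast-forward
 * pattern.  The featureset offers XSAVE but deliberately omits the
 * OS-controlled OSXSAVE bit, which is OR'd back in from CR4 state.
 */
#include <stdint.h>
#include <stdio.h>

#define cpufeat_word(idx) ((idx) / 32)
#define cpufeat_bit(idx)  ((idx) % 32)
#define cpufeat_mask(idx) (1u << cpufeat_bit(idx))

#define X86_FEATURE_XSAVE   26            /* CPUID.1:ECX bit 26 */
#define X86_FEATURE_OSXSAVE 27            /* CPUID.1:ECX bit 27 */
#define X86_CR4_OSXSAVE     (1u << 18)    /* CR4 bit 18 */

#define FEATURESET_1c 0                   /* index of the leaf 1 ECX word */

int main(void)
{
    /* Featureset offering XSAVE; OSXSAVE is deliberately absent. */
    const uint32_t featureset[] = { cpufeat_mask(X86_FEATURE_XSAVE) };

    uint32_t ecx = ~0u;                   /* raw CPUID.1:ECX from hardware */
    uint32_t guest_cr4 = X86_CR4_OSXSAVE; /* guest kernel enabled OSXSAVE */

    /* Static filtering: a single compiler-visible AND. */
    ecx &= featureset[FEATURESET_1c];

    /* OSXSAVE was cleared by the featureset; fast-forward CR4 back in. */
    if ( guest_cr4 & X86_CR4_OSXSAVE )
        ecx |= cpufeat_mask(X86_FEATURE_OSXSAVE);

    printf("guest CPUID.1:ECX = %#x\n", ecx);
    return 0;
}

The constant masks are also why plain &=/|= beats __clear_bit()/__set_bit()
here: cpufeat_mask() expands to a compile-time constant which the compiler
can combine and fold, while the bit helpers operate through a pointer and
hide that from the optimiser.  Note that cpufeat_bit() yields a bit index,
not a mask, hence cpufeat_mask() in the APIC fast-forward lines above.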