@@ -1098,6 +1098,17 @@ static void update_exception_bitmap(stru
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
if (vcpu->fpu_active)
eb &= ~(1u << NM_VECTOR);
+
+ /* When we are running a nested L2 guest and L1 specified an exception
+ * bitmap for it, we must trap the same exceptions and pass them on to
+ * L1. When running L2, we handle the exceptions specified above only
+ * if L1 did not want them.
+ */
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ u32 nested_eb = get_vmcs12_fields(vcpu)->exception_bitmap;
+ eb |= nested_eb;
+ }
+
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -1422,8 +1433,19 @@ static void vmx_fpu_activate(struct kvm_
cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
vmcs_writel(GUEST_CR0, cr0);
- update_exception_bitmap(vcpu);
vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ /* While we (L0) no longer care about NM exceptions or cr0.TS
+ * changes, our guest hypervisor (L1) might care, in which case
+ * we must trap them for it.
+ */
+ u32 eb = vmcs_read32(EXCEPTION_BITMAP) & ~(1u << NM_VECTOR);
+ struct vmcs_fields *vmcs12 = get_vmcs12_fields(vcpu);
+ eb |= vmcs12->exception_bitmap;
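+ /* cr0 bits that L1 asked to intercept (i.e., that are set in its
+ * cr0_guest_host_mask) must not remain guest-owned by L0.
+ */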
+ vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
+ vmcs_write32(EXCEPTION_BITMAP, eb);
+ } else
+ update_exception_bitmap(vcpu);
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
}
@@ -1431,12 +1453,24 @@ static void vmx_decache_cr0_guest_bits(s
static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
{
+ /* Note that vcpu->fpu_active is not cleared here; the caller must
+ * clear it *before* calling this function.
+ */
vmx_decache_cr0_guest_bits(vcpu);
vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
- update_exception_bitmap(vcpu);
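+ /* Trap #NM again. ORing into the current bitmap, rather than
+ * recomputing it, preserves any bits already merged in for L1.
+ */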
+ vmcs_write32(EXCEPTION_BITMAP,
+ vmcs_read32(EXCEPTION_BITMAP) | (1u << NM_VECTOR));
vcpu->arch.cr0_guest_owned_bits = 0;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
- vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+ if (to_vmx(vcpu)->nested.nested_mode)
+ /* Unfortunately, in nested mode we play with arch.cr0's PG
+ * bit, so we must not copy all of it, only the relevant TS bit.
+ */
+ vmcs_writel(CR0_READ_SHADOW,
+ (vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS) |
+ (vcpu->arch.cr0 & X86_CR0_TS));
+ else
+ vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
}
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -3876,6 +3910,52 @@ static void complete_insn_gp(struct kvm_
skip_emulated_instruction(vcpu);
}
+/* Called to set cr0 as appropriate for a mov-to-cr0 exit. */
+static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ /* When running L2, we usually do what L1 wants: it decides
+ * which cr0 bits to intercept, we forward it cr0-change events
+ * (see nested_vmx_exit_handled()). We only get here when a cr0
+ * bit was changed that L1 did not ask to intercept, but L0
+ * nevertheless did. Currently this can only happen with the TS
+ * bit (see CR0_GUEST_HOST_MASK in prepare_vmcs02()).
+ * We must change only this bit in GUEST_CR0 and CR0_READ_SHADOW
+ * and not call kvm_set_cr0 because it enforces a relationship
+ * between the two that is specific to KVM (i.e., only the TS
+ * bit might differ) and with which L1 might not agree.
+ */
+ unsigned long new_cr0 = vmcs_readl(GUEST_CR0);
+ unsigned long new_cr0_rs = vmcs_readl(CR0_READ_SHADOW);
+ if (val & X86_CR0_TS) {
+ new_cr0 |= X86_CR0_TS;
+ new_cr0_rs |= X86_CR0_TS;
+ vcpu->arch.cr0 |= X86_CR0_TS;
+ } else {
+ new_cr0 &= ~X86_CR0_TS;
+ new_cr0_rs &= ~X86_CR0_TS;
+ vcpu->arch.cr0 &= ~X86_CR0_TS;
+ }
+ vmcs_writel(GUEST_CR0, new_cr0);
+ vmcs_writel(CR0_READ_SHADOW, new_cr0_rs);
+ return 0;
+ } else
+ return kvm_set_cr0(vcpu, val);
+}
+
+/* Called to set cr0 as appropriate for a clts instruction exit. */
+static void handle_clts(struct kvm_vcpu *vcpu)
+{
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ /* As in handle_set_cr0(), we can't call vmx_set_cr0 here */
+ vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0) & ~X86_CR0_TS);
+ vmcs_writel(CR0_READ_SHADOW,
+ vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+ vcpu->arch.cr0 &= ~X86_CR0_TS;
+ } else
+ vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+}
+
static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
@@ -3892,7 +3972,7 @@ static int handle_cr(struct kvm_vcpu *vc
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- err = kvm_set_cr0(vcpu, val);
+ err = handle_set_cr0(vcpu, val);
complete_insn_gp(vcpu, err);
return 1;
case 3:
@@ -3918,7 +3998,7 @@ static int handle_cr(struct kvm_vcpu *vc
};
break;
case 2: /* clts */
- vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ handle_clts(vcpu);
trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
skip_emulated_instruction(vcpu);
vmx_fpu_activate(vcpu);