From patchwork Sun Oct 17 10:15:52 2010
X-Patchwork-Submitter: Nadav Har'El
X-Patchwork-Id: 259951
Date: Sun, 17 Oct 2010 12:15:52 +0200
Message-Id: <201010171015.o9HAFqST029601@rice.haifa.ibm.com>
From: "Nadav Har'El"
To: kvm@vger.kernel.org
Cc: gleb@redhat.com, avi@redhat.com
References: <1287309814-nyh@il.ibm.com>
Subject: [PATCH 24/27] nVMX: Handling of CR0.TS and #NM for Lazy FPU loading

--- .before/arch/x86/kvm/vmx.c	2010-10-17 11:52:03.000000000 +0200
+++ .after/arch/x86/kvm/vmx.c	2010-10-17 11:52:03.000000000 +0200
@@ -1098,6 +1098,17 @@ static void update_exception_bitmap(stru
 	eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 	if (vcpu->fpu_active)
 		eb &= ~(1u << NM_VECTOR);
+
+	/* When we are running a nested L2 guest and L1 specified for it a
+	 * certain exception bitmap, we must trap the same exceptions and pass
+	 * them to L1. When running L2, we will only handle the exceptions
+	 * specified above if L1 did not want them.
+ */ + if (to_vmx(vcpu)->nested.nested_mode) { + u32 nested_eb = get_vmcs12_fields(vcpu)->exception_bitmap; + eb |= nested_eb; + } + vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -1422,8 +1433,19 @@ static void vmx_fpu_activate(struct kvm_ cr0 &= ~(X86_CR0_TS | X86_CR0_MP); cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); vmcs_writel(GUEST_CR0, cr0); - update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + if (to_vmx(vcpu)->nested.nested_mode) { + /* While we (L0) no longer care about NM exceptions or cr0.TS + * changes, our guest hypervisor (L1) might care in which case + * we must trap them for it. + */ + u32 eb = vmcs_read32(EXCEPTION_BITMAP) & ~(1u << NM_VECTOR); + struct vmcs_fields *vmcs12 = get_vmcs12_fields(vcpu); + eb |= vmcs12->exception_bitmap; + vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; + vmcs_write32(EXCEPTION_BITMAP, eb); + } else + update_exception_bitmap(vcpu); vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); } @@ -1431,12 +1453,24 @@ static void vmx_decache_cr0_guest_bits(s static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { + /* Note that there is no vcpu->fpu_active = 0 here. The caller must + * set this *before* calling this function. + */ vmx_decache_cr0_guest_bits(vcpu); vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); - update_exception_bitmap(vcpu); + vmcs_write32(EXCEPTION_BITMAP, + vmcs_read32(EXCEPTION_BITMAP) | (1u << NM_VECTOR)); vcpu->arch.cr0_guest_owned_bits = 0; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); + if (to_vmx(vcpu)->nested.nested_mode) + /* Unfortunately in nested mode we play with arch.cr0's PG + * bit, so we musn't copy it all, just the relevant TS bit + */ + vmcs_writel(CR0_READ_SHADOW, + (vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS) | + (vcpu->arch.cr0 & X86_CR0_TS)); + else + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -3876,6 +3910,52 @@ static void complete_insn_gp(struct kvm_ skip_emulated_instruction(vcpu); } +/* called to set cr0 as approriate for a mov-to-cr0 exit. */ +static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (to_vmx(vcpu)->nested.nested_mode) { + /* When running L2, we usually do what L1 wants: it decides + * which cr0 bits to intercept, we forward it cr0-change events + * (see nested_vmx_exit_handled()). We only get here when a cr0 + * bit was changed that L1 did not ask to intercept, but L0 + * nevertheless did. Currently this can only happen with the TS + * bit (see CR0_GUEST_HOST_MASK in prepare_vmcs02()). + * We must change only this bit in GUEST_CR0 and CR0_READ_SHADOW + * and not call kvm_set_cr0 because it enforces a relationship + * between the two that is specific to KVM (i.e., only the TS + * bit might differ) and with which L1 might not agree. + */ + long new_cr0 = vmcs_readl(GUEST_CR0); + long new_cr0_rs = vmcs_readl(CR0_READ_SHADOW); + if (val & X86_CR0_TS) { + new_cr0 |= X86_CR0_TS; + new_cr0_rs |= X86_CR0_TS; + vcpu->arch.cr0 |= X86_CR0_TS; + } else { + new_cr0 &= ~X86_CR0_TS; + new_cr0_rs &= ~X86_CR0_TS; + vcpu->arch.cr0 &= ~X86_CR0_TS; + } + vmcs_writel(GUEST_CR0, new_cr0); + vmcs_writel(CR0_READ_SHADOW, new_cr0_rs); + return 0; + } else + return kvm_set_cr0(vcpu, val); +} + +/* called to set cr0 as approriate for clts instruction exit. 
+static void handle_clts(struct kvm_vcpu *vcpu)
+{
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		/* As in handle_set_cr0(), we can't call vmx_set_cr0 here */
+		vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0) & ~X86_CR0_TS);
+		vmcs_writel(CR0_READ_SHADOW,
+			vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+		vcpu->arch.cr0 &= ~X86_CR0_TS;
+	} else
+		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+}
+
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification, val;
@@ -3892,7 +3972,7 @@ static int handle_cr(struct kvm_vc
 		trace_kvm_cr_write(cr, val);
 		switch (cr) {
 		case 0:
-			err = kvm_set_cr0(vcpu, val);
+			err = handle_set_cr0(vcpu, val);
 			complete_insn_gp(vcpu, err);
 			return 1;
 		case 3:
@@ -3918,7 +3998,7 @@ static int handle_cr(struct kvm_vc
 		};
 		break;
 	case 2: /* clts */
-		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+		handle_clts(vcpu);
 		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
 		skip_emulated_instruction(vcpu);
 		vmx_fpu_activate(vcpu);
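
The sketch below is not part of the patch; it is a stand-alone illustration of
the ideas the hunks above implement: while L2 runs, L0 must intercept the union
of its own exception bitmap and the one L1 programmed into vmcs12; the guest
may own only the cr0 bits that neither L0 nor L1 intercepts; and when only L0
(not L1) intercepted a cr0 write, just the TS bit may be folded into GUEST_CR0
and CR0_READ_SHADOW. The type and function names used here (l1_vmcs12,
merged_exception_bitmap, merged_cr0_guest_owned, apply_ts_only) are invented
for the example and do not exist in KVM.

#include <stdint.h>
#include <stdio.h>

#define NM_VECTOR	7		/* #NM, device-not-available */
#define X86_CR0_TS	(1UL << 3)	/* CR0.TS */

/* The two vmcs12 fields the patch consults while L2 runs. */
struct l1_vmcs12 {
	uint32_t exception_bitmap;		/* exceptions L1 wants to see */
	unsigned long cr0_guest_host_mask;	/* cr0 bits L1 intercepts */
};

/* Exceptions L0 must trap while L2 runs: its own needs plus whatever L1
 * asked for. With the FPU loaded, L0 itself no longer needs #NM, but the
 * bit stays set if L1 requested it. */
static uint32_t merged_exception_bitmap(uint32_t l0_eb, int l0_fpu_active,
					const struct l1_vmcs12 *vmcs12)
{
	if (l0_fpu_active)
		l0_eb &= ~(1u << NM_VECTOR);
	return l0_eb | vmcs12->exception_bitmap;
}

/* cr0 bits the guest may own: only those that neither L0 nor L1 intercepts. */
static unsigned long merged_cr0_guest_owned(unsigned long l0_owned,
					    const struct l1_vmcs12 *vmcs12)
{
	return l0_owned & ~vmcs12->cr0_guest_host_mask;
}

/* When only L0 intercepted the cr0 write, propagate nothing but TS. */
static unsigned long apply_ts_only(unsigned long old_cr0, unsigned long new_val)
{
	return (old_cr0 & ~X86_CR0_TS) | (new_val & X86_CR0_TS);
}

int main(void)
{
	struct l1_vmcs12 vmcs12 = {
		.exception_bitmap = 1u << NM_VECTOR,	/* L1 wants #NM */
		.cr0_guest_host_mask = X86_CR0_TS,	/* L1 intercepts TS */
	};

	/* Prints eb=0x80 (L1 still gets #NM), owned=0 (TS is no longer
	 * guest-owned), and a cr0 value with only the TS bit updated. */
	printf("eb=%#x owned=%#lx cr0=%#lx\n",
	       merged_exception_bitmap(1u << NM_VECTOR, 1, &vmcs12),
	       merged_cr0_guest_owned(X86_CR0_TS, &vmcs12),
	       apply_ts_only(0x80000031UL, X86_CR0_TS));
	return 0;
}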