From patchwork Sun Oct 17 10:15:52 2010
X-Patchwork-Submitter: Nadav Har'El
X-Patchwork-Id: 259951
Date: Sun, 17 Oct 2010 12:15:52 +0200
Message-Id: <201010171015.o9HAFqST029601@rice.haifa.ibm.com>
From: "Nadav Har'El"
To: kvm@vger.kernel.org
Cc: gleb@redhat.com, avi@redhat.com
References: <1287309814-nyh@il.ibm.com>
Subject: [PATCH 24/27] nVMX: Handling of CR0.TS and #NM for Lazy FPU loading

--- .before/arch/x86/kvm/vmx.c	2010-10-17 11:52:03.000000000 +0200
+++ .after/arch/x86/kvm/vmx.c	2010-10-17 11:52:03.000000000 +0200
@@ -1098,6 +1098,17 @@ static void update_exception_bitmap(stru
 	eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 	if (vcpu->fpu_active)
 		eb &= ~(1u << NM_VECTOR);
+
+	/* When we are running a nested L2 guest and L1 specified for it a
+	 * certain exception bitmap, we must trap the same exceptions and pass
+	 * them to L1. When running L2, we will only handle the exceptions
+	 * specified above if L1 did not want them.
+ */ + if (to_vmx(vcpu)->nested.nested_mode) { + u32 nested_eb = get_vmcs12_fields(vcpu)->exception_bitmap; + eb |= nested_eb; + } + vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -1422,8 +1433,19 @@ static void vmx_fpu_activate(struct kvm_ cr0 &= ~(X86_CR0_TS | X86_CR0_MP); cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); vmcs_writel(GUEST_CR0, cr0); - update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + if (to_vmx(vcpu)->nested.nested_mode) { + /* While we (L0) no longer care about NM exceptions or cr0.TS + * changes, our guest hypervisor (L1) might care in which case + * we must trap them for it. + */ + u32 eb = vmcs_read32(EXCEPTION_BITMAP) & ~(1u << NM_VECTOR); + struct vmcs_fields *vmcs12 = get_vmcs12_fields(vcpu); + eb |= vmcs12->exception_bitmap; + vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; + vmcs_write32(EXCEPTION_BITMAP, eb); + } else + update_exception_bitmap(vcpu); vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); } @@ -1431,12 +1453,24 @@ static void vmx_decache_cr0_guest_bits(s static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { + /* Note that there is no vcpu->fpu_active = 0 here. The caller must + * set this *before* calling this function. + */ vmx_decache_cr0_guest_bits(vcpu); vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); - update_exception_bitmap(vcpu); + vmcs_write32(EXCEPTION_BITMAP, + vmcs_read32(EXCEPTION_BITMAP) | (1u << NM_VECTOR)); vcpu->arch.cr0_guest_owned_bits = 0; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); + if (to_vmx(vcpu)->nested.nested_mode) + /* Unfortunately in nested mode we play with arch.cr0's PG + * bit, so we musn't copy it all, just the relevant TS bit + */ + vmcs_writel(CR0_READ_SHADOW, + (vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS) | + (vcpu->arch.cr0 & X86_CR0_TS)); + else + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -3876,6 +3910,52 @@ static void complete_insn_gp(struct kvm_ skip_emulated_instruction(vcpu); } +/* called to set cr0 as approriate for a mov-to-cr0 exit. */ +static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (to_vmx(vcpu)->nested.nested_mode) { + /* When running L2, we usually do what L1 wants: it decides + * which cr0 bits to intercept, we forward it cr0-change events + * (see nested_vmx_exit_handled()). We only get here when a cr0 + * bit was changed that L1 did not ask to intercept, but L0 + * nevertheless did. Currently this can only happen with the TS + * bit (see CR0_GUEST_HOST_MASK in prepare_vmcs02()). + * We must change only this bit in GUEST_CR0 and CR0_READ_SHADOW + * and not call kvm_set_cr0 because it enforces a relationship + * between the two that is specific to KVM (i.e., only the TS + * bit might differ) and with which L1 might not agree. + */ + long new_cr0 = vmcs_readl(GUEST_CR0); + long new_cr0_rs = vmcs_readl(CR0_READ_SHADOW); + if (val & X86_CR0_TS) { + new_cr0 |= X86_CR0_TS; + new_cr0_rs |= X86_CR0_TS; + vcpu->arch.cr0 |= X86_CR0_TS; + } else { + new_cr0 &= ~X86_CR0_TS; + new_cr0_rs &= ~X86_CR0_TS; + vcpu->arch.cr0 &= ~X86_CR0_TS; + } + vmcs_writel(GUEST_CR0, new_cr0); + vmcs_writel(CR0_READ_SHADOW, new_cr0_rs); + return 0; + } else + return kvm_set_cr0(vcpu, val); +} + +/* called to set cr0 as approriate for clts instruction exit. 
+static void handle_clts(struct kvm_vcpu *vcpu)
+{
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		/* As in handle_set_cr0(), we can't call vmx_set_cr0 here */
+		vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0) & ~X86_CR0_TS);
+		vmcs_writel(CR0_READ_SHADOW,
+			vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+		vcpu->arch.cr0 &= ~X86_CR0_TS;
+	} else
+		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+}
+
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification, val;
@@ -3892,7 +3972,7 @@ static int handle_cr(struct kvm_vc
 		trace_kvm_cr_write(cr, val);
 		switch (cr) {
 		case 0:
-			err = kvm_set_cr0(vcpu, val);
+			err = handle_set_cr0(vcpu, val);
 			complete_insn_gp(vcpu, err);
 			return 1;
 		case 3:
@@ -3918,7 +3998,7 @@ static int handle_cr(struct kvm_vc
 		};
 		break;
 	case 2: /* clts */
-		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+		handle_clts(vcpu);
 		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
 		skip_emulated_instruction(vcpu);
 		vmx_fpu_activate(vcpu);
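
The sketch below is not part of the patch; it is a stand-alone illustration of
the ideas the hunks above implement: while L2 runs, L0 must intercept the union
of its own exception bitmap and the one L1 programmed into vmcs12; the guest
may own only the cr0 bits that neither L0 nor L1 intercepts; and when only L0
(not L1) intercepted a cr0 write, just the TS bit may be folded into GUEST_CR0
and CR0_READ_SHADOW. The type and function names used here (l1_vmcs12,
merged_exception_bitmap, merged_cr0_guest_owned, apply_ts_only) are invented
for the example and do not exist in KVM.

#include <stdint.h>
#include <stdio.h>

#define NM_VECTOR	7		/* #NM, device-not-available */
#define X86_CR0_TS	(1UL << 3)	/* CR0.TS */

/* The two vmcs12 fields the patch consults while L2 runs. */
struct l1_vmcs12 {
	uint32_t exception_bitmap;		/* exceptions L1 wants to see */
	unsigned long cr0_guest_host_mask;	/* cr0 bits L1 intercepts */
};

/* Exceptions L0 must trap while L2 runs: its own needs plus whatever L1
 * asked for. With the FPU loaded, L0 itself no longer needs #NM, but the
 * bit stays set if L1 requested it. */
static uint32_t merged_exception_bitmap(uint32_t l0_eb, int l0_fpu_active,
					const struct l1_vmcs12 *vmcs12)
{
	if (l0_fpu_active)
		l0_eb &= ~(1u << NM_VECTOR);
	return l0_eb | vmcs12->exception_bitmap;
}

/* cr0 bits the guest may own: only those that neither L0 nor L1 intercepts. */
static unsigned long merged_cr0_guest_owned(unsigned long l0_owned,
					    const struct l1_vmcs12 *vmcs12)
{
	return l0_owned & ~vmcs12->cr0_guest_host_mask;
}

/* When only L0 intercepted the cr0 write, propagate nothing but TS. */
static unsigned long apply_ts_only(unsigned long old_cr0, unsigned long new_val)
{
	return (old_cr0 & ~X86_CR0_TS) | (new_val & X86_CR0_TS);
}

int main(void)
{
	struct l1_vmcs12 vmcs12 = {
		.exception_bitmap = 1u << NM_VECTOR,	/* L1 wants #NM */
		.cr0_guest_host_mask = X86_CR0_TS,	/* L1 intercepts TS */
	};

	/* Prints eb=0x80 (L1 still gets #NM), owned=0 (TS is no longer
	 * guest-owned), and a cr0 value with only the TS bit updated. */
	printf("eb=%#x owned=%#lx cr0=%#lx\n",
	       merged_exception_bitmap(1u << NM_VECTOR, 1, &vmcs12),
	       merged_cr0_guest_owned(X86_CR0_TS, &vmcs12),
	       apply_ts_only(0x80000031UL, X86_CR0_TS));
	return 0;
}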