
[08/31] nVMX: Fix local_vcpus_link handling

Message ID 20110522085732.GB1116@fermat.math.technion.ac.il (mailing list archive)
State New, archived

Commit Message

Nadav Har'El May 22, 2011, 8:57 a.m. UTC
On Wed, May 18, 2011, Marcelo Tosatti wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> Humpf, right. OK, you can handle the x86.c usage with
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>...

Hi Avi and Marcelo, here is the new first patch to the nvmx patch set,
which overhauls the handling of vmcss on cpus, as you asked.

As you guessed, the nested entry and exit code becomes much simpler and
cleaner, with the whole VMCS switching code on entry, for example, reduced
to:
	cpu = get_cpu();
	vmx->loaded_vmcs = vmcs02;
	vmx_vcpu_put(vcpu);
	vmx_vcpu_load(vcpu, cpu);
	vcpu->cpu = cpu;
	put_cpu();

You can apply this patch separately from the rest of the patch set, if you
wish. I'm sending just this one, like you asked - and can send the rest of
the patches when you ask me to.


Subject: [PATCH 01/31] nVMX: Keep list of loaded VMCSs, instead of vcpus.

In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it
because (at least in theory) the processor might not have written all of its
content back to memory. Since a patch from June 26, 2008, this is done using
a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU.

The problem is that with nested VMX, we no longer have the concept of a
vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for
L2s), and each of those may have been last loaded on a different cpu.

So instead of linking the vcpus, we link the VMCSs, using a new structure
loaded_vmcs. This structure contains the VMCS, and the information pertaining
to its loading on a specific cpu (namely, the cpu number, and whether it
was already launched on this cpu once). In nested mode we will also use the same
structure to hold L2 VMCSs, and vmx->loaded_vmcs is a pointer to the
currently active VMCS.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
---
 arch/x86/kvm/vmx.c |  129 ++++++++++++++++++++++++++-----------------
 arch/x86/kvm/x86.c |    3 -
 2 files changed, 80 insertions(+), 52 deletions(-)
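
For quick reference, the core of the change is the new loaded_vmcs structure,
quoted verbatim from the patch at the bottom of this page:

	/*
	 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
	 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
	 * loaded on this CPU (so we can clear them if the CPU goes down).
	 */
	struct loaded_vmcs {
		struct vmcs *vmcs;
		int cpu;
		int launched;
		struct list_head loaded_vmcss_on_cpu_link;
	};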

Comments

Avi Kivity May 23, 2011, 3:49 p.m. UTC | #1
On 05/22/2011 11:57 AM, Nadav Har'El wrote:
> Hi Avi and Marcelo, here is the new first patch to the nvmx patch set,
> which overhauls the handling of vmcss on cpus, as you asked.
>
> As you guessed, the nested entry and exit code becomes much simpler and
> cleaner, with the whole VMCS switching code on entry, for example, reduced
> to:
> 	cpu = get_cpu();
> 	vmx->loaded_vmcs = vmcs02;
> 	vmx_vcpu_put(vcpu);
> 	vmx_vcpu_load(vcpu, cpu);
> 	vcpu->cpu = cpu;
> 	put_cpu();

That's wonderful, it indicates the code is much better integrated.  
Perhaps later we can refine it to have separate _load and _put for
host-related and guest-related parts (I think they already exist in the 
code, except they are always called together), but that is an 
optimization, and not the most important one by far.
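
A sketch of what such a split might look like (all names below are
hypothetical, not from the patch; the point is that a nested VMCS switch
would then only pay for the guest-related half):

	/* Hypothetical refinement, not part of this series. */
	static void vmx_vcpu_load_guest(struct kvm_vcpu *vcpu, int cpu)
	{
		/* VMPTRLD vmx->loaded_vmcs->vmcs and migrate it to this
		   cpu's loaded_vmcss_on_cpu list, as vmx_vcpu_load() does. */
	}

	static void vmx_vcpu_put_guest(struct kvm_vcpu *vcpu)
	{
		/* Bookkeeping for the outgoing vmx->loaded_vmcs only;
		   host state stays loaded. */
	}

	static void nested_switch_to_vmcs(struct kvm_vcpu *vcpu,
					  struct loaded_vmcs *vmcs02)
	{
		int cpu = get_cpu();

		vmx_vcpu_put_guest(vcpu);
		to_vmx(vcpu)->loaded_vmcs = vmcs02;
		vmx_vcpu_load_guest(vcpu, cpu);
		vcpu->cpu = cpu;
		put_cpu();
	}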

> You can apply this patch separately from the rest of the patch set, if you
> wish. I'm sending just this one, like you asked - and can send the rest of
> the patches when you ask me to.
>
>
> Subject: [PATCH 01/31] nVMX: Keep list of loaded VMCSs, instead of vcpus.
>
> In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it
> because (at least in theory) the processor might not have written all of its
> content back to memory. Since a patch from June 26, 2008, this is done using
> a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU.
>
> The problem is that with nested VMX, we no longer have the concept of a
> vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for
> L2s), and each of those may have been last loaded on a different cpu.
>
> So instead of linking the vcpus, we link the VMCSs, using a new structure
> loaded_vmcs. This structure contains the VMCS, and the information pertaining
> to its loading on a specific cpu (namely, the cpu number, and whether it
> was already launched on this cpu once). In nested mode we will also use the same
> structure to hold L2 VMCSs, and vmx->loaded_vmcs is a pointer to the
> currently active VMCS.
>
> --- .before/arch/x86/kvm/x86.c	2011-05-22 11:41:57.000000000 +0300
> +++ .after/arch/x86/kvm/x86.c	2011-05-22 11:41:57.000000000 +0300
> @@ -2119,7 +2119,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu
>   	if (need_emulate_wbinvd(vcpu)) {
>   		if (kvm_x86_ops->has_wbinvd_exit())
>   			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
> -		else if (vcpu->cpu != -1 && vcpu->cpu != cpu
> +		else if (vcpu->cpu != -1 && vcpu->cpu != cpu
> +				&& cpu_online(vcpu->cpu))
>   			smp_call_function_single(vcpu->cpu,
>   					wbinvd_ipi, NULL, 1);
>   	}

Is this a necessary part of this patch?  Or a semi-related bugfix?

I think that it can't actually trigger before this patch due to luck.  
svm doesn't clear vcpu->cpu on cpu offline, but on the other hand it 
->has_wbinvd_exit().

Joerg, is

     if (unlikely(cpu != vcpu->cpu)) {
         svm->asid_generation = 0;
         mark_all_dirty(svm->vmcb);
     }

susceptible to cpu offline/online?

> @@ -971,22 +992,22 @@ static void vmx_vcpu_load(struct kvm_vcp
>
>   	if (!vmm_exclusive)
>   		kvm_cpu_vmxon(phys_addr);
> -	else if (vcpu->cpu != cpu)
> -		vcpu_clear(vmx);
> +	else if (vmx->loaded_vmcs->cpu != cpu)
> +		loaded_vmcs_clear(vmx->loaded_vmcs);
>
> -	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
> -		per_cpu(current_vmcs, cpu) = vmx->vmcs;
> -		vmcs_load(vmx->vmcs);
> +	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
> +		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
> +		vmcs_load(vmx->loaded_vmcs->vmcs);
>   	}
>
> -	if (vcpu->cpu != cpu) {
> +	if (vmx->loaded_vmcs->cpu != cpu) {
>   		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
>   		unsigned long sysenter_esp;
>
>   		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
>   		local_irq_disable();
> -		list_add(&vmx->local_vcpus_link,
> -			&per_cpu(vcpus_on_cpu, cpu));
> +		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
> +			&per_cpu(loaded_vmcss_on_cpu, cpu));
>   		local_irq_enable();
>
>   		/*
> @@ -999,13 +1020,15 @@ static void vmx_vcpu_load(struct kvm_vcp
>   		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
>   		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
>   	}
> +	vmx->loaded_vmcs->cpu = cpu;

This should be within the if () block.

> @@ -4344,11 +4369,13 @@ static struct kvm_vcpu *vmx_create_vcpu(
>   		goto uninit_vcpu;
>   	}
>
> -	vmx->vmcs = alloc_vmcs();
> -	if (!vmx->vmcs)
> +	vmx->loaded_vmcs = &vmx->vmcs01;
> +	vmx->loaded_vmcs->vmcs = alloc_vmcs();
> +	if (!vmx->loaded_vmcs->vmcs)
>   		goto free_msrs;
> -
> -	vmcs_init(vmx->vmcs);
> +	vmcs_init(vmx->loaded_vmcs->vmcs);
> +	vmx->loaded_vmcs->cpu = -1;
> +	vmx->loaded_vmcs->launched = 0;

Perhaps add a loaded_vmcs_init() to encapsulate initialization of these
three fields; you'll probably reuse it later.
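
One possible shape for that helper (a sketch built from the three lines
quoted above; this exact function is not in the posted patch):

	static void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
	{
		vmcs_init(loaded_vmcs->vmcs);
		loaded_vmcs->cpu = -1;
		loaded_vmcs->launched = 0;
	}

vmx_create_vcpu() would then call it once alloc_vmcs() has succeeded.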

Please repost separately after the fix, I'd like to apply it before the 
rest of the series.

(regarding interrupts, I think we can do that work post-merge.  But I'd 
like to see Kevin's comments addressed)
Gleb Natapov May 23, 2011, 4:17 p.m. UTC | #2
On Mon, May 23, 2011 at 06:49:17PM +0300, Avi Kivity wrote:
> (regarding interrupts, I think we can do that work post-merge.  But
> I'd like to see Kevin's comments addressed)
> 
To be fair, this hasn't been addressed for almost two years now.

--
			Gleb.
Joerg Roedel May 23, 2011, 4:43 p.m. UTC | #3
On Mon, May 23, 2011 at 11:49:17AM -0400, Avi Kivity wrote:

> Joerg, is
> 
>      if (unlikely(cpu != vcpu->cpu)) {
>          svm->asid_generation = 0;
>          mark_all_dirty(svm->vmcb);
>      }
> 
> susceptible to cpu offline/online?

I don't think so. This should be safe for cpu offline/online as long as
the cpu-number value is not reused for another physical cpu. But that
should be the case afaik.

	Joerg


Avi Kivity May 23, 2011, 4:51 p.m. UTC | #4
On 05/23/2011 07:43 PM, Roedel, Joerg wrote:
> On Mon, May 23, 2011 at 11:49:17AM -0400, Avi Kivity wrote:
>
> >  Joerg, is
> >
> >       if (unlikely(cpu != vcpu->cpu)) {
> >           svm->asid_generation = 0;
> >           mark_all_dirty(svm->vmcb);
> >       }
> >
> >  susceptible to cpu offline/online?
>
> I don't think so. This should be safe for cpu offline/online as long as
> the cpu-number value is not reused for another physical cpu. But that
> should be the case afaik.
>

Why not? offline/online does reuse cpu numbers AFAIK (and it must, if 
you have a fully populated machine and offline/online just one cpu).
Nadav Har'El May 23, 2011, 6:59 p.m. UTC | #5
On Mon, May 23, 2011, Gleb Natapov wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> On Mon, May 23, 2011 at 06:49:17PM +0300, Avi Kivity wrote:
> > (regarding interrupts, I think we can do that work post-merge.  But
> > I'd like to see Kevin's comments addressed)
> > 
> To be fair, this hasn't been addressed for almost two years now.

Gleb, I assume by "this" you meant the idt-vectoring information issue, not
Kevin's comments (which I only saw a couple of days ago)?
Gleb Natapov May 23, 2011, 7:03 p.m. UTC | #6
On Mon, May 23, 2011 at 09:59:01PM +0300, Nadav Har'El wrote:
> On Mon, May 23, 2011, Gleb Natapov wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> > On Mon, May 23, 2011 at 06:49:17PM +0300, Avi Kivity wrote:
> > > (regarding interrupts, I think we can do that work post-merge.  But
> > > I'd like to see Kevin's comments addressed)
> > > 
> > To be fair, this hasn't been addressed for almost two years now.
> 
> Gleb, I assume by "this" you meant the idt-vectoring information issue, not
> Kevin's comments (which I only saw a couple of days ago)?
> 
Yes, of course.

--
			Gleb.
Tian, Kevin May 24, 2011, 12:57 a.m. UTC | #7
> From: Avi Kivity
> Sent: Monday, May 23, 2011 11:49 PM
> (regarding interrupts, I think we can do that work post-merge.  But I'd
> like to see Kevin's comments addressed)

My earlier comment has been addressed by Nadav with his explanation.

Thanks
Kevin
Joerg Roedel May 24, 2011, 9:22 a.m. UTC | #8
On Mon, May 23, 2011 at 12:51:55PM -0400, Avi Kivity wrote:
> On 05/23/2011 07:43 PM, Roedel, Joerg wrote:
> > On Mon, May 23, 2011 at 11:49:17AM -0400, Avi Kivity wrote:
> >
> > >  Joerg, is
> > >
> > >       if (unlikely(cpu != vcpu->cpu)) {
> > >           svm->asid_generation = 0;
> > >           mark_all_dirty(svm->vmcb);
> > >       }
> > >
> > >  susceptible to cpu offline/online?
> >
> > I don't think so. This should be safe for cpu offline/online as long as
> > the cpu-number value is not reused for another physical cpu. But that
> > should be the case afaik.
> >
> 
> Why not? offline/online does reuse cpu numbers AFAIK (and it must, if 
> you have a fully populated machine and offline/online just one cpu).

Yes, you are right. There is a slight possibility that the asid is not
updated when a vcpu has asid_generation == 1 and hasn't been running on
another cpu while this given cpu was offlined/onlined. Very unlikely,
but we cannot rule it out.
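
To spell the race out (my reconstruction of the scenario, not text from
the thread):

	/*
	 * 1. The vcpu last ran on cpu N: svm->asid_generation = 1,
	 *    vcpu->cpu = N.
	 * 2. cpu N goes offline; its per-cpu ASID bookkeeping is
	 *    reinitialized.
	 * 3. cpu N comes back online, reusing the same cpu number.
	 * 4. The vcpu runs on cpu N again: cpu == vcpu->cpu, so the
	 *    "if (unlikely(cpu != vcpu->cpu))" reset is skipped and the
	 *    stale ASID may be used against the fresh per-cpu state.
	 */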

Probably we should make the local_vcpu_list from vmx generic, use it
from svm  and fix it this way.

	Joerg
Nadav Har'El May 24, 2011, 9:28 a.m. UTC | #9
On Tue, May 24, 2011, Roedel, Joerg wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> Probably we should make the local_vcpu_list from vmx generic, use it
> from svm  and fix it this way.

The point is, local_vcpu_list is now gone, replaced by a loaded_vmcss_on_cpu,
and vcpu->cpu is not set to -1 for any vcpu when a CPU is offlined - also in
VMX...
Joerg Roedel May 24, 2011, 9:57 a.m. UTC | #10
On Tue, May 24, 2011 at 05:28:38AM -0400, Nadav Har'El wrote:
> On Tue, May 24, 2011, Roedel, Joerg wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> > Probably we should make the local_vcpu_list from vmx generic, use it
> > from svm  and fix it this way.
> 
> The point is, local_vcpu_list is now gone, replaced by a loaded_vmcss_on_cpu,
> and vcpu->cpu is not set to -1 for any vcpu when a CPU is offlined - also in
> VMX...

loaded_vmcss_on_cpu sounds similar; probably this can be generalized. Is
this code already upstream or is this changed with your nVMX patch-set?

	Joerg
Avi Kivity May 24, 2011, 10:08 a.m. UTC | #11
On 05/24/2011 12:57 PM, Roedel, Joerg wrote:
> On Tue, May 24, 2011 at 05:28:38AM -0400, Nadav Har'El wrote:
> >  On Tue, May 24, 2011, Roedel, Joerg wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> >  >  Probably we should make the local_vcpu_list from vmx generic, use it
> >  >  from svm  and fix it this way.
> >
> >  The point is, local_vcpu_list is now gone, replaced by a loaded_vmcss_on_cpu,
> >  and vcpu->cpu is not set to -1 for any vcpu when a CPU is offlined - also in
> >  VMX...
>
> loaded_vmcss_on_cpu sounds similar; probably this can be generalized.

It's not the same: there is a many:1 relationship between vmcss and
vcpus (unlike vmcbs and vcpus, which are 1:1).

However, it may be that the general case for svm also needs to treat 
individual vmcbs differently.
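
A generic facility would presumably have to track arch-specific "loaded
state" objects rather than vcpus; purely speculatively (nothing like this
exists in the series), something like:

	/* Hypothetical arch-neutral per-cpu tracking of loaded hw state. */
	struct loaded_on_cpu {
		int cpu;			/* -1 when not loaded anywhere */
		struct list_head link;		/* per-cpu list membership */
		void (*clear)(struct loaded_on_cpu *l); /* VMCLEAR, ASID flush, ... */
	};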


> Is
> this code already upstream or is this changed with your nVMX patch-set?
>

Not upstream yet (however generalization, if needed, will be done after 
it's upstream).
Nadav Har'El May 24, 2011, 10:12 a.m. UTC | #12
On Tue, May 24, 2011, Roedel, Joerg wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> loaded_vmcss_on_cpu sounds similar; probably this can be generalized.

I don't think so - now that a VCPU may have several VMCSs (L1, L2), each
of those may be loaded on a different cpu, so we keep a list of VMCSs
(the new loaded_vmcs structure), not vcpus. When we offline a CPU, we go over
this list and clear all VMCSs loaded on it; we mark cpu = -1 for each of
those VMCSs, but vcpu->cpu remains untouched (and is not set to -1) for all
the vcpus.
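
(The offline path in question, condensed from the patch at the bottom of
this page: hardware_disable() runs on the cpu being brought down and calls

	static void vmclear_local_loaded_vmcss(void)
	{
		int cpu = raw_smp_processor_id();
		struct loaded_vmcs *v, *n;

		list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
					 loaded_vmcss_on_cpu_link)
			__loaded_vmcs_clear(v);
	}

where __loaded_vmcs_clear() does the VMCLEAR, unlinks the entry and sets
loaded_vmcs->cpu = -1.)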

> Is this code already upstream or is this changed with your nVMX patch-set?

Avi asked me to send the patch that does this *before* nvmx, but he has
not merged it yet.

Patch

--- .before/arch/x86/kvm/x86.c	2011-05-22 11:41:57.000000000 +0300
+++ .after/arch/x86/kvm/x86.c	2011-05-22 11:41:57.000000000 +0300
@@ -2119,7 +2119,8 @@  void kvm_arch_vcpu_load(struct kvm_vcpu 
 	if (need_emulate_wbinvd(vcpu)) {
 		if (kvm_x86_ops->has_wbinvd_exit())
 			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
-		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+		else if (vcpu->cpu != -1 && vcpu->cpu != cpu
+				&& cpu_online(vcpu->cpu))
 			smp_call_function_single(vcpu->cpu,
 					wbinvd_ipi, NULL, 1);
 	}
--- .before/arch/x86/kvm/vmx.c	2011-05-22 11:41:57.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c	2011-05-22 11:41:58.000000000 +0300
@@ -116,6 +116,18 @@  struct vmcs {
 	char data[0];
 };
 
+/*
+ * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
+ * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
+ * loaded on this CPU (so we can clear them if the CPU goes down).
+ */
+struct loaded_vmcs {
+	struct vmcs *vmcs;
+	int cpu;
+	int launched;
+	struct list_head loaded_vmcss_on_cpu_link;
+};
+
 struct shared_msr_entry {
 	unsigned index;
 	u64 data;
@@ -124,9 +136,7 @@  struct shared_msr_entry {
 
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
-	struct list_head      local_vcpus_link;
 	unsigned long         host_rsp;
-	int                   launched;
 	u8                    fail;
 	u8                    cpl;
 	bool                  nmi_known_unmasked;
@@ -140,7 +150,14 @@  struct vcpu_vmx {
 	u64 		      msr_host_kernel_gs_base;
 	u64 		      msr_guest_kernel_gs_base;
 #endif
-	struct vmcs          *vmcs;
+	/*
+	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+	 * non-nested (L1) guest, it always points to vmcs01. For a nested
+	 * guest (L2), it points to a different VMCS.
+	 */
+	struct loaded_vmcs    vmcs01;
+	struct loaded_vmcs   *loaded_vmcs;
+	bool                  __launched; /* temporary, used in vmx_vcpu_run */
 	struct msr_autoload {
 		unsigned nr;
 		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
@@ -200,7 +217,11 @@  static int vmx_set_tss_addr(struct kvm *
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+/*
+ * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
+ * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ */
+static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
 static unsigned long *vmx_io_bitmap_a;
@@ -514,25 +535,25 @@  static void vmcs_load(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 }
 
-static void __vcpu_clear(void *arg)
+static void __loaded_vmcs_clear(void *arg)
 {
-	struct vcpu_vmx *vmx = arg;
+	struct loaded_vmcs *loaded_vmcs = arg;
 	int cpu = raw_smp_processor_id();
 
-	if (vmx->vcpu.cpu == cpu)
-		vmcs_clear(vmx->vmcs);
-	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
+	if (loaded_vmcs->cpu == cpu)
+		vmcs_clear(loaded_vmcs->vmcs);
+	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
 		per_cpu(current_vmcs, cpu) = NULL;
-	list_del(&vmx->local_vcpus_link);
-	vmx->vcpu.cpu = -1;
-	vmx->launched = 0;
+	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+	loaded_vmcs->cpu = -1;
+	loaded_vmcs->launched = 0;
 }
 
-static void vcpu_clear(struct vcpu_vmx *vmx)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 {
-	if (vmx->vcpu.cpu == -1)
-		return;
-	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
+	if (loaded_vmcs->cpu != -1)
+		smp_call_function_single(
+			loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
 }
 
 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -971,22 +992,22 @@  static void vmx_vcpu_load(struct kvm_vcp
 
 	if (!vmm_exclusive)
 		kvm_cpu_vmxon(phys_addr);
-	else if (vcpu->cpu != cpu)
-		vcpu_clear(vmx);
+	else if (vmx->loaded_vmcs->cpu != cpu)
+		loaded_vmcs_clear(vmx->loaded_vmcs);
 
-	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-		per_cpu(current_vmcs, cpu) = vmx->vmcs;
-		vmcs_load(vmx->vmcs);
+	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+		vmcs_load(vmx->loaded_vmcs->vmcs);
 	}
 
-	if (vcpu->cpu != cpu) {
+	if (vmx->loaded_vmcs->cpu != cpu) {
 		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 		unsigned long sysenter_esp;
 
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		local_irq_disable();
-		list_add(&vmx->local_vcpus_link,
-			 &per_cpu(vcpus_on_cpu, cpu));
+		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
+			 &per_cpu(loaded_vmcss_on_cpu, cpu));
 		local_irq_enable();
 
 		/*
@@ -999,13 +1020,15 @@  static void vmx_vcpu_load(struct kvm_vcp
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
 	}
+	vmx->loaded_vmcs->cpu = cpu;
 }
 
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	__vmx_load_host_state(to_vmx(vcpu));
 	if (!vmm_exclusive) {
-		__vcpu_clear(to_vmx(vcpu));
+		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
+		vcpu->cpu = -1;
 		kvm_cpu_vmxoff();
 	}
 }
@@ -1469,7 +1492,7 @@  static int hardware_enable(void *garbage
 	if (read_cr4() & X86_CR4_VMXE)
 		return -EBUSY;
 
-	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
+	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
 	test_bits = FEATURE_CONTROL_LOCKED;
@@ -1493,14 +1516,14 @@  static int hardware_enable(void *garbage
 	return 0;
 }
 
-static void vmclear_local_vcpus(void)
+static void vmclear_local_loaded_vmcss(void)
 {
 	int cpu = raw_smp_processor_id();
-	struct vcpu_vmx *vmx, *n;
+	struct loaded_vmcs *v, *n;
 
-	list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
-				 local_vcpus_link)
-		__vcpu_clear(vmx);
+	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+				 loaded_vmcss_on_cpu_link)
+		__loaded_vmcs_clear(v);
 }
 
 
@@ -1515,7 +1538,7 @@  static void kvm_cpu_vmxoff(void)
 static void hardware_disable(void *garbage)
 {
 	if (vmm_exclusive) {
-		vmclear_local_vcpus();
+		vmclear_local_loaded_vmcss();
 		kvm_cpu_vmxoff();
 	}
 	write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -1696,6 +1719,18 @@  static void free_vmcs(struct vmcs *vmcs)
 	free_pages((unsigned long)vmcs, vmcs_config.order);
 }
 
+/*
+ * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
+ */
+static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+	if (!loaded_vmcs->vmcs)
+		return;
+	loaded_vmcs_clear(loaded_vmcs);
+	free_vmcs(loaded_vmcs->vmcs);
+	loaded_vmcs->vmcs = NULL;
+}
+
 static void free_kvm_area(void)
 {
 	int cpu;
@@ -4166,6 +4201,7 @@  static void __noclone vmx_vcpu_run(struc
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 
+	vmx->__launched = vmx->loaded_vmcs->launched;
 	asm(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
@@ -4236,7 +4272,7 @@  static void __noclone vmx_vcpu_run(struc
 		"pop  %%"R"bp; pop  %%"R"dx \n\t"
 		"setbe %c[fail](%0) \n\t"
 	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
-		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
+		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
 		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
 		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
 		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
@@ -4276,7 +4312,7 @@  static void __noclone vmx_vcpu_run(struc
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
-	vmx->launched = 1;
+	vmx->loaded_vmcs->launched = 1;
 
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
@@ -4288,23 +4324,12 @@  static void __noclone vmx_vcpu_run(struc
 #undef R
 #undef Q
 
-static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (vmx->vmcs) {
-		vcpu_clear(vmx);
-		free_vmcs(vmx->vmcs);
-		vmx->vmcs = NULL;
-	}
-}
-
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	free_vpid(vmx);
-	vmx_free_vmcs(vcpu);
+	free_loaded_vmcs(vmx->loaded_vmcs);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -4344,11 +4369,13 @@  static struct kvm_vcpu *vmx_create_vcpu(
 		goto uninit_vcpu;
 	}
 
-	vmx->vmcs = alloc_vmcs();
-	if (!vmx->vmcs)
+	vmx->loaded_vmcs = &vmx->vmcs01;
+	vmx->loaded_vmcs->vmcs = alloc_vmcs();
+	if (!vmx->loaded_vmcs->vmcs)
 		goto free_msrs;
-
-	vmcs_init(vmx->vmcs);
+	vmcs_init(vmx->loaded_vmcs->vmcs);
+	vmx->loaded_vmcs->cpu = -1;
+	vmx->loaded_vmcs->launched = 0;
 
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4377,7 +4404,7 @@  static struct kvm_vcpu *vmx_create_vcpu(
 	return &vmx->vcpu;
 
 free_vmcs:
-	free_vmcs(vmx->vmcs);
+	free_vmcs(vmx->loaded_vmcs->vmcs);
 free_msrs:
 	kfree(vmx->guest_msrs);
 uninit_vcpu: