diff mbox

new version of loaded_vmcs patch

Message ID 20110524122610.GA8549@fermat.math.technion.ac.il (mailing list archive)
State New, archived
Headers show

Commit Message

Nadav Har'El May 24, 2011, 12:26 p.m. UTC
Hi Avi, here is a updated version of the loaded_vmcs patch which you asked me
to send before the rest of the nvmx patches.

Please let me know when you'd like me to send you an updated version of
the rest of the patches.

----------------------------------------------------------------------------
Subject: [PATCH 01/31] nVMX: Keep list of loaded VMCSs, instead of vcpus.

In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it
because (at least in theory) the processor might not have written all of its
content back to memory. Since a patch from June 26, 2008, this is done using
a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU.

The problem is that with nested VMX, we no longer have the concept of a
vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for
L2s), and each of those may be have been last loaded on a different cpu.

So instead of linking the vcpus, we link the VMCSs, using a new structure
loaded_vmcs. This structure contains the VMCS, and the information pertaining
to its loading on a specific cpu (namely, the cpu number, and whether it
was already launched on this cpu once). In nested we will also use the same
structure to hold L2 VMCSs, and vmx->loaded_vmcs is a pointer to the
currently active VMCS.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
---
 arch/x86/kvm/vmx.c |  150 ++++++++++++++++++++++++-------------------
 1 file changed, 86 insertions(+), 64 deletions(-)

Comments

Tian, Kevin May 24, 2011, 1:02 p.m. UTC | #1
> From: Nadav Har'El
> Sent: Tuesday, May 24, 2011 8:26 PM
> 
> Hi Avi, here is a updated version of the loaded_vmcs patch which you asked me
> to send before the rest of the nvmx patches.
> 
> Please let me know when you'd like me to send you an updated version of
> the rest of the patches.
> 
> ----------------------------------------------------------------------------
> Subject: [PATCH 01/31] nVMX: Keep list of loaded VMCSs, instead of vcpus.
> 
> In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it
> because (at least in theory) the processor might not have written all of its
> content back to memory. Since a patch from June 26, 2008, this is done using
> a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU.
> 
> The problem is that with nested VMX, we no longer have the concept of a
> vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for
> L2s), and each of those may be have been last loaded on a different cpu.
> 
> So instead of linking the vcpus, we link the VMCSs, using a new structure
> loaded_vmcs. This structure contains the VMCS, and the information
> pertaining
> to its loading on a specific cpu (namely, the cpu number, and whether it
> was already launched on this cpu once). In nested we will also use the same
> structure to hold L2 VMCSs, and vmx->loaded_vmcs is a pointer to the
> currently active VMCS.
> 
> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>

Acked-by: Kevin Tian <kevin.tian@intel.com>

Thanks
Kevin

> ---
>  arch/x86/kvm/vmx.c |  150 ++++++++++++++++++++++++-------------------
>  1 file changed, 86 insertions(+), 64 deletions(-)
> 
> --- .before/arch/x86/kvm/vmx.c	2011-05-24 15:12:22.000000000 +0300
> +++ .after/arch/x86/kvm/vmx.c	2011-05-24 15:12:22.000000000 +0300
> @@ -116,6 +116,18 @@ struct vmcs {
>  	char data[0];
>  };
> 
> +/*
> + * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
> + * remember whether it was VMLAUNCHed, and maintain a linked list of all
> VMCSs
> + * loaded on this CPU (so we can clear them if the CPU goes down).
> + */
> +struct loaded_vmcs {
> +	struct vmcs *vmcs;
> +	int cpu;
> +	int launched;
> +	struct list_head loaded_vmcss_on_cpu_link;
> +};
> +
>  struct shared_msr_entry {
>  	unsigned index;
>  	u64 data;
> @@ -124,9 +136,7 @@ struct shared_msr_entry {
> 
>  struct vcpu_vmx {
>  	struct kvm_vcpu       vcpu;
> -	struct list_head      local_vcpus_link;
>  	unsigned long         host_rsp;
> -	int                   launched;
>  	u8                    fail;
>  	u8                    cpl;
>  	bool                  nmi_known_unmasked;
> @@ -140,7 +150,14 @@ struct vcpu_vmx {
>  	u64 		      msr_host_kernel_gs_base;
>  	u64 		      msr_guest_kernel_gs_base;
>  #endif
> -	struct vmcs          *vmcs;
> +	/*
> +	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
> +	 * non-nested (L1) guest, it always points to vmcs01. For a nested
> +	 * guest (L2), it points to a different VMCS.
> +	 */
> +	struct loaded_vmcs    vmcs01;
> +	struct loaded_vmcs   *loaded_vmcs;
> +	bool                  __launched; /* temporary, used in
> vmx_vcpu_run */
>  	struct msr_autoload {
>  		unsigned nr;
>  		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
> @@ -200,7 +217,11 @@ static int vmx_set_tss_addr(struct kvm *
> 
>  static DEFINE_PER_CPU(struct vmcs *, vmxarea);
>  static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
> -static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
> +/*
> + * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is
> needed
> + * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded
> on it.
> + */
> +static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
>  static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
> 
>  static unsigned long *vmx_io_bitmap_a;
> @@ -501,6 +522,13 @@ static void vmcs_clear(struct vmcs *vmcs
>  		       vmcs, phys_addr);
>  }
> 
> +static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
> +{
> +	vmcs_clear(loaded_vmcs->vmcs);
> +	loaded_vmcs->cpu = -1;
> +	loaded_vmcs->launched = 0;
> +}
> +
>  static void vmcs_load(struct vmcs *vmcs)
>  {
>  	u64 phys_addr = __pa(vmcs);
> @@ -514,25 +542,24 @@ static void vmcs_load(struct vmcs *vmcs)
>  		       vmcs, phys_addr);
>  }
> 
> -static void __vcpu_clear(void *arg)
> +static void __loaded_vmcs_clear(void *arg)
>  {
> -	struct vcpu_vmx *vmx = arg;
> +	struct loaded_vmcs *loaded_vmcs = arg;
>  	int cpu = raw_smp_processor_id();
> 
> -	if (vmx->vcpu.cpu == cpu)
> -		vmcs_clear(vmx->vmcs);
> -	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
> +	if (loaded_vmcs->cpu != cpu)
> +		return; /* vcpu migration can race with cpu offline */
> +	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
>  		per_cpu(current_vmcs, cpu) = NULL;
> -	list_del(&vmx->local_vcpus_link);
> -	vmx->vcpu.cpu = -1;
> -	vmx->launched = 0;
> +	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
> +	loaded_vmcs_init(loaded_vmcs);
>  }
> 
> -static void vcpu_clear(struct vcpu_vmx *vmx)
> +static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
>  {
> -	if (vmx->vcpu.cpu == -1)
> -		return;
> -	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
> +	if (loaded_vmcs->cpu != -1)
> +		smp_call_function_single(
> +			loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
>  }
> 
>  static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
> @@ -971,22 +998,22 @@ static void vmx_vcpu_load(struct kvm_vcp
> 
>  	if (!vmm_exclusive)
>  		kvm_cpu_vmxon(phys_addr);
> -	else if (vcpu->cpu != cpu)
> -		vcpu_clear(vmx);
> +	else if (vmx->loaded_vmcs->cpu != cpu)
> +		loaded_vmcs_clear(vmx->loaded_vmcs);
> 
> -	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
> -		per_cpu(current_vmcs, cpu) = vmx->vmcs;
> -		vmcs_load(vmx->vmcs);
> +	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
> +		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
> +		vmcs_load(vmx->loaded_vmcs->vmcs);
>  	}
> 
> -	if (vcpu->cpu != cpu) {
> +	if (vmx->loaded_vmcs->cpu != cpu) {
>  		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
>  		unsigned long sysenter_esp;
> 
>  		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
>  		local_irq_disable();
> -		list_add(&vmx->local_vcpus_link,
> -			 &per_cpu(vcpus_on_cpu, cpu));
> +		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
> +			 &per_cpu(loaded_vmcss_on_cpu, cpu));
>  		local_irq_enable();
> 
>  		/*
> @@ -998,6 +1025,7 @@ static void vmx_vcpu_load(struct kvm_vcp
> 
>  		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
>  		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
> +		vmx->loaded_vmcs->cpu = cpu;
>  	}
>  }
> 
> @@ -1005,7 +1033,8 @@ static void vmx_vcpu_put(struct kvm_vcpu
>  {
>  	__vmx_load_host_state(to_vmx(vcpu));
>  	if (!vmm_exclusive) {
> -		__vcpu_clear(to_vmx(vcpu));
> +		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
> +		vcpu->cpu = -1;
>  		kvm_cpu_vmxoff();
>  	}
>  }
> @@ -1469,7 +1498,7 @@ static int hardware_enable(void *garbage
>  	if (read_cr4() & X86_CR4_VMXE)
>  		return -EBUSY;
> 
> -	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
> +	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
>  	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
> 
>  	test_bits = FEATURE_CONTROL_LOCKED;
> @@ -1493,14 +1522,14 @@ static int hardware_enable(void *garbage
>  	return 0;
>  }
> 
> -static void vmclear_local_vcpus(void)
> +static void vmclear_local_loaded_vmcss(void)
>  {
>  	int cpu = raw_smp_processor_id();
> -	struct vcpu_vmx *vmx, *n;
> +	struct loaded_vmcs *v, *n;
> 
> -	list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
> -				 local_vcpus_link)
> -		__vcpu_clear(vmx);
> +	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
> +				 loaded_vmcss_on_cpu_link)
> +		__loaded_vmcs_clear(v);
>  }
> 
> 
> @@ -1515,7 +1544,7 @@ static void kvm_cpu_vmxoff(void)
>  static void hardware_disable(void *garbage)
>  {
>  	if (vmm_exclusive) {
> -		vmclear_local_vcpus();
> +		vmclear_local_loaded_vmcss();
>  		kvm_cpu_vmxoff();
>  	}
>  	write_cr4(read_cr4() & ~X86_CR4_VMXE);
> @@ -1696,6 +1725,18 @@ static void free_vmcs(struct vmcs *vmcs)
>  	free_pages((unsigned long)vmcs, vmcs_config.order);
>  }
> 
> +/*
> + * Free a VMCS, but before that VMCLEAR it on the CPU where it was last
> loaded
> + */
> +static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
> +{
> +	if (!loaded_vmcs->vmcs)
> +		return;
> +	loaded_vmcs_clear(loaded_vmcs);
> +	free_vmcs(loaded_vmcs->vmcs);
> +	loaded_vmcs->vmcs = NULL;
> +}
> +
>  static void free_kvm_area(void)
>  {
>  	int cpu;
> @@ -4166,6 +4207,7 @@ static void __noclone vmx_vcpu_run(struc
>  	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
>  		vmx_set_interrupt_shadow(vcpu, 0);
> 
> +	vmx->__launched = vmx->loaded_vmcs->launched;
>  	asm(
>  		/* Store host registers */
>  		"push %%"R"dx; push %%"R"bp;"
> @@ -4236,7 +4278,7 @@ static void __noclone vmx_vcpu_run(struc
>  		"pop  %%"R"bp; pop  %%"R"dx \n\t"
>  		"setbe %c[fail](%0) \n\t"
>  	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
> -		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
> +		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
>  		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
>  		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
>  		[rax]"i"(offsetof(struct vcpu_vmx,
> vcpu.arch.regs[VCPU_REGS_RAX])),
> @@ -4276,7 +4318,7 @@ static void __noclone vmx_vcpu_run(struc
>  	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> 
>  	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
> -	vmx->launched = 1;
> +	vmx->loaded_vmcs->launched = 1;
> 
>  	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
> 
> @@ -4288,41 +4330,17 @@ static void __noclone vmx_vcpu_run(struc
>  #undef R
>  #undef Q
> 
> -static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
> -{
> -	struct vcpu_vmx *vmx = to_vmx(vcpu);
> -
> -	if (vmx->vmcs) {
> -		vcpu_clear(vmx);
> -		free_vmcs(vmx->vmcs);
> -		vmx->vmcs = NULL;
> -	}
> -}
> -
>  static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> 
>  	free_vpid(vmx);
> -	vmx_free_vmcs(vcpu);
> +	free_loaded_vmcs(vmx->loaded_vmcs);
>  	kfree(vmx->guest_msrs);
>  	kvm_vcpu_uninit(vcpu);
>  	kmem_cache_free(kvm_vcpu_cache, vmx);
>  }
> 
> -static inline void vmcs_init(struct vmcs *vmcs)
> -{
> -	u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
> -
> -	if (!vmm_exclusive)
> -		kvm_cpu_vmxon(phys_addr);
> -
> -	vmcs_clear(vmcs);
> -
> -	if (!vmm_exclusive)
> -		kvm_cpu_vmxoff();
> -}
> -
>  static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
>  {
>  	int err;
> @@ -4344,11 +4362,15 @@ static struct kvm_vcpu *vmx_create_vcpu(
>  		goto uninit_vcpu;
>  	}
> 
> -	vmx->vmcs = alloc_vmcs();
> -	if (!vmx->vmcs)
> +	vmx->loaded_vmcs = &vmx->vmcs01;
> +	vmx->loaded_vmcs->vmcs = alloc_vmcs();
> +	if (!vmx->loaded_vmcs->vmcs)
>  		goto free_msrs;
> -
> -	vmcs_init(vmx->vmcs);
> +	if (!vmm_exclusive)
> +		kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
> +	loaded_vmcs_init(vmx->loaded_vmcs);
> +	if (!vmm_exclusive)
> +		kvm_cpu_vmxoff();
> 
>  	cpu = get_cpu();
>  	vmx_vcpu_load(&vmx->vcpu, cpu);
> @@ -4377,7 +4399,7 @@ static struct kvm_vcpu *vmx_create_vcpu(
>  	return &vmx->vcpu;
> 
>  free_vmcs:
> -	free_vmcs(vmx->vmcs);
> +	free_vmcs(vmx->loaded_vmcs->vmcs);
>  free_msrs:
>  	kfree(vmx->guest_msrs);
>  uninit_vcpu:
> 
> --
> Nadav Har'El                        |      Tuesday, May 24 2011, 20
> Iyyar 5771
> nyh@math.technion.ac.il             |-----------------------------------------
> Phone +972-523-790466, ICQ 13349191 |I am thinking about a new signature.
> Stay
> http://nadav.harel.org.il           |tuned.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity May 24, 2011, 1:14 p.m. UTC | #2
On 05/24/2011 03:26 PM, Nadav Har'El wrote:
> Hi Avi, here is a updated version of the loaded_vmcs patch which you asked me
> to send before the rest of the nvmx patches.
>
> Please let me know when you'd like me to send you an updated version of
> the rest of the patches.
>
> ----------------------------------------------------------------------------
> Subject: [PATCH 01/31] nVMX: Keep list of loaded VMCSs, instead of vcpus.
>
> In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it
> because (at least in theory) the processor might not have written all of its
> content back to memory. Since a patch from June 26, 2008, this is done using
> a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU.
>
> The problem is that with nested VMX, we no longer have the concept of a
> vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for
> L2s), and each of those may be have been last loaded on a different cpu.
>
> So instead of linking the vcpus, we link the VMCSs, using a new structure
> loaded_vmcs. This structure contains the VMCS, and the information pertaining
> to its loading on a specific cpu (namely, the cpu number, and whether it
> was already launched on this cpu once). In nested we will also use the same
> structure to hold L2 VMCSs, and vmx->loaded_vmcs is a pointer to the
> currently active VMCS.
>

Applied, thanks.

Do you have a list of unresolved issues?

If not, please repost the series.
Avi Kivity May 24, 2011, 1:57 p.m. UTC | #3
On 05/24/2011 04:14 PM, Avi Kivity wrote:
> On 05/24/2011 03:26 PM, Nadav Har'El wrote:
>> Hi Avi, here is a updated version of the loaded_vmcs patch which you 
>> asked me
>> to send before the rest of the nvmx patches.
>>
>> Please let me know when you'd like me to send you an updated version of
>> the rest of the patches.
>
> Do you have a list of unresolved issues?
>
> If not, please repost the series.
>

btw, since Kevin is now plowing through the patchset, please wait for 
the rest of his review.
Nadav Har'El May 24, 2011, 2:10 p.m. UTC | #4
On Tue, May 24, 2011, Avi Kivity wrote about "Re: [PATCH] new version of loaded_vmcs patch":
> btw, since Kevin is now plowing through the patchset, please wait for 
> the rest of his review.

Ok, I will, but note that I'll also be here after the merge, and will continue
to explain the code to Kevin and others - and to modify the code - afterwards.

With nvmx in the tree, other people will also be able to directly modify it
themselves - if they feel strongly about some issue that I don't.
Avi Kivity May 24, 2011, 2:28 p.m. UTC | #5
On 05/24/2011 05:10 PM, Nadav Har'El wrote:
> On Tue, May 24, 2011, Avi Kivity wrote about "Re: [PATCH] new version of loaded_vmcs patch":
> >  btw, since Kevin is now plowing through the patchset, please wait for
> >  the rest of his review.
>
> Ok, I will, but note that I'll also be here after the merge, and will continue
> to explain the code to Kevin and others - and to modify the code - afterwards.

Certainly.

> With nvmx in the tree, other people will also be able to directly modify it
> themselves - if they feel strongly about some issue that I don't.

Sure.  I'll still ask you to review those changes, of course.
Gleb Natapov May 24, 2011, 2:45 p.m. UTC | #6
On Tue, May 24, 2011 at 05:10:56PM +0300, Nadav Har'El wrote:
> On Tue, May 24, 2011, Avi Kivity wrote about "Re: [PATCH] new version of loaded_vmcs patch":
> > btw, since Kevin is now plowing through the patchset, please wait for 
> > the rest of his review.
> 
> Ok, I will, but note that I'll also be here after the merge, and will continue
> to explain the code to Kevin and others - and to modify the code - afterwards.
> 
> With nvmx in the tree, other people will also be able to directly modify it
> themselves - if they feel strongly about some issue that I don't.
> 
Like event handling? 

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tian, Kevin May 25, 2011, 10:34 a.m. UTC | #7
> From: Avi Kivity
> Sent: Tuesday, May 24, 2011 9:57 PM
> 
> On 05/24/2011 04:14 PM, Avi Kivity wrote:
> > On 05/24/2011 03:26 PM, Nadav Har'El wrote:
> >> Hi Avi, here is a updated version of the loaded_vmcs patch which you
> >> asked me
> >> to send before the rest of the nvmx patches.
> >>
> >> Please let me know when you'd like me to send you an updated version of
> >> the rest of the patches.
> >
> > Do you have a list of unresolved issues?
> >
> > If not, please repost the series.
> >
> 
> btw, since Kevin is now plowing through the patchset, please wait for
> the rest of his review.
> 

I have finished my current round of review of nVMX-v10 with all comments
sent out.

Thanks
Kevin
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

--- .before/arch/x86/kvm/vmx.c	2011-05-24 15:12:22.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c	2011-05-24 15:12:22.000000000 +0300
@@ -116,6 +116,18 @@  struct vmcs {
 	char data[0];
 };
 
+/*
+ * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
+ * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
+ * loaded on this CPU (so we can clear them if the CPU goes down).
+ */
+struct loaded_vmcs {
+	struct vmcs *vmcs;
+	int cpu;
+	int launched;
+	struct list_head loaded_vmcss_on_cpu_link;
+};
+
 struct shared_msr_entry {
 	unsigned index;
 	u64 data;
@@ -124,9 +136,7 @@  struct shared_msr_entry {
 
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
-	struct list_head      local_vcpus_link;
 	unsigned long         host_rsp;
-	int                   launched;
 	u8                    fail;
 	u8                    cpl;
 	bool                  nmi_known_unmasked;
@@ -140,7 +150,14 @@  struct vcpu_vmx {
 	u64 		      msr_host_kernel_gs_base;
 	u64 		      msr_guest_kernel_gs_base;
 #endif
-	struct vmcs          *vmcs;
+	/*
+	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+	 * non-nested (L1) guest, it always points to vmcs01. For a nested
+	 * guest (L2), it points to a different VMCS.
+	 */
+	struct loaded_vmcs    vmcs01;
+	struct loaded_vmcs   *loaded_vmcs;
+	bool                  __launched; /* temporary, used in vmx_vcpu_run */
 	struct msr_autoload {
 		unsigned nr;
 		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
@@ -200,7 +217,11 @@  static int vmx_set_tss_addr(struct kvm *
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+/*
+ * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
+ * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ */
+static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
 static unsigned long *vmx_io_bitmap_a;
@@ -501,6 +522,13 @@  static void vmcs_clear(struct vmcs *vmcs
 		       vmcs, phys_addr);
 }
 
+static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
+{
+	vmcs_clear(loaded_vmcs->vmcs);
+	loaded_vmcs->cpu = -1;
+	loaded_vmcs->launched = 0;
+}
+
 static void vmcs_load(struct vmcs *vmcs)
 {
 	u64 phys_addr = __pa(vmcs);
@@ -514,25 +542,24 @@  static void vmcs_load(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 }
 
-static void __vcpu_clear(void *arg)
+static void __loaded_vmcs_clear(void *arg)
 {
-	struct vcpu_vmx *vmx = arg;
+	struct loaded_vmcs *loaded_vmcs = arg;
 	int cpu = raw_smp_processor_id();
 
-	if (vmx->vcpu.cpu == cpu)
-		vmcs_clear(vmx->vmcs);
-	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
+	if (loaded_vmcs->cpu != cpu)
+		return; /* vcpu migration can race with cpu offline */
+	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
 		per_cpu(current_vmcs, cpu) = NULL;
-	list_del(&vmx->local_vcpus_link);
-	vmx->vcpu.cpu = -1;
-	vmx->launched = 0;
+	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+	loaded_vmcs_init(loaded_vmcs);
 }
 
-static void vcpu_clear(struct vcpu_vmx *vmx)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 {
-	if (vmx->vcpu.cpu == -1)
-		return;
-	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
+	if (loaded_vmcs->cpu != -1)
+		smp_call_function_single(
+			loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
 }
 
 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -971,22 +998,22 @@  static void vmx_vcpu_load(struct kvm_vcp
 
 	if (!vmm_exclusive)
 		kvm_cpu_vmxon(phys_addr);
-	else if (vcpu->cpu != cpu)
-		vcpu_clear(vmx);
+	else if (vmx->loaded_vmcs->cpu != cpu)
+		loaded_vmcs_clear(vmx->loaded_vmcs);
 
-	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-		per_cpu(current_vmcs, cpu) = vmx->vmcs;
-		vmcs_load(vmx->vmcs);
+	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+		vmcs_load(vmx->loaded_vmcs->vmcs);
 	}
 
-	if (vcpu->cpu != cpu) {
+	if (vmx->loaded_vmcs->cpu != cpu) {
 		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 		unsigned long sysenter_esp;
 
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		local_irq_disable();
-		list_add(&vmx->local_vcpus_link,
-			 &per_cpu(vcpus_on_cpu, cpu));
+		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
+			 &per_cpu(loaded_vmcss_on_cpu, cpu));
 		local_irq_enable();
 
 		/*
@@ -998,6 +1025,7 @@  static void vmx_vcpu_load(struct kvm_vcp
 
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+		vmx->loaded_vmcs->cpu = cpu;
 	}
 }
 
@@ -1005,7 +1033,8 @@  static void vmx_vcpu_put(struct kvm_vcpu
 {
 	__vmx_load_host_state(to_vmx(vcpu));
 	if (!vmm_exclusive) {
-		__vcpu_clear(to_vmx(vcpu));
+		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
+		vcpu->cpu = -1;
 		kvm_cpu_vmxoff();
 	}
 }
@@ -1469,7 +1498,7 @@  static int hardware_enable(void *garbage
 	if (read_cr4() & X86_CR4_VMXE)
 		return -EBUSY;
 
-	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
+	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
 	test_bits = FEATURE_CONTROL_LOCKED;
@@ -1493,14 +1522,14 @@  static int hardware_enable(void *garbage
 	return 0;
 }
 
-static void vmclear_local_vcpus(void)
+static void vmclear_local_loaded_vmcss(void)
 {
 	int cpu = raw_smp_processor_id();
-	struct vcpu_vmx *vmx, *n;
+	struct loaded_vmcs *v, *n;
 
-	list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
-				 local_vcpus_link)
-		__vcpu_clear(vmx);
+	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+				 loaded_vmcss_on_cpu_link)
+		__loaded_vmcs_clear(v);
 }
 
 
@@ -1515,7 +1544,7 @@  static void kvm_cpu_vmxoff(void)
 static void hardware_disable(void *garbage)
 {
 	if (vmm_exclusive) {
-		vmclear_local_vcpus();
+		vmclear_local_loaded_vmcss();
 		kvm_cpu_vmxoff();
 	}
 	write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -1696,6 +1725,18 @@  static void free_vmcs(struct vmcs *vmcs)
 	free_pages((unsigned long)vmcs, vmcs_config.order);
 }
 
+/*
+ * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
+ */
+static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+	if (!loaded_vmcs->vmcs)
+		return;
+	loaded_vmcs_clear(loaded_vmcs);
+	free_vmcs(loaded_vmcs->vmcs);
+	loaded_vmcs->vmcs = NULL;
+}
+
 static void free_kvm_area(void)
 {
 	int cpu;
@@ -4166,6 +4207,7 @@  static void __noclone vmx_vcpu_run(struc
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 
+	vmx->__launched = vmx->loaded_vmcs->launched;
 	asm(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
@@ -4236,7 +4278,7 @@  static void __noclone vmx_vcpu_run(struc
 		"pop  %%"R"bp; pop  %%"R"dx \n\t"
 		"setbe %c[fail](%0) \n\t"
 	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
-		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
+		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
 		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
 		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
 		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
@@ -4276,7 +4318,7 @@  static void __noclone vmx_vcpu_run(struc
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
-	vmx->launched = 1;
+	vmx->loaded_vmcs->launched = 1;
 
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
@@ -4288,41 +4330,17 @@  static void __noclone vmx_vcpu_run(struc
 #undef R
 #undef Q
 
-static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (vmx->vmcs) {
-		vcpu_clear(vmx);
-		free_vmcs(vmx->vmcs);
-		vmx->vmcs = NULL;
-	}
-}
-
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	free_vpid(vmx);
-	vmx_free_vmcs(vcpu);
+	free_loaded_vmcs(vmx->loaded_vmcs);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 
-static inline void vmcs_init(struct vmcs *vmcs)
-{
-	u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
-
-	if (!vmm_exclusive)
-		kvm_cpu_vmxon(phys_addr);
-
-	vmcs_clear(vmcs);
-
-	if (!vmm_exclusive)
-		kvm_cpu_vmxoff();
-}
-
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 	int err;
@@ -4344,11 +4362,15 @@  static struct kvm_vcpu *vmx_create_vcpu(
 		goto uninit_vcpu;
 	}
 
-	vmx->vmcs = alloc_vmcs();
-	if (!vmx->vmcs)
+	vmx->loaded_vmcs = &vmx->vmcs01;
+	vmx->loaded_vmcs->vmcs = alloc_vmcs();
+	if (!vmx->loaded_vmcs->vmcs)
 		goto free_msrs;
-
-	vmcs_init(vmx->vmcs);
+	if (!vmm_exclusive)
+		kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
+	loaded_vmcs_init(vmx->loaded_vmcs);
+	if (!vmm_exclusive)
+		kvm_cpu_vmxoff();
 
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4377,7 +4399,7 @@  static struct kvm_vcpu *vmx_create_vcpu(
 	return &vmx->vcpu;
 
 free_vmcs:
-	free_vmcs(vmx->vmcs);
+	free_vmcs(vmx->loaded_vmcs->vmcs);
 free_msrs:
 	kfree(vmx->guest_msrs);
 uninit_vcpu: