
[08/31] nVMX: Fix local_vcpus_link handling

Message ID 201105161948.p4GJm1as001742@rice.haifa.ibm.com (mailing list archive)
State New, archived

Commit Message

Nadav Har'El May 16, 2011, 7:48 p.m. UTC
In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it
because (at least in theory) the processor might not have written all of its
content back to memory. Since a patch from June 26, 2008, this is done using
a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU.

The problem is that with nested VMX, we no longer have the concept of a
vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for
L2s), and each of those may have been last loaded on a different cpu.

Our solution is to hold, in addition to vcpus_on_cpu, a second linked list
saved_vmcss_on_cpu, which holds the current list of "saved" VMCSs, i.e.,
VMCSs which are loaded on this CPU but are not the vmx->vmcs of any of
the vcpus. These saved VMCSs include L1's VMCS while L2 is running
(saved_vmcs01), and L2 VMCSs not currently used - because L1 is running or
because the vmcs02_pool contains more than one entry.

When we switch between L1's and L2's VMCSs, they need to be moved
between the vcpus_on_cpu and saved_vmcss_on_cpu lists, in one direction or the other. A new
function, nested_maintain_per_cpu_lists(), takes care of that.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
---
 arch/x86/kvm/vmx.c |   67 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)


Comments

Marcelo Tosatti May 17, 2011, 1:19 p.m. UTC | #1
On Mon, May 16, 2011 at 10:48:01PM +0300, Nadav Har'El wrote:
> In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it
> because (at least in theory) the processor might not have written all of its
> content back to memory. Since a patch from June 26, 2008, this is done using
> a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU.
> 
> The problem is that with nested VMX, we no longer have the concept of a
> vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for
> L2s), and each of those may have been last loaded on a different cpu.
> 
> Our solution is to hold, in addition to vcpus_on_cpu, a second linked list
> saved_vmcss_on_cpu, which holds the current list of "saved" VMCSs, i.e.,
> VMCSs which are loaded on this CPU but are not the vmx->vmcs of any of
> the vcpus. These saved VMCSs include L1's VMCS while L2 is running
> (saved_vmcs01), and L2 VMCSs not currently used - because L1 is running or
> because the vmcs02_pool contains more than one entry.
> 
> When we switch between L1's and L2's VMCSs, they need to be moved
> between the vcpus_on_cpu and saved_vmcss_on_cpu lists, in one direction or the other. A new
> function, nested_maintain_per_cpu_lists(), takes care of that.
> 
> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> ---
>  arch/x86/kvm/vmx.c |   67 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 67 insertions(+)
> 
> --- .before/arch/x86/kvm/vmx.c	2011-05-16 22:36:47.000000000 +0300
> +++ .after/arch/x86/kvm/vmx.c	2011-05-16 22:36:47.000000000 +0300
> @@ -181,6 +181,7 @@ struct saved_vmcs {
>  	struct vmcs *vmcs;
>  	int cpu;
>  	int launched;
> +	struct list_head local_saved_vmcss_link; /* see saved_vmcss_on_cpu */
>  };
>  
>  /* Used to remember the last vmcs02 used for some recently used vmcs12s */
> @@ -315,7 +316,20 @@ static int vmx_set_tss_addr(struct kvm *
>  
>  static DEFINE_PER_CPU(struct vmcs *, vmxarea);
>  static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
> +/*
> + * We maintain a per-CPU linked-list vcpus_on_cpu, holding for each CPU a list
> + * of vcpus whose VMCSs are loaded on that CPU. This is needed when a CPU is
> + * brought down, and we need to VMCLEAR all VMCSs loaded on it.
> + *
> + * With nested VMX, we have additional VMCSs which are not the current
> + * vmx->vmcs of any vcpu, but may also be loaded on some CPU: While L2 is
> + * running, L1's VMCS is loaded but not the VMCS of any vcpu; While L1 is
> + * running, a previously used L2 VMCS might still be around and loaded on some
> + * CPU; sometimes more than one such L2 VMCS is kept (see VMCS02_POOL_SIZE).
> + * These additional VMCSs are kept on the per-CPU list saved_vmcss_on_cpu.
> + */
>  static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
> +static DEFINE_PER_CPU(struct list_head, saved_vmcss_on_cpu);
>  static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
>  
>  static unsigned long *vmx_io_bitmap_a;
> @@ -1818,6 +1832,7 @@ static int hardware_enable(void *garbage
>  		return -EBUSY;
>  
>  	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
> +	INIT_LIST_HEAD(&per_cpu(saved_vmcss_on_cpu, cpu));
>  	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
>  
>  	test_bits = FEATURE_CONTROL_LOCKED;
> @@ -1860,10 +1875,13 @@ static void kvm_cpu_vmxoff(void)
>  	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
>  }
>  
> +static void vmclear_local_saved_vmcss(void);
> +
>  static void hardware_disable(void *garbage)
>  {
>  	if (vmm_exclusive) {
>  		vmclear_local_vcpus();
> +		vmclear_local_saved_vmcss();
>  		kvm_cpu_vmxoff();
>  	}
>  	write_cr4(read_cr4() & ~X86_CR4_VMXE);
> @@ -4248,6 +4266,8 @@ static void __nested_free_saved_vmcs(voi
>  	vmcs_clear(saved_vmcs->vmcs);
>  	if (per_cpu(current_vmcs, saved_vmcs->cpu) == saved_vmcs->vmcs)
>  		per_cpu(current_vmcs, saved_vmcs->cpu) = NULL;
> +	list_del(&saved_vmcs->local_saved_vmcss_link);
> +	saved_vmcs->cpu = -1;
>  }
>  
>  /*
> @@ -4265,6 +4285,21 @@ static void nested_free_saved_vmcs(struc
>  	free_vmcs(saved_vmcs->vmcs);
>  }
>  
> +/*
> + * VMCLEAR all the currently unused (not vmx->vmcs on any vcpu) saved_vmcss
> + * which were loaded on the current CPU. See also vmclear_local_vcpus(), which
> + * does the same for the VMCSs currently used by vcpus.
> + */
> +static void vmclear_local_saved_vmcss(void)
> +{
> +	int cpu = raw_smp_processor_id();
> +	struct saved_vmcs *v, *n;
> +
> +	list_for_each_entry_safe(v, n, &per_cpu(saved_vmcss_on_cpu, cpu),
> +				 local_saved_vmcss_link)
> +		__nested_free_saved_vmcs(v);
> +}
> +
>  /* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
>  static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
>  {
> @@ -5143,6 +5178,38 @@ static void vmx_set_supported_cpuid(u32 
>  {
>  }
>  
> +/*
> + * Maintain the vcpus_on_cpu and saved_vmcss_on_cpu lists of vcpus and
> + * inactive saved_vmcss on nested entry (L1->L2) or nested exit (L2->L1).
> + *
> + * nested_maintain_per_cpu_lists should be called after the VMCS was switched
> + * to the new one, with parameters giving both the new one (after the entry
> + * or exit) and the old one, in that order.
> + */
> +static void nested_maintain_per_cpu_lists(struct vcpu_vmx *vmx,
> +		struct saved_vmcs *new_vmcs,
> +		struct saved_vmcs *old_vmcs)
> +{
> +	/*
> +	 * When a vcpu's old vmcs is saved, we need to drop it from
> +	 * vcpus_on_cpu and put it on saved_vmcss_on_cpu.
> +	 */
> +	if (old_vmcs->cpu != -1) {
> +		list_del(&vmx->local_vcpus_link);
> +		list_add(&old_vmcs->local_saved_vmcss_link,
> +			 &per_cpu(saved_vmcss_on_cpu, old_vmcs->cpu));
> +	}

This new handling of vmcs could be simplified (local_vcpus_link must be
manipulated with interrupts disabled, BTW).

What about having a per-CPU VMCS list instead of per-CPU vcpu list?
"local_vmcs_link" list node could be in "struct saved_vmcs" (and 
a current_saved_vmcs pointer in "struct vcpu_vmx").

vmx_vcpu_load would then add to this list at

        if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
                per_cpu(current_vmcs, cpu) = vmx->vmcs;
                vmcs_load(vmx->vmcs);
        }
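For illustration, a minimal sketch of what that could look like, assuming the
"current_saved_vmcs" pointer, "local_vmcs_link" node and a per-CPU
"vmcss_on_cpu" list suggested above (none of these names are from the posted
patch, and the real vmx_vcpu_load() has more to do):

static DEFINE_PER_CPU(struct list_head, vmcss_on_cpu);

/*
 * Sketch: when a VMCS becomes resident on this CPU, link it on the per-CPU
 * list so that hardware_disable() can VMCLEAR everything with a single walk.
 * The list is only touched with interrupts disabled.
 */
static void vmx_vcpu_load_sketch(struct vcpu_vmx *vmx, int cpu)
{
	unsigned long flags;

	if (vmx->current_saved_vmcs->cpu != cpu) {
		/* assumes the VMCS was already VMCLEARed on its old CPU */
		local_irq_save(flags);
		list_add(&vmx->current_saved_vmcs->local_vmcs_link,
			 &per_cpu(vmcss_on_cpu, cpu));
		local_irq_restore(flags);
		vmx->current_saved_vmcs->cpu = cpu;
	}
	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->vmcs;
		vmcs_load(vmx->vmcs);
	}
}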
Avi Kivity May 17, 2011, 1:35 p.m. UTC | #2
On 05/17/2011 04:19 PM, Marcelo Tosatti wrote:
> >  +/*
> >  + * Maintain the vcpus_on_cpu and saved_vmcss_on_cpu lists of vcpus and
> >  + * inactive saved_vmcss on nested entry (L1->L2) or nested exit (L2->L1).
> >  + *
> >  + * nested_maintain_per_cpu_lists should be called after the VMCS was switched
> >  + * to the new one, with parameters giving both the new one (after the entry
> >  + * or exit) and the old one, in that order.
> >  + */
> >  +static void nested_maintain_per_cpu_lists(struct vcpu_vmx *vmx,
> >  +		struct saved_vmcs *new_vmcs,
> >  +		struct saved_vmcs *old_vmcs)
> >  +{
> >  +	/*
> >  +	 * When a vcpu's old vmcs is saved, we need to drop it from
> >  +	 * vcpus_on_cpu and put it on saved_vmcss_on_cpu.
> >  +	 */
> >  +	if (old_vmcs->cpu != -1) {
> >  +		list_del(&vmx->local_vcpus_link);
> >  +		list_add(&old_vmcs->local_saved_vmcss_link,
> >  +			&per_cpu(saved_vmcss_on_cpu, old_vmcs->cpu));
> >  +	}
>
> This new handling of vmcs could be simplified (local_vcpus_link must be
> manipulated with interrupts disabled, BTW).
>
> What about having a per-CPU VMCS list instead of per-CPU vcpu list?
> "local_vmcs_link" list node could be in "struct saved_vmcs" (and
> a current_saved_vmcs pointer in "struct vcpu_vmx").
>
> vmx_vcpu_load would then add to this list at
>
>          if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
>                  per_cpu(current_vmcs, cpu) = vmx->vmcs;
>                  vmcs_load(vmx->vmcs);
>          }

Right, that's the easiest thing to do.

Perhaps even easier (avoids duplication):

struct raw_vmcs {
     u32 revision_id;
     u32 abort;
     char data[0];
};

struct vmcs {
     struct raw_vmcs *raw_vmcs;
     struct list_head local_vmcs_link;
};

struct vcpu_vmx {
     ...
     struct vmcs *vmcs;  /* often points at l1_vmcs */
     struct vmcs l1_vmcs;
     ...
};

static DEFINE_PER_CPU(struct list_head, vmcss_on_cpu);
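Building on these structs, bringing a CPU down could then reduce to one walk
of the per-CPU list. A rough sketch only, assuming vmcs_clear() is adjusted to
take a struct raw_vmcs *, and that ->cpu and ->launched are moved into the new
struct vmcs (as comes up later in the thread):

static void vmclear_local_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct vmcs *v, *n;

	list_for_each_entry_safe(v, n, &per_cpu(vmcss_on_cpu, cpu),
				 local_vmcs_link) {
		vmcs_clear(v->raw_vmcs);	/* VMCLEAR the hardware VMCS */
		list_del(&v->local_vmcs_link);
		v->cpu = -1;
		v->launched = 0;
	}
	per_cpu(current_vmcs, cpu) = NULL;
}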
Nadav Har'El May 17, 2011, 2:35 p.m. UTC | #3
On Tue, May 17, 2011, Avi Kivity wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> >(local_vcpus_link must be >manipulated with interrupts disabled, BTW).

Thanks, I'll look into that.

> >What about having a per-CPU VMCS list instead of per-CPU vcpu list?
> Perhaps even easier (avoids duplication):
> 
> struct raw_vmcs {
>     u32 revision_id;
>     u32 abort;
>     char data[0];
> };
> 
> struct vmcs {
>     struct raw_vmcs *raw_vmcs;
>     struct list_head local_vmcs_link;
> };
> 
> struct vcpu_vmx {
>     ...
>     struct vmcs *vmcs;  /* often points at l1_vmcs */
>     struct vmcs l1_vmcs;
>     ...
> };
> 
> static DEFINE_PER_CPU(struct list_head, vmcss_on_cpu);

This is an interesting suggestion. My initial plan was to do something similar
to this, and I agree it could have been nicer code, but I had to change it
after bumping into too many obstacles.

For example, currently, vmclear_local_vcpus() not only VMCLEARs the VMCSs,
it also sets vmx->vcpu.cpu = -1, vmx->launched=0 for the vcpus holding these
VMCSs.  If we had only a list of VMCSs, how can we mark the vcpus as being not
currently loaded (cpu=-1)?
Marcelo Tosatti May 17, 2011, 2:42 p.m. UTC | #4
On Tue, May 17, 2011 at 05:35:32PM +0300, Nadav Har'El wrote:
> On Tue, May 17, 2011, Avi Kivity wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> > static DEFINE_PER_CPU(struct list_head, vmcss_on_cpu);
> 
> This is an interesting suggestion. My initial plan was to do something similar
> to this, and I agree it could have been nicer code, but I had to change it
> after bumping into too many obstacles.
> 
> For example, currently, vmclear_local_vcpus() not only VMCLEARs the vmcss,
> it also sets vmx->vcpu.cpu = -1, vmx->launched=0 for the vcpus holding these
> VMCSs.  If we had only a list of VMCSs, how can we mark the vcpus as being not
> currently loaded (cpu=-1)?

Do it in vcpu_clear; it's just an optimization that is not necessary in the
vmclear_local_vcpus path.


Avi Kivity May 17, 2011, 3:11 p.m. UTC | #5
On 05/17/2011 05:35 PM, Nadav Har'El wrote:
>
>>> What about having a per-CPU VMCS list instead of per-CPU vcpu list?
>> Perhaps even easier (avoids duplication):
>>
>> struct raw_vmcs {
>>      u32 revision_id;
>>      u32 abort;
>>      char data[0];
>> };
>>
>> struct vmcs {
>>      struct raw_vmcs *raw_vmcs;
>>      struct list_head local_vmcs_link;
>> };
>>
>> struct vcpu_vmx {
>>      ...
>>      struct vmcs *vmcs;  /* often points at l1_vmcs */
>>      struct vmcs l1_vmcs;
>>      ...
>> };
>>
>> static DEFINE_PER_CPU(struct list_head, vmcss_on_cpu);
> This is an interesting suggestion. My initial plan was to do something similar
> to this, and I agree it could have been nicer code, but I had to change it
> after bumping into too many obstacles.
>
> For example, currently, vmclear_local_vcpus() not only VMCLEARs the vmcss,
> it also sets vmx->vcpu.cpu = -1, vmx->launched=0 for the vcpus holding these
> VMCSs.  If we had only a list of VMCSs, how can we mark the vcpus as being not
> currently loaded (cpu=-1)?
>

->launched and ->cpu simply move into struct vmcs.

Nadav Har'El May 17, 2011, 5:57 p.m. UTC | #6
On Tue, May 17, 2011, Marcelo Tosatti wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> > For example, currently, vmclear_local_vcpus() not only VMCLEARs the vmcss,
> > it also sets vmx->vcpu.cpu = -1, vmx->launched=0 for the vcpus holding these
> > VMCSs.  If we had only a list of VMCSs, how can we mark the vcpus as being not
> > currently loaded (cpu=-1)?
> 
> Do it in vcpu_clear; it's just an optimization that is not necessary in the
> vmclear_local_vcpus path.

Well, what if (say) we're running L2, and L1's vmcs is saved in saved_vmcs01
and is not the current vmcs of the vcpu, and then we shut down the CPU on
which this saved_vmcs01 was loaded. We not only need to VMCLEAR this vmcs,
we also need to remember that this vmcs is no longer loaded, so that when we
do a nested vmexit back to L1, we know we need to load the vmcs again.

There's a solution to this (which Avi also mentioned in his email) - it is to
use everywhere my "saved_vmcs" type (which I'd rename "loaded vmcs"), which
includes the vmcs *and* the cpu (and possibly "launched").
If the "cpu" field was part of vmx, this was easy - but "cpu" is a field of
vcpu, not vmx, so I have problems encapsulating both "vmcs" and "cpu" in
one structure everywhere.

These are the kinds of problems I wrestled with, until I gave up and
came up with the current solution...
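Concretely, the encapsulation described here could look roughly like this (a
sketch only; the names are illustrative and this is not the structure from the
posted patches):

struct loaded_vmcs {
	struct vmcs *vmcs;
	int cpu;		/* CPU this VMCS was last loaded on, or -1 */
	int launched;
	struct list_head loaded_vmcss_on_cpu_link;
};

struct vcpu_vmx {
	...
	struct loaded_vmcs *loaded_vmcs;	/* points at vmcs01 or a vmcs02 */
	struct loaded_vmcs vmcs01;
	...
};

The per-CPU list would then hold struct loaded_vmcs entries, and vcpu.cpu
would only track where the vcpu's task last ran.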
Nadav Har'El May 17, 2011, 6:11 p.m. UTC | #7
On Tue, May 17, 2011, Avi Kivity wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> >VMCSs.  If we had only a list of VMCSs, how can we mark the vcpus as being 
> >not
> >currently loaded (cpu=-1)?
> >
> 
> ->launched and ->cpu simply move into struct vmcs.

As I explained in the sister thread (this discussion is becoming a tree ;-))
this is what I planned to do, until it dawned on me that I can't, because "cpu"
isn't part of vmx (where the vmcs and launched sit in the standard KVM), but
rather part of vcpu... When I gave up trying to "solve" these interdependencies
while avoiding modifying half of KVM, I came up with the current solution.

Maybe I'm missing something - I'd be happy if we do find a solution that
simplifies this code.
Marcelo Tosatti May 17, 2011, 6:43 p.m. UTC | #8
On Tue, May 17, 2011 at 09:11:32PM +0300, Nadav Har'El wrote:
> On Tue, May 17, 2011, Avi Kivity wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> > >VMCSs.  If we had only a list of VMCSs, how can we mark the vcpus as being 
> > >not
> > >currently loaded (cpu=-1)?
> > >
> > 
> > ->launched and ->cpu simply move into struct vmcs.
> 
> As I explained in the sister thread (this discussion is becoming a tree ;-))
> this is what I planned to do, until it dawned on me that I can't, because "cpu"
> isn't part of vmx (where the vmcs and launched sit in the standard KVM), but
> rather part of vcpu... When I gave up trying to "solve" these interdependencies
> while avoiding modifying half of KVM, I came up with the current solution.
> 
> Maybe I'm missing something - I'd be happy if we do find a solution that
> simplifies this code.

vcpu->cpu remains there. There is a new ->cpu field on struct vmcs, just
as saved_vmcs has in the current patches, to note the cpu on which the VMCS
was last loaded.

As mentioned, there is no need to set "vcpu->cpu = -1" in __vcpu_clear
(the IPI handler); that can be done in vcpu_clear.
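For example, a rough sketch of that split, based on the vcpu_clear()/
__vcpu_clear() pair in vmx.c at the time (details may differ):

static void vcpu_clear(struct vcpu_vmx *vmx)
{
	if (vmx->vcpu.cpu == -1)
		return;
	/* __vcpu_clear() runs on the VMCS's CPU: VMCLEAR and list removal */
	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
	/* bookkeeping moved here, out of the IPI handler */
	vmx->vcpu.cpu = -1;
	vmx->launched = 0;
}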

Nadav Har'El May 17, 2011, 7:30 p.m. UTC | #9
On Tue, May 17, 2011, Marcelo Tosatti wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> > this is what I planned to do, until it dawned on me that I can't, because "cpu"
> > isn't part of vmx (where the vmcs and launched sit in the standard KVM), but
>...
> vcpu->cpu remains there. There is a new ->cpu field on struct vmcs, just
> as saved_vmcs has in the current patches, to note the cpu which the VMCS 
> was last loaded.

So we'll have two fields, vmx.vcpu.cpu and vmx.vmcs.cpu, which are supposed
to always contain the same value. Are you fine with that?

> As mentioned there is no need to set "vcpu->cpu = -1" in __vcpu_clear,
> the IPI handler, that can be done in vcpu_clear.

Right, this is true.
Marcelo Tosatti May 17, 2011, 7:52 p.m. UTC | #10
On Tue, May 17, 2011 at 10:30:30PM +0300, Nadav Har'El wrote:
> On Tue, May 17, 2011, Marcelo Tosatti wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> > > this is what I planned to do, until it dawned on me that I can't, because "cpu"
> > > isn't part of vmx (where the vmcs and launched sit in the standard KVM), but
> >...
> > vcpu->cpu remains there. There is a new ->cpu field on struct vmcs, just
> > as saved_vmcs has in the current patches, to note the cpu which the VMCS 
> > was last loaded.
> 
> So we'll have two fields, vmx.vcpu.cpu and vmx.vmcs.cpu, which are supposed
> to always contain the same value. Are you fine with that?

Yes. Avi?

Avi Kivity May 18, 2011, 8:29 a.m. UTC | #11
On 05/17/2011 10:52 PM, Marcelo Tosatti wrote:
> On Tue, May 17, 2011 at 10:30:30PM +0300, Nadav Har'El wrote:
> >  On Tue, May 17, 2011, Marcelo Tosatti wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> >  >  >  this is what I planned to do, until it dawned on me that I can't, because "cpu"
> >  >  >  isn't part of vmx (where the vmcs and launched sit in the standard KVM), but
> >  >...
> >  >  vcpu->cpu remains there. There is a new ->cpu field on struct vmcs, just
> >  >  as saved_vmcs has in the current patches, to note the cpu which the VMCS
> >  >  was last loaded.
> >
> >  So we'll have two fields, vmx.vcpu.cpu and vmx.vmcs.cpu, which are supposed
> >  to always contain the same value. Are you fine with that?
>
> Yes. Avi?

Yes.

They have different meanings.  vcpu->cpu means where the task that runs 
the vcpu is running (or last ran).  vmcs->cpu means which cpu has the 
vmcs cached.  They need not be the same when we have multiple vmcs's for 
a vcpu; but vmx->vmcs->cpu will chase vcpu->cpu as it changes.

Please post this patch separately instead of reposting the entire 
series; we can apply it independently.
Avi Kivity May 18, 2011, 8:31 a.m. UTC | #12
On 05/18/2011 08:52 AM, Nadav Har'El wrote:
> On Tue, May 17, 2011, Marcelo Tosatti wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> >  On Tue, May 17, 2011 at 10:30:30PM +0300, Nadav Har'El wrote:
> >  >  So we'll have two fields, vmx.vcpu.cpu and vmx.vmcs.cpu, which are supposed
> >  >  to always contain the same value. Are you fine with that?
> >
> >  Yes. Avi?
>
> Oops, it's even worse than I said, because if the new vmclear_local_vmcss
> clears the vmcs currently used on some vcpu, it will update vmcs.cpu on that
> vcpu to -1, but will *not* update vmx.vcpu.cpu, which remains at its old value,
> potentially causing problems when it is used (e.g., in x86.c) instead
> of vmx.vmcs.cpu.
>

I did a quick audit and it seems fine.  If it isn't, we'll fix it when 
we see the problem.
Nadav Har'El May 18, 2011, 9:02 a.m. UTC | #13
On Wed, May 18, 2011, Avi Kivity wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> I did a quick audit and it seems fine.  If it isn't, we'll fix it when 
> we see the problem.

Ok, then, I'm working on the code with the new approach.

My fear was that some CPU 7 is taken down, but vcpu.cpu remains 7 (not set to
-1). If cpu 7 never comes up again, it's not a problem because when we run
the same vcpu again on a different cpu, it's not 7 so we do what needs to be
done on CPU switch. But, what if CPU 7 does come up again later, and we find
ourselves running again on CPU 7, but it's not the same CPU 7 and we don't
know it? Is this case at all possible?
Avi Kivity May 18, 2011, 9:16 a.m. UTC | #14
On 05/18/2011 12:02 PM, Nadav Har'El wrote:
> On Wed, May 18, 2011, Avi Kivity wrote about "Re: [PATCH 08/31] nVMX: Fix local_vcpus_link handling":
> >  I did a quick audit and it seems fine.  If it isn't, we'll fix it when
> >  we see the problem.
>
> Ok, then, I'm working on the code with the new approach.
>
> My fear was that some CPU 7 is taken down, but vcpu.cpu remains 7 (not set to
> -1). If cpu 7 never comes up again, it's not a problem because when we run
> the same vcpu again on a different cpu, it's not 7 so we do what needs to be
> done on CPU switch. But, what if CPU 7 does come up again later, and we find
> ourselves running again on CPU 7, but it's not the same CPU 7 and we don't
> know it? Is this case at all possible?

It's certainly possible, but it's independent of this patch.

It's even handled, see kvm_arch_hardware_enable().

Patch

--- .before/arch/x86/kvm/vmx.c	2011-05-16 22:36:47.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c	2011-05-16 22:36:47.000000000 +0300
@@ -181,6 +181,7 @@  struct saved_vmcs {
 	struct vmcs *vmcs;
 	int cpu;
 	int launched;
+	struct list_head local_saved_vmcss_link; /* see saved_vmcss_on_cpu */
 };
 
 /* Used to remember the last vmcs02 used for some recently used vmcs12s */
@@ -315,7 +316,20 @@  static int vmx_set_tss_addr(struct kvm *
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+/*
+ * We maintain a per-CPU linked-list vcpus_on_cpu, holding for each CPU a list
+ * of vcpus whose VMCSs are loaded on that CPU. This is needed when a CPU is
+ * brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ *
+ * With nested VMX, we have additional VMCSs which are not the current
+ * vmx->vmcs of any vcpu, but may also be loaded on some CPU: While L2 is
+ * running, L1's VMCS is loaded but not the VMCS of any vcpu; While L1 is
+ * running, a previously used L2 VMCS might still be around and loaded on some
+ * CPU; sometimes more than one such L2 VMCS is kept (see VMCS02_POOL_SIZE).
+ * These additional VMCSs are kept on the per-CPU list saved_vmcss_on_cpu.
+ */
 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct list_head, saved_vmcss_on_cpu);
 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
 static unsigned long *vmx_io_bitmap_a;
@@ -1818,6 +1832,7 @@  static int hardware_enable(void *garbage
 		return -EBUSY;
 
 	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
+	INIT_LIST_HEAD(&per_cpu(saved_vmcss_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
 	test_bits = FEATURE_CONTROL_LOCKED;
@@ -1860,10 +1875,13 @@  static void kvm_cpu_vmxoff(void)
 	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
 }
 
+static void vmclear_local_saved_vmcss(void);
+
 static void hardware_disable(void *garbage)
 {
 	if (vmm_exclusive) {
 		vmclear_local_vcpus();
+		vmclear_local_saved_vmcss();
 		kvm_cpu_vmxoff();
 	}
 	write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -4248,6 +4266,8 @@  static void __nested_free_saved_vmcs(voi
 	vmcs_clear(saved_vmcs->vmcs);
 	if (per_cpu(current_vmcs, saved_vmcs->cpu) == saved_vmcs->vmcs)
 		per_cpu(current_vmcs, saved_vmcs->cpu) = NULL;
+	list_del(&saved_vmcs->local_saved_vmcss_link);
+	saved_vmcs->cpu = -1;
 }
 
 /*
@@ -4265,6 +4285,21 @@  static void nested_free_saved_vmcs(struc
 	free_vmcs(saved_vmcs->vmcs);
 }
 
+/*
+ * VMCLEAR all the currently unused (not vmx->vmcs on any vcpu) saved_vmcss
+ * which were loaded on the current CPU. See also vmclear_local_vcpus(), which
+ * does the same for the VMCSs currently used by vcpus.
+ */
+static void vmclear_local_saved_vmcss(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct saved_vmcs *v, *n;
+
+	list_for_each_entry_safe(v, n, &per_cpu(saved_vmcss_on_cpu, cpu),
+				 local_saved_vmcss_link)
+		__nested_free_saved_vmcs(v);
+}
+
 /* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
 static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
 {
@@ -5143,6 +5178,38 @@  static void vmx_set_supported_cpuid(u32 
 {
 }
 
+/*
+ * Maintain the vcpus_on_cpu and saved_vmcss_on_cpu lists of vcpus and
+ * inactive saved_vmcss on nested entry (L1->L2) or nested exit (L2->L1).
+ *
+ * nested_maintain_per_cpu_lists should be called after the VMCS was switched
+ * to the new one, with parameters giving both the new one (after the entry
+ * or exit) and the old one, in that order.
+ */
+static void nested_maintain_per_cpu_lists(struct vcpu_vmx *vmx,
+		struct saved_vmcs *new_vmcs,
+		struct saved_vmcs *old_vmcs)
+{
+	/*
+	 * When a vcpu's old vmcs is saved, we need to drop it from
+	 * vcpus_on_cpu and put it on saved_vmcss_on_cpu.
+	 */
+	if (old_vmcs->cpu != -1) {
+		list_del(&vmx->local_vcpus_link);
+		list_add(&old_vmcs->local_saved_vmcss_link,
+			 &per_cpu(saved_vmcss_on_cpu, old_vmcs->cpu));
+	}
+	/*
+	 * When a saved vmcs becomes a vcpu's new vmcs, we need to drop it
+	 * from saved_vmcss_on_cpu and put it on vcpus_on_cpu.
+	 */
+	if (new_vmcs->cpu != -1) {
+		list_del(&new_vmcs->local_saved_vmcss_link);
+		list_add(&vmx->local_vcpus_link,
+			 &per_cpu(vcpus_on_cpu, new_vmcs->cpu));
+	}
+}
+
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage)