diff mbox

[07/31] nVMX: Introduce vmcs02: VMCS used to run L2

Message ID 201105161947.p4GJlUJb001735@rice.haifa.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Nadav Har'El May 16, 2011, 7:47 p.m. UTC
We saw in a previous patch that L1 controls its L2 guest with a vcms12.
L0 needs to create a real VMCS for running L2. We call that "vmcs02".
A later patch will contain the code, prepare_vmcs02(), for filling the vmcs02
fields. This patch only contains code for allocating vmcs02.

In this version, prepare_vmcs02() sets *all* of vmcs02's fields each time we
enter from L1 to L2, so keeping just one vmcs02 for the vcpu is enough: It can
be reused even when L1 runs multiple L2 guests. However, in future versions
we'll probably want to add an optimization where vmcs02 fields that rarely
change will not be set each time. For that, we may want to keep around several
vmcs02s of L2 guests that have recently run, so that potentially we could run
these L2s again more quickly because less vmwrites to vmcs02 will be needed.

This patch adds to each vcpu a vmcs02 pool, vmx->nested.vmcs02_pool,
which remembers the vmcs02s last used to run up to VMCS02_POOL_SIZE L2s.
As explained above, in the current version we choose VMCS02_POOL_SIZE=1,
I.e., one vmcs02 is allocated (and loaded onto the processor), and it is
reused to enter any L2 guest. In the future, when prepare_vmcs02() is
optimized not to set all fields every time, VMCS02_POOL_SIZE should be
increased.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
---
 arch/x86/kvm/vmx.c |  139 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Tian, Kevin May 20, 2011, 8:04 a.m. UTC | #1
> From: Nadav Har'El
> Sent: Tuesday, May 17, 2011 3:48 AM
> 
> We saw in a previous patch that L1 controls its L2 guest with a vcms12.
> L0 needs to create a real VMCS for running L2. We call that "vmcs02".
> A later patch will contain the code, prepare_vmcs02(), for filling the vmcs02
> fields. This patch only contains code for allocating vmcs02.
> 
> In this version, prepare_vmcs02() sets *all* of vmcs02's fields each time we
> enter from L1 to L2, so keeping just one vmcs02 for the vcpu is enough: It can
> be reused even when L1 runs multiple L2 guests. However, in future versions
> we'll probably want to add an optimization where vmcs02 fields that rarely
> change will not be set each time. For that, we may want to keep around several
> vmcs02s of L2 guests that have recently run, so that potentially we could run
> these L2s again more quickly because less vmwrites to vmcs02 will be needed.

That would be a neat enhancement and should have an obvious improvement.
Possibly we can maintain the vmcs02 pool along with L1 VMCLEAR ops, which
is similar to the hardware behavior regarding to cleared and launched state.

> 
> This patch adds to each vcpu a vmcs02 pool, vmx->nested.vmcs02_pool,
> which remembers the vmcs02s last used to run up to VMCS02_POOL_SIZE L2s.
> As explained above, in the current version we choose VMCS02_POOL_SIZE=1,
> I.e., one vmcs02 is allocated (and loaded onto the processor), and it is
> reused to enter any L2 guest. In the future, when prepare_vmcs02() is
> optimized not to set all fields every time, VMCS02_POOL_SIZE should be
> increased.
> 
> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> ---
>  arch/x86/kvm/vmx.c |  139
> +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 139 insertions(+)
> 
> --- .before/arch/x86/kvm/vmx.c	2011-05-16 22:36:47.000000000 +0300
> +++ .after/arch/x86/kvm/vmx.c	2011-05-16 22:36:47.000000000 +0300
> @@ -117,6 +117,7 @@ static int ple_window = KVM_VMX_DEFAULT_
>  module_param(ple_window, int, S_IRUGO);
> 
>  #define NR_AUTOLOAD_MSRS 1
> +#define VMCS02_POOL_SIZE 1
> 
>  struct vmcs {
>  	u32 revision_id;
> @@ -166,6 +167,30 @@ struct __packed vmcs12 {
>  #define VMCS12_SIZE 0x1000
> 
>  /*
> + * When we temporarily switch a vcpu's VMCS (e.g., stop using an L1's VMCS
> + * while we use L2's VMCS), and we wish to save the previous VMCS, we must
> also
> + * remember on which CPU it was last loaded (vcpu->cpu), so when we return
> to
> + * using this VMCS we'll know if we're now running on a different CPU and
> need
> + * to clear the VMCS on the old CPU, and load it on the new one. Additionally,
> + * we need to remember whether this VMCS was launched (vmx->launched),
> so when
> + * we return to it we know if to VMLAUNCH or to VMRESUME it (we cannot
> deduce
> + * this from other state, because it's possible that this VMCS had once been
> + * launched, but has since been cleared after a CPU switch).
> + */
> +struct saved_vmcs {
> +	struct vmcs *vmcs;
> +	int cpu;
> +	int launched;
> +};

"saved" looks a bit misleading here. It's simply a list of all active vmcs02 tracked
by kvm, isn't it?

> +
> +/* Used to remember the last vmcs02 used for some recently used vmcs12s
> */
> +struct vmcs02_list {
> +	struct list_head list;
> +	gpa_t vmcs12_addr;

uniform the name 'vmptr' as nested_vmx strucure:
 /* The guest-physical address of the current VMCS L1 keeps for L2 */
	gpa_t current_vmptr;
	/* The host-usable pointer to the above */
	struct page *current_vmcs12_page;
	struct vmcs12 *current_vmcs12;

you should keep consistent meaning for vmcs12, which means the arch-neutral
state interpreted by KVM only.

> +	struct saved_vmcs vmcs02;
> +};
> +
> +/*
>   * The nested_vmx structure is part of vcpu_vmx, and holds information we
> need
>   * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
>   */
> @@ -178,6 +203,10 @@ struct nested_vmx {
>  	/* The host-usable pointer to the above */
>  	struct page *current_vmcs12_page;
>  	struct vmcs12 *current_vmcs12;
> +
> +	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
> +	struct list_head vmcs02_pool;
> +	int vmcs02_num;
>  };
> 
>  struct vcpu_vmx {
> @@ -4200,6 +4229,111 @@ static int handle_invalid_op(struct kvm_
>  }
> 
>  /*
> + * To run an L2 guest, we need a vmcs02 based the L1-specified vmcs12.
> + * We could reuse a single VMCS for all the L2 guests, but we also want the
> + * option to allocate a separate vmcs02 for each separate loaded vmcs12 -
> this
> + * allows keeping them loaded on the processor, and in the future will allow
> + * optimizations where prepare_vmcs02 doesn't need to set all the fields on
> + * every entry if they never change.
> + * So we keep, in vmx->nested.vmcs02_pool, a cache of size
> VMCS02_POOL_SIZE
> + * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
> + *
> + * The following functions allocate and free a vmcs02 in this pool.
> + */
> +
> +static void __nested_free_saved_vmcs(void *arg)
> +{
> +	struct saved_vmcs *saved_vmcs = arg;
> +
> +	vmcs_clear(saved_vmcs->vmcs);
> +	if (per_cpu(current_vmcs, saved_vmcs->cpu) == saved_vmcs->vmcs)
> +		per_cpu(current_vmcs, saved_vmcs->cpu) = NULL;
> +}
> +
> +/*
> + * Free a VMCS, but before that VMCLEAR it on the CPU where it was last
> loaded
> + * (the necessary information is in the saved_vmcs structure).
> + * See also vcpu_clear() (with different parameters and side-effects)
> + */
> +static void nested_free_saved_vmcs(struct vcpu_vmx *vmx,
> +		struct saved_vmcs *saved_vmcs)
> +{
> +	if (saved_vmcs->cpu != -1)
> +		smp_call_function_single(saved_vmcs->cpu,
> +				__nested_free_saved_vmcs, saved_vmcs, 1);
> +
> +	free_vmcs(saved_vmcs->vmcs);
> +}
> +
> +/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
> +static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
> +{
> +	struct vmcs02_list *item;
> +	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
> +		if (item->vmcs12_addr == vmptr) {
> +			nested_free_saved_vmcs(vmx, &item->vmcs02);
> +			list_del(&item->list);
> +			kfree(item);
> +			vmx->nested.vmcs02_num--;
> +			return;
> +		}
> +}
> +
> +/*
> + * Free all VMCSs saved for this vcpu, except the actual vmx->vmcs.
> + * These include the VMCSs in vmcs02_pool (except the one currently used,
> + * if running L2), and saved_vmcs01 when running L2.
> + */
> +static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
> +{
> +	struct vmcs02_list *item, *n;
> +	list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
> +		if (vmx->vmcs != item->vmcs02.vmcs)
> +			nested_free_saved_vmcs(vmx, &item->vmcs02);
> +		list_del(&item->list);
> +		kfree(item);
> +	}
> +	vmx->nested.vmcs02_num = 0;
> +}
> +
> +/* Get a vmcs02 for the current vmcs12. */
> +static struct saved_vmcs *nested_get_current_vmcs02(struct vcpu_vmx
> *vmx)
> +{
> +	struct vmcs02_list *item;
> +	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
> +		if (item->vmcs12_addr == vmx->nested.current_vmptr) {
> +			list_move(&item->list, &vmx->nested.vmcs02_pool);
> +			return &item->vmcs02;
> +		}
> +
> +	if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
> +		/* Recycle the least recently used VMCS. */
> +		item = list_entry(vmx->nested.vmcs02_pool.prev,
> +			struct vmcs02_list, list);
> +		item->vmcs12_addr = vmx->nested.current_vmptr;
> +		list_move(&item->list, &vmx->nested.vmcs02_pool);
> +		return &item->vmcs02;
> +	}
> +
> +	/* Create a new vmcs02 */
> +	item = (struct vmcs02_list *)
> +		kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
> +	if (!item)
> +		return NULL;
> +	item->vmcs02.vmcs = alloc_vmcs();
> +	if (!item->vmcs02.vmcs) {
> +		kfree(item);
> +		return NULL;
> +	}
> +	item->vmcs12_addr = vmx->nested.current_vmptr;
> +	item->vmcs02.cpu = -1;
> +	item->vmcs02.launched = 0;
> +	list_add(&(item->list), &(vmx->nested.vmcs02_pool));
> +	vmx->nested.vmcs02_num++;
> +	return &item->vmcs02;
> +}
> +
> +/*
>   * Emulate the VMXON instruction.
>   * Currently, we just remember that VMX is active, and do not save or even
>   * inspect the argument to VMXON (the so-called "VMXON pointer") because
> we
> @@ -4235,6 +4369,9 @@ static int handle_vmon(struct kvm_vcpu *
>  		return 1;
>  	}
> 
> +	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
> +	vmx->nested.vmcs02_num = 0;
> +
>  	vmx->nested.vmxon = true;
> 
>  	skip_emulated_instruction(vcpu);
> @@ -4286,6 +4423,8 @@ static void free_nested(struct vcpu_vmx
>  		vmx->nested.current_vmptr = -1ull;
>  		vmx->nested.current_vmcs12 = NULL;
>  	}
> +
> +	nested_free_all_saved_vmcss(vmx);
>  }
> 
>  /* Emulate the VMXOFF instruction */
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tian, Kevin May 20, 2011, 8:48 a.m. UTC | #2
> From: Tian, Kevin
> Sent: Friday, May 20, 2011 4:05 PM
> 
> > From: Nadav Har'El
> > Sent: Tuesday, May 17, 2011 3:48 AM
> >
> > We saw in a previous patch that L1 controls its L2 guest with a vcms12.
> > L0 needs to create a real VMCS for running L2. We call that "vmcs02".
> > A later patch will contain the code, prepare_vmcs02(), for filling the vmcs02
> > fields. This patch only contains code for allocating vmcs02.
> >
> > In this version, prepare_vmcs02() sets *all* of vmcs02's fields each time we
> > enter from L1 to L2, so keeping just one vmcs02 for the vcpu is enough: It can
> > be reused even when L1 runs multiple L2 guests. However, in future versions
> > we'll probably want to add an optimization where vmcs02 fields that rarely
> > change will not be set each time. For that, we may want to keep around
> several
> > vmcs02s of L2 guests that have recently run, so that potentially we could run
> > these L2s again more quickly because less vmwrites to vmcs02 will be
> needed.
> 
> That would be a neat enhancement and should have an obvious improvement.
> Possibly we can maintain the vmcs02 pool along with L1 VMCLEAR ops, which
> is similar to the hardware behavior regarding to cleared and launched state.
> 
> >
> > This patch adds to each vcpu a vmcs02 pool, vmx->nested.vmcs02_pool,
> > which remembers the vmcs02s last used to run up to VMCS02_POOL_SIZE
> L2s.
> > As explained above, in the current version we choose VMCS02_POOL_SIZE=1,
> > I.e., one vmcs02 is allocated (and loaded onto the processor), and it is
> > reused to enter any L2 guest. In the future, when prepare_vmcs02() is
> > optimized not to set all fields every time, VMCS02_POOL_SIZE should be
> > increased.
> >
> > Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> > ---
> >  arch/x86/kvm/vmx.c |  139
> > +++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 139 insertions(+)
> >
> > --- .before/arch/x86/kvm/vmx.c	2011-05-16 22:36:47.000000000 +0300
> > +++ .after/arch/x86/kvm/vmx.c	2011-05-16 22:36:47.000000000 +0300
> > @@ -117,6 +117,7 @@ static int ple_window = KVM_VMX_DEFAULT_
> >  module_param(ple_window, int, S_IRUGO);
> >
> >  #define NR_AUTOLOAD_MSRS 1
> > +#define VMCS02_POOL_SIZE 1
> >
> >  struct vmcs {
> >  	u32 revision_id;
> > @@ -166,6 +167,30 @@ struct __packed vmcs12 {
> >  #define VMCS12_SIZE 0x1000
> >
> >  /*
> > + * When we temporarily switch a vcpu's VMCS (e.g., stop using an L1's
> VMCS
> > + * while we use L2's VMCS), and we wish to save the previous VMCS, we
> must
> > also
> > + * remember on which CPU it was last loaded (vcpu->cpu), so when we
> return
> > to
> > + * using this VMCS we'll know if we're now running on a different CPU and
> > need
> > + * to clear the VMCS on the old CPU, and load it on the new one.
> Additionally,
> > + * we need to remember whether this VMCS was launched (vmx->launched),
> > so when
> > + * we return to it we know if to VMLAUNCH or to VMRESUME it (we cannot
> > deduce
> > + * this from other state, because it's possible that this VMCS had once been
> > + * launched, but has since been cleared after a CPU switch).
> > + */
> > +struct saved_vmcs {
> > +	struct vmcs *vmcs;
> > +	int cpu;
> > +	int launched;
> > +};
> 
> "saved" looks a bit misleading here. It's simply a list of all active vmcs02
> tracked
> by kvm, isn't it?
> 
> > +
> > +/* Used to remember the last vmcs02 used for some recently used vmcs12s
> > */
> > +struct vmcs02_list {
> > +	struct list_head list;
> > +	gpa_t vmcs12_addr;
> 
> uniform the name 'vmptr' as nested_vmx strucure:
>  /* The guest-physical address of the current VMCS L1 keeps for L2 */
> 	gpa_t current_vmptr;
> 	/* The host-usable pointer to the above */
> 	struct page *current_vmcs12_page;
> 	struct vmcs12 *current_vmcs12;
> 
> you should keep consistent meaning for vmcs12, which means the arch-neutral
> state interpreted by KVM only.
> 
> > +	struct saved_vmcs vmcs02;
> > +};
> > +
> > +/*
> >   * The nested_vmx structure is part of vcpu_vmx, and holds information we
> > need
> >   * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
> >   */
> > @@ -178,6 +203,10 @@ struct nested_vmx {
> >  	/* The host-usable pointer to the above */
> >  	struct page *current_vmcs12_page;
> >  	struct vmcs12 *current_vmcs12;
> > +
> > +	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
> > +	struct list_head vmcs02_pool;
> > +	int vmcs02_num;
> >  };
> >
> >  struct vcpu_vmx {
> > @@ -4200,6 +4229,111 @@ static int handle_invalid_op(struct kvm_
> >  }
> >
> >  /*
> > + * To run an L2 guest, we need a vmcs02 based the L1-specified vmcs12.
> > + * We could reuse a single VMCS for all the L2 guests, but we also want the
> > + * option to allocate a separate vmcs02 for each separate loaded vmcs12 -
> > this
> > + * allows keeping them loaded on the processor, and in the future will allow
> > + * optimizations where prepare_vmcs02 doesn't need to set all the fields on
> > + * every entry if they never change.
> > + * So we keep, in vmx->nested.vmcs02_pool, a cache of size
> > VMCS02_POOL_SIZE
> > + * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
> > + *
> > + * The following functions allocate and free a vmcs02 in this pool.
> > + */
> > +
> > +static void __nested_free_saved_vmcs(void *arg)
> > +{
> > +	struct saved_vmcs *saved_vmcs = arg;
> > +
> > +	vmcs_clear(saved_vmcs->vmcs);
> > +	if (per_cpu(current_vmcs, saved_vmcs->cpu) == saved_vmcs->vmcs)
> > +		per_cpu(current_vmcs, saved_vmcs->cpu) = NULL;
> > +}
> > +
> > +/*
> > + * Free a VMCS, but before that VMCLEAR it on the CPU where it was last
> > loaded
> > + * (the necessary information is in the saved_vmcs structure).
> > + * See also vcpu_clear() (with different parameters and side-effects)
> > + */
> > +static void nested_free_saved_vmcs(struct vcpu_vmx *vmx,
> > +		struct saved_vmcs *saved_vmcs)
> > +{
> > +	if (saved_vmcs->cpu != -1)
> > +		smp_call_function_single(saved_vmcs->cpu,
> > +				__nested_free_saved_vmcs, saved_vmcs, 1);
> > +
> > +	free_vmcs(saved_vmcs->vmcs);
> > +}
> > +
> > +/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one)
> */
> > +static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
> > +{
> > +	struct vmcs02_list *item;
> > +	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
> > +		if (item->vmcs12_addr == vmptr) {
> > +			nested_free_saved_vmcs(vmx, &item->vmcs02);
> > +			list_del(&item->list);
> > +			kfree(item);
> > +			vmx->nested.vmcs02_num--;
> > +			return;
> > +		}
> > +}
> > +
> > +/*
> > + * Free all VMCSs saved for this vcpu, except the actual vmx->vmcs.
> > + * These include the VMCSs in vmcs02_pool (except the one currently used,
> > + * if running L2), and saved_vmcs01 when running L2.
> > + */
> > +static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
> > +{
> > +	struct vmcs02_list *item, *n;
> > +	list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
> > +		if (vmx->vmcs != item->vmcs02.vmcs)
> > +			nested_free_saved_vmcs(vmx, &item->vmcs02);
> > +		list_del(&item->list);
> > +		kfree(item);
> > +	}
> > +	vmx->nested.vmcs02_num = 0;
> > +}
> > +
> > +/* Get a vmcs02 for the current vmcs12. */
> > +static struct saved_vmcs *nested_get_current_vmcs02(struct vcpu_vmx
> > *vmx)
> > +{
> > +	struct vmcs02_list *item;
> > +	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
> > +		if (item->vmcs12_addr == vmx->nested.current_vmptr) {
> > +			list_move(&item->list, &vmx->nested.vmcs02_pool);
> > +			return &item->vmcs02;
> > +		}
> > +
> > +	if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
> > +		/* Recycle the least recently used VMCS. */
> > +		item = list_entry(vmx->nested.vmcs02_pool.prev,
> > +			struct vmcs02_list, list);
> > +		item->vmcs12_addr = vmx->nested.current_vmptr;
> > +		list_move(&item->list, &vmx->nested.vmcs02_pool);
> > +		return &item->vmcs02;

btw, shouldn't you clear recycled VMCS and reset 'cpu' and 'launched' fields?

Have you tried SMP L2 guest?

Thanks
Kevin

> > +	}
> > +
> > +	/* Create a new vmcs02 */
> > +	item = (struct vmcs02_list *)
> > +		kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
> > +	if (!item)
> > +		return NULL;
> > +	item->vmcs02.vmcs = alloc_vmcs();
> > +	if (!item->vmcs02.vmcs) {
> > +		kfree(item);
> > +		return NULL;
> > +	}
> > +	item->vmcs12_addr = vmx->nested.current_vmptr;
> > +	item->vmcs02.cpu = -1;
> > +	item->vmcs02.launched = 0;
> > +	list_add(&(item->list), &(vmx->nested.vmcs02_pool));
> > +	vmx->nested.vmcs02_num++;
> > +	return &item->vmcs02;
> > +}
> > +
> > +/*
> >   * Emulate the VMXON instruction.
> >   * Currently, we just remember that VMX is active, and do not save or even
> >   * inspect the argument to VMXON (the so-called "VMXON pointer")
> because
> > we
> > @@ -4235,6 +4369,9 @@ static int handle_vmon(struct kvm_vcpu *
> >  		return 1;
> >  	}
> >
> > +	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
> > +	vmx->nested.vmcs02_num = 0;
> > +
> >  	vmx->nested.vmxon = true;
> >
> >  	skip_emulated_instruction(vcpu);
> > @@ -4286,6 +4423,8 @@ static void free_nested(struct vcpu_vmx
> >  		vmx->nested.current_vmptr = -1ull;
> >  		vmx->nested.current_vmcs12 = NULL;
> >  	}
> > +
> > +	nested_free_all_saved_vmcss(vmx);
> >  }
> >
> >  /* Emulate the VMXOFF instruction */
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nadav Har'El May 20, 2011, 8:32 p.m. UTC | #3
On Fri, May 20, 2011, Tian, Kevin wrote about "RE: [PATCH 07/31] nVMX: Introduce vmcs02: VMCS used to run L2":
> btw, shouldn't you clear recycled VMCS and reset 'cpu' and 'launched' fields?

Well, I believe the answer is "no": As far as I understood, a host is allowed
to take a VMCS that has been used once to launch a certain guest, and then
modify all the VMCS's fields to define a completely different guest, and then
VMRESUME it, without doing the regular VMCLEAR/VMLAUNCH, even though it's
"a different guest". Is there something wrong in my assumption? Does VMX keep
anything constant between successive VMRESUMEs?

> Have you tried SMP L2 guest?

It "sort of" works, but it *does* appear to still have a bug which I didn't
yet have the time to hunt... In one case, for example, an 8-vcpu L2 on an
8-vcpu L1 seemed to work well (e.g., doing parallel make) for about a minute,
and then hung with some sort of page fault in the kernel.

Nadav.
Tian, Kevin May 22, 2011, 2 a.m. UTC | #4
> From: Nadav Har'El [mailto:nyh@math.technion.ac.il]
> Sent: Saturday, May 21, 2011 4:32 AM
> 
> On Fri, May 20, 2011, Tian, Kevin wrote about "RE: [PATCH 07/31] nVMX:
> Introduce vmcs02: VMCS used to run L2":
> > btw, shouldn't you clear recycled VMCS and reset 'cpu' and 'launched' fields?
> 
> Well, I believe the answer is "no": As far as I understood, a host is allowed
> to take a VMCS that has been used once to launch a certain guest, and then
> modify all the VMCS's fields to define a completely different guest, and then
> VMRESUME it, without doing the regular VMCLEAR/VMLAUNCH, even though
> it's
> "a different guest". Is there something wrong in my assumption? Does VMX
> keep
> anything constant between successive VMRESUMEs?

Yes, you can reuse a VMCS with a completely different state if the VMCS is used
on the same processor, and you must ensure that VMCS not having a dirty state
on other processors. The SDM 3B (21.10.1) explicitly requires:

----

No VMCS should ever be active on more than one logical processor. If a VMCS is to be
"migrated" from one logical processor to another, the first logical processor should
execute VMCLEAR for the VMCS (to make it inactive on that logical processor and to
ensure that all VMCS data are in memory) before the other logical processor
executes VMPTRLD for the VMCS (to make it active on the second logical processor).
A VMCS that is made active on more than one logical processor may become
corrupted

----

Here the vmcs02 being overridden may have been run on another processor before
but is not vmclear-ed yet. When you resume this vmcs02 with new content on a 
separate processor, the risk of corruption exists.

> 
> > Have you tried SMP L2 guest?
> 
> It "sort of" works, but it *does* appear to still have a bug which I didn't
> yet have the time to hunt... In one case, for example, an 8-vcpu L2 on an
> 8-vcpu L1 seemed to work well (e.g., doing parallel make) for about a minute,
> and then hung with some sort of page fault in the kernel.
> 

See whether cleaning up above can help here.

Thanks
Kevin
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nadav Har'El May 22, 2011, 7:22 a.m. UTC | #5
Hi,

On Sun, May 22, 2011, Tian, Kevin wrote about "RE: [PATCH 07/31] nVMX: Introduce vmcs02: VMCS used to run L2":
> Here the vmcs02 being overridden may have been run on another processor before
> but is not vmclear-ed yet. When you resume this vmcs02 with new content on a 
> separate processor, the risk of corruption exists.

I still believe that my current code is correct (in this area). I'll try to
explain it here and would be grateful if you could point to me the error (if
there is one) in my logic:

Nested_vmx_run() is our function which is switches from running L1 to L2
(patch 18).

This function starts by calling nested_get_current_vmcs02(), which gets us
*some* vmcs to use for vmcs02. This may be a fresh new VMCS, or a "recycled"
VMCS, some VMCS we've previously used to run some, potentially different L2
guest on some, potentially different, CPU.
nested_get_current_vmcs02() returns a "saved_vmcs" structure, which
not only contains a VMCS, but also remembers on which (if any) cpu it is
currently loaded (and whether it was VMLAUNCHed once on that cpu).

The next thing that Nested_vmx_run() now does is to set up in the vcpu object
the vmcs, cpu and launched fields according to what was returned above.

Now it calls vmx_vcpu_load(). This standard KVM function checks if we're now
running on a different CPU from the vcpu->cpu, and if it a different one, is
uses vcpu_clear() to VMCLEAR the vmcs on the CPU where it was last loaded
(using an IPI). Only after it vmclears the VMCS on the old CPU, it can finally
load the VMCS on the new CPU.

Only now Nested_vmx_run() can call prepare_vmcs02, which starts VMWRITEing
to this VMCS, and finally returns.

P.S. Seeing that you're from Intel, maybe you can help me with a pointer:
I found what appears to be a small error in the SDM - who can I report it to?

Thanks,
Nadav.
Nadav Har'El May 22, 2011, 8:29 a.m. UTC | #6
Hi,

On Fri, May 20, 2011, Tian, Kevin wrote about "RE: [PATCH 07/31] nVMX: Introduce vmcs02: VMCS used to run L2":
> Possibly we can maintain the vmcs02 pool along with L1 VMCLEAR ops, which
> is similar to the hardware behavior regarding to cleared and launched state.

If you set VMCS02_POOL_SIZE to a large size, and L1, like typical hypervisors,
only keeps around a few VMCSs (and VMCLEARs the ones it will not use again),
then we'll only have a few vmcs02: handle_vmclear() removes from the pool the
vmcs02 that L1 explicitly told us it won't need again.

> > +struct saved_vmcs {
> > +	struct vmcs *vmcs;
> > +	int cpu;
> > +	int launched;
> > +};
> 
> "saved" looks a bit misleading here. It's simply a list of all active vmcs02 tracked
> by kvm, isn't it?

I have rewritten this part of the code, based on Avi's and Marcelo's requests,
and the new name for this structure is "loaded_vmcs", i.e., a structure
describing where a VMCS was loaded.


> > +/* Used to remember the last vmcs02 used for some recently used vmcs12s
> > */
> > +struct vmcs02_list {
> > +	struct list_head list;
> > +	gpa_t vmcs12_addr;
> 
> uniform the name 'vmptr' as nested_vmx strucure:

Ok. Changing all the mentions of "vmcs12_addr" to vmptr.
Tian, Kevin May 24, 2011, 12:54 a.m. UTC | #7
> From: Nadav Har'El [mailto:nyh@math.technion.ac.il]
> Sent: Sunday, May 22, 2011 3:23 PM
> 
> Hi,
> 
> On Sun, May 22, 2011, Tian, Kevin wrote about "RE: [PATCH 07/31] nVMX:
> Introduce vmcs02: VMCS used to run L2":
> > Here the vmcs02 being overridden may have been run on another processor
> before
> > but is not vmclear-ed yet. When you resume this vmcs02 with new content on
> a
> > separate processor, the risk of corruption exists.
> 
> I still believe that my current code is correct (in this area). I'll try to
> explain it here and would be grateful if you could point to me the error (if
> there is one) in my logic:
> 
> Nested_vmx_run() is our function which is switches from running L1 to L2
> (patch 18).
> 
> This function starts by calling nested_get_current_vmcs02(), which gets us
> *some* vmcs to use for vmcs02. This may be a fresh new VMCS, or a
> "recycled"
> VMCS, some VMCS we've previously used to run some, potentially different L2
> guest on some, potentially different, CPU.
> nested_get_current_vmcs02() returns a "saved_vmcs" structure, which
> not only contains a VMCS, but also remembers on which (if any) cpu it is
> currently loaded (and whether it was VMLAUNCHed once on that cpu).
> 
> The next thing that Nested_vmx_run() now does is to set up in the vcpu object
> the vmcs, cpu and launched fields according to what was returned above.
> 
> Now it calls vmx_vcpu_load(). This standard KVM function checks if we're now
> running on a different CPU from the vcpu->cpu, and if it a different one, is
> uses vcpu_clear() to VMCLEAR the vmcs on the CPU where it was last loaded
> (using an IPI). Only after it vmclears the VMCS on the old CPU, it can finally
> load the VMCS on the new CPU.
> 
> Only now Nested_vmx_run() can call prepare_vmcs02, which starts
> VMWRITEing
> to this VMCS, and finally returns.
> 

yes, you're correct. Previously I just looked around 07/31 and raised above concern.
Along with nested_vmx_run you explained above, this part is clear to me now. :-)

> P.S. Seeing that you're from Intel, maybe you can help me with a pointer:
> I found what appears to be a small error in the SDM - who can I report it to?
> 

Let me ask for you.

Thanks
Kevin
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tian, Kevin May 24, 2011, 1:03 a.m. UTC | #8
> From: Nadav Har'El [mailto:nyh@math.technion.ac.il]
> Sent: Sunday, May 22, 2011 4:30 PM
> 
> Hi,
> 
> On Fri, May 20, 2011, Tian, Kevin wrote about "RE: [PATCH 07/31] nVMX:
> Introduce vmcs02: VMCS used to run L2":
> > Possibly we can maintain the vmcs02 pool along with L1 VMCLEAR ops, which
> > is similar to the hardware behavior regarding to cleared and launched state.
> 
> If you set VMCS02_POOL_SIZE to a large size, and L1, like typical hypervisors,
> only keeps around a few VMCSs (and VMCLEARs the ones it will not use again),
> then we'll only have a few vmcs02: handle_vmclear() removes from the pool the
> vmcs02 that L1 explicitly told us it won't need again.

yes

> 
> > > +struct saved_vmcs {
> > > +	struct vmcs *vmcs;
> > > +	int cpu;
> > > +	int launched;
> > > +};
> >
> > "saved" looks a bit misleading here. It's simply a list of all active vmcs02
> tracked
> > by kvm, isn't it?
> 
> I have rewritten this part of the code, based on Avi's and Marcelo's requests,
> and the new name for this structure is "loaded_vmcs", i.e., a structure
> describing where a VMCS was loaded.

great, I'll take a look at your new code.

Thanks
Kevin
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

--- .before/arch/x86/kvm/vmx.c	2011-05-16 22:36:47.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c	2011-05-16 22:36:47.000000000 +0300
@@ -117,6 +117,7 @@  static int ple_window = KVM_VMX_DEFAULT_
 module_param(ple_window, int, S_IRUGO);
 
 #define NR_AUTOLOAD_MSRS 1
+#define VMCS02_POOL_SIZE 1
 
 struct vmcs {
 	u32 revision_id;
@@ -166,6 +167,30 @@  struct __packed vmcs12 {
 #define VMCS12_SIZE 0x1000
 
 /*
+ * When we temporarily switch a vcpu's VMCS (e.g., stop using an L1's VMCS
+ * while we use L2's VMCS), and we wish to save the previous VMCS, we must also
+ * remember on which CPU it was last loaded (vcpu->cpu), so when we return to
+ * using this VMCS we'll know if we're now running on a different CPU and need
+ * to clear the VMCS on the old CPU, and load it on the new one. Additionally,
+ * we need to remember whether this VMCS was launched (vmx->launched), so when
+ * we return to it we know if to VMLAUNCH or to VMRESUME it (we cannot deduce
+ * this from other state, because it's possible that this VMCS had once been
+ * launched, but has since been cleared after a CPU switch).
+ */
+struct saved_vmcs {
+	struct vmcs *vmcs;
+	int cpu;
+	int launched;
+};
+
+/* Used to remember the last vmcs02 used for some recently used vmcs12s */
+struct vmcs02_list {
+	struct list_head list;
+	gpa_t vmcs12_addr;
+	struct saved_vmcs vmcs02;
+};
+
+/*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
  */
@@ -178,6 +203,10 @@  struct nested_vmx {
 	/* The host-usable pointer to the above */
 	struct page *current_vmcs12_page;
 	struct vmcs12 *current_vmcs12;
+
+	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
+	struct list_head vmcs02_pool;
+	int vmcs02_num;
 };
 
 struct vcpu_vmx {
@@ -4200,6 +4229,111 @@  static int handle_invalid_op(struct kvm_
 }
 
 /*
+ * To run an L2 guest, we need a vmcs02 based the L1-specified vmcs12.
+ * We could reuse a single VMCS for all the L2 guests, but we also want the
+ * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
+ * allows keeping them loaded on the processor, and in the future will allow
+ * optimizations where prepare_vmcs02 doesn't need to set all the fields on
+ * every entry if they never change.
+ * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
+ * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
+ *
+ * The following functions allocate and free a vmcs02 in this pool.
+ */
+
+static void __nested_free_saved_vmcs(void *arg)
+{
+	struct saved_vmcs *saved_vmcs = arg;
+
+	vmcs_clear(saved_vmcs->vmcs);
+	if (per_cpu(current_vmcs, saved_vmcs->cpu) == saved_vmcs->vmcs)
+		per_cpu(current_vmcs, saved_vmcs->cpu) = NULL;
+}
+
+/*
+ * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
+ * (the necessary information is in the saved_vmcs structure).
+ * See also vcpu_clear() (with different parameters and side-effects)
+ */
+static void nested_free_saved_vmcs(struct vcpu_vmx *vmx,
+		struct saved_vmcs *saved_vmcs)
+{
+	if (saved_vmcs->cpu != -1)
+		smp_call_function_single(saved_vmcs->cpu,
+				__nested_free_saved_vmcs, saved_vmcs, 1);
+
+	free_vmcs(saved_vmcs->vmcs);
+}
+
+/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
+static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+	struct vmcs02_list *item;
+	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+		if (item->vmcs12_addr == vmptr) {
+			nested_free_saved_vmcs(vmx, &item->vmcs02);
+			list_del(&item->list);
+			kfree(item);
+			vmx->nested.vmcs02_num--;
+			return;
+		}
+}
+
+/*
+ * Free all VMCSs saved for this vcpu, except the actual vmx->vmcs.
+ * These include the VMCSs in vmcs02_pool (except the one currently used,
+ * if running L2), and saved_vmcs01 when running L2.
+ */
+static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
+{
+	struct vmcs02_list *item, *n;
+	list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
+		if (vmx->vmcs != item->vmcs02.vmcs)
+			nested_free_saved_vmcs(vmx, &item->vmcs02);
+		list_del(&item->list);
+		kfree(item);
+	}
+	vmx->nested.vmcs02_num = 0;
+}
+
+/* Get a vmcs02 for the current vmcs12. */
+static struct saved_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
+{
+	struct vmcs02_list *item;
+	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+		if (item->vmcs12_addr == vmx->nested.current_vmptr) {
+			list_move(&item->list, &vmx->nested.vmcs02_pool);
+			return &item->vmcs02;
+		}
+
+	if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
+		/* Recycle the least recently used VMCS. */
+		item = list_entry(vmx->nested.vmcs02_pool.prev,
+			struct vmcs02_list, list);
+		item->vmcs12_addr = vmx->nested.current_vmptr;
+		list_move(&item->list, &vmx->nested.vmcs02_pool);
+		return &item->vmcs02;
+	}
+
+	/* Create a new vmcs02 */
+	item = (struct vmcs02_list *)
+		kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+	if (!item)
+		return NULL;
+	item->vmcs02.vmcs = alloc_vmcs();
+	if (!item->vmcs02.vmcs) {
+		kfree(item);
+		return NULL;
+	}
+	item->vmcs12_addr = vmx->nested.current_vmptr;
+	item->vmcs02.cpu = -1;
+	item->vmcs02.launched = 0;
+	list_add(&(item->list), &(vmx->nested.vmcs02_pool));
+	vmx->nested.vmcs02_num++;
+	return &item->vmcs02;
+}
+
+/*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
@@ -4235,6 +4369,9 @@  static int handle_vmon(struct kvm_vcpu *
 		return 1;
 	}
 
+	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
+	vmx->nested.vmcs02_num = 0;
+
 	vmx->nested.vmxon = true;
 
 	skip_emulated_instruction(vcpu);
@@ -4286,6 +4423,8 @@  static void free_nested(struct vcpu_vmx 
 		vmx->nested.current_vmptr = -1ull;
 		vmx->nested.current_vmcs12 = NULL;
 	}
+
+	nested_free_all_saved_vmcss(vmx);
 }
 
 /* Emulate the VMXOFF instruction */