
[3/6] Nested VMX patch 3 implements vmptrld and vmptrst

Message ID 1251905916-2834-4-git-send-email-oritw@il.ibm.com (mailing list archive)
State New, archived

Commit Message

oritw@il.ibm.com Sept. 2, 2009, 3:38 p.m. UTC
From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |  533 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 523 insertions(+), 10 deletions(-)

Comments

Avi Kivity Sept. 2, 2009, 8:05 p.m. UTC | #1
On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> +struct __attribute__ ((__packed__)) level_state {
> +	struct shadow_vmcs *shadow_vmcs;
> +
> +	u16 vpid;
> +	u64 shadow_efer;
> +	unsigned long cr2;
> +	unsigned long cr3;
> +	unsigned long cr4;
> +	unsigned long cr8;
> +
> +	u64 io_bitmap_a;
> +	u64 io_bitmap_b;
> +	u64 msr_bitmap;
> +
> +	struct vmcs *vmcs;
> +	int cpu;
> +	int launched;
> +};
>    



> +
>   struct vmcs {
>   	u32 revision_id;
>   	u32 abort;
> @@ -72,6 +217,17 @@ struct nested_vmx {
>   	bool vmon;
>   	/* Has the level1 guest done vmclear? */
>   	bool vmclear;
> +	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
> +	u64 l1_cur_vmcs;
>    

This is the vmptr (exactly as loaded by vmptrld), right?  If so, please 
call it vmptr.

> +	/*
> +	 * Level 2 state : includes vmcs,registers and
> +	 * a copy of vmcs12 for vmread/vmwrite
> +	 */
> +	struct level_state *l2_state;
> +
> +	/* Level 1 state for switching to level 2 and back */
> +	struct level_state *l1_state;
>    

Can you explain why we need two of them?  in the guest vmcs we have host 
and guest values, and in l1_state and l2_state we have more copies, and 
in struct vcpu we have yet another set of copies.  We also have a couple 
of copies in the host vmcs.  I'm getting dizzy...


>   static int init_rmode(struct kvm *kvm);
>   static u64 construct_eptp(unsigned long root_hpa);
>
>
>
> +int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
> +{
> +	gpa_t gpa;
> +	struct page *page;
> +	int r = 0;
> +
> +	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
> +
> +	/* checking guest gpa */
> +	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
> +	if (is_error_page(page)) {
> +		printk(KERN_ERR "%s Invalid guest vmcs addr %llx\n",
> +		       __func__, gpa);
> +		r = 1;
> +		goto out;
> +	}
> +
> +	r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
> +	if (r) {
> +		printk(KERN_ERR "%s cannot read guest vmcs addr %llx : %d\n",
> +		       __func__, gpa, r);
> +		goto out;
> +	}
>    

You can use kvm_read_guest_virt() to simplify this.
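Something like this would do (untested sketch; it assumes the kvm_read_guest_virt(gva, data, bytes, vcpu) helper from x86.c, so treat the exact signature as an assumption rather than a quote of the final code):

static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
{
	int r;

	/* read the 64-bit vmcs pointer operand straight through the mmu */
	r = kvm_read_guest_virt(vcpu->arch.regs[VCPU_REGS_RAX], gentry,
				sizeof(u64), vcpu);
	if (r) {
		printk(KERN_ERR "%s cannot read guest vmcs addr: %d\n",
		       __func__, r);
		return 1;
	}

	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
		printk(KERN_DEBUG "%s addr %llx not aligned\n",
		       __func__, *gentry);
		return 1;
	}

	return 0;
}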

> +
> +	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> +		printk(KERN_DEBUG "%s addr %llx not aligned\n",
> +		       __func__, *gentry);
> +		return 1;
> +	}
> +
> +out:
> +	kvm_release_page_clean(page);
> +	return r;
> +}
> +
> +static int handle_vmptrld(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct page *vmcs_page;
> +	u64 guest_vmcs_addr;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
> +		return 1;
> +
> +	if (create_l1_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> +		return 1;
> +	}
> +
> +	if (create_l2_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l2_state failed\n", __func__);
> +		return 1;
> +	}
> +
> +	vmx->nested.l2_state->vmcs = alloc_vmcs();
> +	if (!vmx->nested.l2_state->vmcs) {
> +		printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
> +		return 1;
> +	}
> +
> +	if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
> +		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> +		if (vmcs_page == NULL)
> +			return 1;
> +
> +		/* load nested vmcs to processor */
> +		if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
>    

So, you're loading a guest page as the vmcs.  This is dangerous as the 
guest can play with it.  Much better to use inaccessible memory (and you 
do alloc_vmcs() earlier?)

> +
> +static int handle_vmptrst(struct kvm_vcpu *vcpu)
> +{
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	vcpu->arch.regs[VCPU_REGS_RAX] = to_vmx(vcpu)->nested.l1_cur_vmcs;
>    

Should store to mem64 according to the docs?

Better done through the emulator.
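For illustration, a minimal sketch that honours the mem64 operand without the emulator (untested; it keeps the patch's own assumption that RAX holds the operand's gva and reuses gva_to_gpa + kvm_write_guest, whereas going through the emulator would also decode the addressing mode exactly):

static int handle_vmptrst(struct kvm_vcpu *vcpu)
{
	u64 vmptr = to_vmx(vcpu)->nested.l1_cur_vmcs;
	gpa_t gpa;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/* resolve the mem64 destination operand and store the vmptr there */
	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
	if (kvm_write_guest(vcpu->kvm, gpa, &vmptr, sizeof(vmptr)))
		return 1;

	clear_rflags_cf_zf(vcpu);
	skip_emulated_instruction(vcpu);
	return 1;
}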

> +void save_vmcs(struct shadow_vmcs *dst)
> +{
> +	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> +	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> +	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> +	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> +	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> +	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> +	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> +	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> +	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> +	if (cpu_has_vmx_msr_bitmap())
> +		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
> +
> +	dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
> +	dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
> +	dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
> +	dst->tsc_offset = vmcs_read64(TSC_OFFSET);
> +	dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
> +	dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
> +	if (enable_ept)
> +		dst->ept_pointer = vmcs_read64(EPT_POINTER);
> +
> +	dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> +	dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> +	dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> +	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> +		dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> +	if (enable_ept) {
> +		dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
> +		dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
> +		dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
> +		dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
> +	}
> +	dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
> +	dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> +	dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
> +	dst->page_fault_error_code_mask =
> +		vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
> +	dst->page_fault_error_code_match =
> +		vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
> +	dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> +	dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
> +	dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
> +	dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
> +	dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
> +	dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
> +	dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> +	dst->vm_entry_exception_error_code =
> +		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> +	dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> +	dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
> +	dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> +	if (enable_vpid && dst->secondary_vm_exec_control &
> +	    SECONDARY_EXEC_ENABLE_VPID)
> +		dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
> +	dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
> +	dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> +	dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> +	dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +	dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
> +	dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> +	dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> +	dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> +	dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> +	dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> +	dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> +	dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> +	dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> +	dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> +	dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> +	dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> +	dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> +	dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> +	dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> +	dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> +	dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> +	dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> +	dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> +	dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> +	dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> +	dst->guest_interruptibility_info =
> +		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +	dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
> +	dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> +	dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
> +	dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
> +	dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
> +	dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> +	dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> +	dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
> +	dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
> +	dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
> +	dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
> +	dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> +	dst->guest_cr0 = vmcs_readl(GUEST_CR0);
> +	dst->guest_cr3 = vmcs_readl(GUEST_CR3);
> +	dst->guest_cr4 = vmcs_readl(GUEST_CR4);
> +	dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> +	dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> +	dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> +	dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> +	dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> +	dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> +	dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> +	dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> +	dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> +	dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> +	dst->guest_dr7 = vmcs_readl(GUEST_DR7);
> +	dst->guest_rsp = vmcs_readl(GUEST_RSP);
> +	dst->guest_rip = vmcs_readl(GUEST_RIP);
> +	dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> +	dst->guest_pending_dbg_exceptions =
> +		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> +	dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> +	dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> +	dst->host_cr0 = vmcs_readl(HOST_CR0);
> +	dst->host_cr3 = vmcs_readl(HOST_CR3);
> +	dst->host_cr4 = vmcs_readl(HOST_CR4);
> +	dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
> +	dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
> +	dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
> +	dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> +	dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> +	dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
> +	dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
> +	dst->host_rsp = vmcs_readl(HOST_RSP);
> +	dst->host_rip = vmcs_readl(HOST_RIP);
> +	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
> +		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
> +}
>    

I see.  You're using the processor's format when reading the guest 
vmcs.  But we don't have to do that, we can use the shadow_vmcs 
structure (and a memcpy).
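In other words, if L1's vmcs12 page is defined to hold a struct shadow_vmcs, the whole thing can become a bulk guest-memory copy instead of ~150 vmreads. A rough sketch of the idea, assuming the shadow_vmcs layout from this patch is what L1 keeps at its vmptr:

static int nested_copy_vmcs12(struct kvm_vcpu *vcpu, u64 vmptr,
			      struct shadow_vmcs *dst)
{
	/* one bulk copy from guest memory instead of vmptrld + vmreads */
	return kvm_read_guest(vcpu->kvm, vmptr, dst, sizeof(*dst));
}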
oritw@il.ibm.com Sept. 3, 2009, 2:25 p.m. UTC | #2
Avi Kivity <avi@redhat.com> wrote on 02/09/2009 23:05:09:

> On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> > +struct __attribute__ ((__packed__)) level_state {
> > +   struct shadow_vmcs *shadow_vmcs;
> > +
> > +   u16 vpid;
> > +   u64 shadow_efer;
> > +   unsigned long cr2;
> > +   unsigned long cr3;
> > +   unsigned long cr4;
> > +   unsigned long cr8;
> > +
> > +   u64 io_bitmap_a;
> > +   u64 io_bitmap_b;
> > +   u64 msr_bitmap;
> > +
> > +   struct vmcs *vmcs;
> > +   int cpu;
> > +   int launched;
> > +};
> >
>
>
>
> > +
> >   struct vmcs {
> >      u32 revision_id;
> >      u32 abort;
> > @@ -72,6 +217,17 @@ struct nested_vmx {
> >      bool vmon;
> >      /* Has the level1 guest done vmclear? */
> >      bool vmclear;
> > +   /* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
> > +   u64 l1_cur_vmcs;
> >
>
> This is the vmptr (exactly as loaded by vmptrld), right?  If so, please
> call it vmptr.
OK I will change it.
>
> > +   /*
> > +    * Level 2 state : includes vmcs,registers and
> > +    * a copy of vmcs12 for vmread/vmwrite
> > +    */
> > +   struct level_state *l2_state;
> > +
> > +   /* Level 1 state for switching to level 2 and back */
> > +   struct level_state *l1_state;
> >
>
> Can you explain why we need two of them?  in the guest vmcs we have host
> and guest values, and in l1_state and l2_state we have more copies, and
> in struct vcpu we have yet another set of copies.  We also have a couple
> of copies in the host vmcs.  I'm getting dizzy...
L2_state stores all the L2 guest state:
      vmcs - a pointer to VMCS02, the VMCS L0 uses to run L2.
      shadow vmcs - a structure storing the values of VMCS12 (the vmcs L1
creates to run L2).
      cpu - the cpu id
      launched - launched flag
      vpid - the vpid allocated by L0 for L2 (we need to store it somewhere)
      msr_bitmap - at the moment we use the L0 msr_bitmap (as we are running
kvm on kvm); in the future we will use a merge of both bitmaps.
      io_bitmaps - at the moment we use the L0 io_bitmaps (as we are running
kvm on kvm); in the future we will use a merge of both io_bitmaps.

L1_state stores the L1 state:
      vmcs - a pointer to VMCS01
      shadow vmcs - a structure storing the values of VMCS01. We use it
when updating VMCS02 in order to avoid the need to switch between VMCS02
and VMCS01.
      cpu - the cpu id
      launched - launched flag
      vpid - the vpid allocated by L0 for L1 (we need to store it somewhere)
      shadow_efer - until recently this wasn't part of the VMCS and it may be
needed for older processors.
      cr0 - not used, I will remove it
      cr2 - not used, I will remove it
      cr3
      cr4

We don't use the state stored in the vcpu for L1 because it can sometimes
change while L2 runs.
The vmcs pointer in the vcpu points to the active vmcs: VMCS01 when L1 is
running and VMCS02 when the L2 guest is running.

>
>
> >   static int init_rmode(struct kvm *kvm);
> >   static u64 construct_eptp(unsigned long root_hpa);
> >
> >
> >
> > +int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
> > +{
> > +   gpa_t gpa;
> > +   struct page *page;
> > +   int r = 0;
> > +
> > +   gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
> > +
> > +   /* checking guest gpa */
> > +   page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
> > +   if (is_error_page(page)) {
> > +      printk(KERN_ERR "%s Invalid guest vmcs addr %llx\n",
> > +             __func__, gpa);
> > +      r = 1;
> > +      goto out;
> > +   }
> > +
> > +   r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
> > +   if (r) {
> > +      printk(KERN_ERR "%s cannot read guest vmcs addr %llx : %d\n",
> > +             __func__, gpa, r);
> > +      goto out;
> > +   }
> >
>
> You can use kvm_read_guest_virt() to simplify this.
I will fix it.
>
> > +
> > +   if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> > +      printk(KERN_DEBUG "%s addr %llx not aligned\n",
> > +             __func__, *gentry);
> > +      return 1;
> > +   }
> > +
> > +out:
> > +   kvm_release_page_clean(page);
> > +   return r;
> > +}
> > +
> > +static int handle_vmptrld(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   struct page *vmcs_page;
> > +   u64 guest_vmcs_addr;
> > +
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
> > +      return 1;
> > +
> > +   if (create_l1_state(vcpu)) {
> > +      printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> > +      return 1;
> > +   }
> > +
> > +   if (create_l2_state(vcpu)) {
> > +      printk(KERN_ERR "%s create_l2_state failed\n", __func__);
> > +      return 1;
> > +   }
> > +
> > +   vmx->nested.l2_state->vmcs = alloc_vmcs();
> > +   if (!vmx->nested.l2_state->vmcs) {
> > +      printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
> > +      return 1;
> > +   }
> > +
> > +   if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
> > +      vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> > +      if (vmcs_page == NULL)
> > +         return 1;
> > +
> > +      /* load nested vmcs to processor */
> > +      if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
> >
>
> So, you're loading a guest page as the vmcs.  This is dangerous as the
> guest can play with it.  Much better to use inaccessible memory (and you
> do alloc_vmcs() earlier?)
We can copy the vmcs and then vmptrld it. As for the allocated vmcs, this is
a memory leak and I will fix it (it should be allocated only once).

> > +
> > +static int handle_vmptrst(struct kvm_vcpu *vcpu)
> > +{
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   vcpu->arch.regs[VCPU_REGS_RAX] = to_vmx(vcpu)->nested.l1_cur_vmcs;
> >
>
> Should store to mem64 according to the docs?
>
> Better done through the emulator.
Sure I will fix it.
>
> > +void save_vmcs(struct shadow_vmcs *dst)
> > +{
> > +   dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> > +   dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> > +   dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> > +   dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> > +   dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> > +   dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> > +   dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> > +   dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> > +   dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> > +   dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> > +   dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> > +   dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> > +   dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> > +   dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> > +   dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> > +   dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> > +   dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> > +   if (cpu_has_vmx_msr_bitmap())
> > +      dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
> > +
> > +   dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
> > +   dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
> > +   dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
> > +   dst->tsc_offset = vmcs_read64(TSC_OFFSET);
> > +   dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
> > +   dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
> > +   if (enable_ept)
> > +      dst->ept_pointer = vmcs_read64(EPT_POINTER);
> > +
> > +   dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> > +   dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> > +   dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> > +   if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> > +      dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> > +   if (enable_ept) {
> > +      dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
> > +      dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
> > +      dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
> > +      dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
> > +   }
> > +   dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
> > +   dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> > +   dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
> > +   dst->page_fault_error_code_mask =
> > +      vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
> > +   dst->page_fault_error_code_match =
> > +      vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
> > +   dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> > +   dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
> > +   dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
> > +   dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
> > +   dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
> > +   dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
> > +   dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> > +   dst->vm_entry_exception_error_code =
> > +      vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> > +   dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> > +   dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
> > +   dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> > +   if (enable_vpid && dst->secondary_vm_exec_control &
> > +       SECONDARY_EXEC_ENABLE_VPID)
> > +      dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
> > +   dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
> > +   dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> > +   dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > +   dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> > +   dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> > +   dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
> > +   dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> > +   dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> > +   dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> > +   dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> > +   dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> > +   dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> > +   dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> > +   dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> > +   dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> > +   dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> > +   dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> > +   dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> > +   dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> > +   dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> > +   dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> > +   dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> > +   dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> > +   dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> > +   dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> > +   dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> > +   dst->guest_interruptibility_info =
> > +      vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> > +   dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
> > +   dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> > +   dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
> > +   dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
> > +   dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
> > +   dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> > +   dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> > +   dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
> > +   dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
> > +   dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
> > +   dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
> > +   dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> > +   dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> > +   dst->guest_cr0 = vmcs_readl(GUEST_CR0);
> > +   dst->guest_cr3 = vmcs_readl(GUEST_CR3);
> > +   dst->guest_cr4 = vmcs_readl(GUEST_CR4);
> > +   dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> > +   dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> > +   dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> > +   dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> > +   dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> > +   dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> > +   dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> > +   dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> > +   dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> > +   dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> > +   dst->guest_dr7 = vmcs_readl(GUEST_DR7);
> > +   dst->guest_rsp = vmcs_readl(GUEST_RSP);
> > +   dst->guest_rip = vmcs_readl(GUEST_RIP);
> > +   dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> > +   dst->guest_pending_dbg_exceptions =
> > +      vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> > +   dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> > +   dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> > +   dst->host_cr0 = vmcs_readl(HOST_CR0);
> > +   dst->host_cr3 = vmcs_readl(HOST_CR3);
> > +   dst->host_cr4 = vmcs_readl(HOST_CR4);
> > +   dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
> > +   dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
> > +   dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
> > +   dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> > +   dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> > +   dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
> > +   dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
> > +   dst->host_rsp = vmcs_readl(HOST_RSP);
> > +   dst->host_rip = vmcs_readl(HOST_RIP);
> > +   if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
> > +      dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
> > +}
> >
>
> I see.  You're using the processor's format when reading the guest
> vmcs.  But we don't have to do that, we can use the shadow_vmcs
> structure (and a memcpy).
I'm sorry, I don't understand your comment, can you elaborate?
Avi Kivity Sept. 6, 2009, 9:25 a.m. UTC | #3
On 09/03/2009 05:25 PM, Orit Wasserman wrote:
>>
>>> +   /*
>>> +    * Level 2 state : includes vmcs,registers and
>>> +    * a copy of vmcs12 for vmread/vmwrite
>>> +    */
>>> +   struct level_state *l2_state;
>>> +
>>> +   /* Level 1 state for switching to level 2 and back */
>>> +   struct level_state *l1_state;
>>>
>>>        
>> Can you explain why we need two of them?  in the guest vmcs we have host
>> and guest values, and in l1_state and l2_state we have more copies, and
>> in struct vcpu we have yet another set of copies.  We also have a couple
>> of copies in the host vmcs.  I'm getting dizzy...
>>      
> L2_state stores all the L2 guest state:
>        vmcs - A pointer to VMCS02, the VMCS used to run it by L0.
>        shadow vmcs - a structure storing the values of VMCS12 (the vmcs L1
> create to run L2).
>    

When we support multiple nested guests, we'll run into a problem of 
where to store shadow_vmcs.  I see these options:

- maintain a cache of limited size of shadow_vmcs; when evicting, copy 
the shadow_vmcs into the guest's vmptr
- always put shadow_vmcs in the guest's vmptr, and write protect it so 
the guest can't play with it
- always put shadow_vmcs in the guest's vmptr, and verify everything you 
read (that's what nsvm does)

>        cpu - the cpu id
>    

Why is it needed?

>        launched- launched flag
>    

Can be part of shadow_vmcs

>        vpid - the vpid allocate by L0 for L2 (we need to store it somewhere)
>    

Note the guest can DoS the host by allocating a lot of vpids.  So we need to 
allocate host vpids on demand and be able to flush them out.

>        msr_bitmap - At the moment we use L0 msr_bitmap(as we are running kvm
> on kvm) in the future we will use a merge of both bitmaps.
>    

Note kvm uses two bitmaps (for long mode and legacy mode).

> L1 state stores the L1 state -
>        vmcs - pointer to VMCS01
>    

So it's the same as vmx->vmcs in normal operation?

>        shadow vmcs - a structure storing the values of VMCS01. we use it
> when updating VMCS02 in order to avoid the need to switch between VMCS02
> and VMCS01.
>    

Sorry, don't understand.

>        cpu - the cpu id
>        launched- launched flag
>    

This is a copy of vmx->launched?

>>> +
>>> +   if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
>>> +      vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
>>> +      if (vmcs_page == NULL)
>>> +         return 1;
>>> +
>>> +      /* load nested vmcs to processor */
>>> +      if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
>>>
>>>        
>> So, you're loading a guest page as the vmcs.  This is dangerous as the
>> guest can play with it.  Much better to use inaccessible memory (and you
>> do alloc_vmcs() earlier?)
>>      
> We can copy the vmcs and than vmptrld it. As for the allocate vmcs this is
> a memory leak and I will fix it (it should be allocated only once).
>    

But why do it?  Your approach is to store the guest vmcs in the same 
format as the processor (which we don't really know), so you have to use 
vmread/vmwrite to maintain it.  Instead, you can choose that the guest 
vmcs is a shadow_vmcs structure and then you can access it using normal 
memory operations.

>> I see.  You're using the processor's format when reading the guest
>> vmcs.  But we don't have to do that, we can use the shadow_vmcs
>> structure (and a memcpy).
>>      
> I'm sorry I don't understand your comment can u elaborate ?
>    
>

See previous comment.  Basically you can do

   struct shadow_vmcs *svmcs = kmap_atomic(gpa_to_page(vmx->vmptr));
   printk("guest_cs = %x\n", svmcs->guest_cs_selector);

instead of

   vmptrld(gpa_to_hpa(vmx->vmptr))
   printk("guest_cs = %x\n", vmcs_read16(GUEST_CS_SELECTOR));
oritw@il.ibm.com Sept. 6, 2009, 1:36 p.m. UTC | #4
Avi Kivity <avi@redhat.com> wrote on 06/09/2009 12:25:17:

> On 09/03/2009 05:25 PM, Orit Wasserman wrote:
> >>
> >>> +   /*
> >>> +    * Level 2 state : includes vmcs,registers and
> >>> +    * a copy of vmcs12 for vmread/vmwrite
> >>> +    */
> >>> +   struct level_state *l2_state;
> >>> +
> >>> +   /* Level 1 state for switching to level 2 and back */
> >>> +   struct level_state *l1_state;
> >>>
> >>>
> >> Can you explain why we need two of them?  in the guest vmcs we have host
> >> and guest values, and in l1_state and l2_state we have more copies, and
> >> in struct vcpu we have yet another set of copies.  We also have a couple
> >> of copies in the host vmcs.  I'm getting dizzy...
> >>
> > L2_state stores all the L2 guest state:
> >        vmcs - A pointer to VMCS02, the VMCS used to run it by L0.
> >        shadow vmcs - a structure storing the values of VMCS12 (the vmcs L1
> > create to run L2).
> >
>
> When we support multiple nested guests, we'll run into a problem of
> where to store shadow_vmcs.  I see these options:
>
> - maintain a cache of limited size of shadow_vmcs; when evicting, copy
> the shadow_vmcs into the guest's vmptr]
> - always put shadow_vmcs in the guest's vmptr, and write protect it so
> the guest can't play with it
> - always put shadow_vmcs in the guest's vmptr, and verify everything you
> read (that's what nsvm does)
>
The second option looks a bit complicated; I prefer one of the other two.
> >        cpu - the cpu id
> >
>
> Why is it needed?
This is a copy of the cpu id from the vcpu, to record the last cpu the L2
guest ran on.
>
> >        launched- launched flag
> >
>
> Can be part of shadow_vmcs
I prefer to keep the shadow_vmcs as a separate structure to store only VMCS
fields.
>
> >        vpid - the vpid allocate by L0 for L2 (we need to store it
somewhere)
> >
>
> Note the guest can DoS the host by allocating a lot of vpids.  So we to
> allocate host vpids on demand and be able to flush them out.
The guest is not allocating the vpids; the host (L0) does, using
allocate_vpid.
I agree that with nested the danger of running out is bigger.
>
> >        msr_bitmap - At the moment we use L0 msr_bitmap(as we are
running kvm
> > on kvm) in the future we will use a merge of both bitmaps.
> >
>
> Note kvm uses two bitmaps (for long mode and legacy mode).
OK.
>
> > L1 state stores the L1 state -
> >        vmcs - pointer to VMCS01
> >
>
> So it's the same as vmx->vmcs in normal operation?
Yes, but with nested the vmx->vmcs is changed when running an L2 (nested)
guest.
>
> >        shadow vmcs - a structure storing the values of VMCS01. we use it
> > when updating VMCS02 in order to avoid the need to switch between VMCS02
> > and VMCS01.
> >
>
> Sorry, don't understand.
VMCS02 is the VMCS L0 uses to run L2.
When we create/update VMCS02 we need to read fields from VMCS01 (host state
is taken fully, plus control fields).
For L1 the shadow_vmcs is a copy of VMCS01 in structure format; we reuse
the same structure.
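To make this concrete, here is an illustrative (and much simplified) sketch of how VMCS02 fields could be filled in; l1_shadow/l2_shadow stand for the shadow_vmcs copies of VMCS01 and VMCS12 described above:

static void sketch_prepare_vmcs02(struct shadow_vmcs *l1_shadow,
				  struct shadow_vmcs *l2_shadow)
{
	/* host state comes fully from what L0 programmed into VMCS01 */
	vmcs_writel(HOST_CR3, l1_shadow->host_cr3);
	vmcs_writel(HOST_RIP, l1_shadow->host_rip);

	/* guest state comes from VMCS12, i.e. what L1 wants L2 to see */
	vmcs_writel(GUEST_RIP, l2_shadow->guest_rip);
	vmcs_writel(GUEST_RSP, l2_shadow->guest_rsp);

	/* control fields are a combination of L0 and L1 requirements */
	vmcs_write32(EXCEPTION_BITMAP,
		     l1_shadow->exception_bitmap | l2_shadow->exception_bitmap);
}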
>
> >        cpu - the cpu id
> >        launched- launched flag
> >
>
> This is a copy of vmx->launched?
Exactly. The vmx->launched is updated when switching between L1/L2 and back,
so we need to store it here.
>
> >>> +
> >>> +   if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
> >>> +      vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> >>> +      if (vmcs_page == NULL)
> >>> +         return 1;
> >>> +
> >>> +      /* load nested vmcs to processor */
> >>> +      if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
> >>>
> >>>
> >> So, you're loading a guest page as the vmcs.  This is dangerous as the
> >> guest can play with it.  Much better to use inaccessible memory (and you
> >> do alloc_vmcs() earlier?)
> >>
> > We can copy the vmcs and then vmptrld it. As for the allocated vmcs, this is
> > a memory leak and I will fix it (it should be allocated only once).
> >
>
> But why do it?  Your approach is to store the guest vmcs in the same
> format as the processor (which we don't really know), so you have to use
> vmread/vmwrite to maintain it.  Instead, you can choose that the guest
> vmcs is a shadow_vmcs structure and then you can access it using normal
> memory operations.
I got it now.
We will need a way to distinguish between a processor-format VMCS and a
structure-based VMCS; we can use the revision id field (create a unique
revision id for nested, like 0xffff or 0x0).
>
> >> I see.  You're using the processor's format when reading the guest
> >> vmcs.  But we don't have to do that, we can use the shadow_vmcs
> >> structure (and a memcpy).
> >>
> > I'm sorry I don't understand your comment can u elaborate ?
> >
> >
>
> See previous comment.  Basically you can do
>
>    struct shadow_vmcs *svmcs = kmap_atomic(gpa_to_page(vmx->vmptr));
>    printk("guest_cs = %x\n", svmcs->guest_cs_selector);
See above.
>
> instead of
>
>    vmptrld(gpa_to_hpa(vmx->vmptr))
>    printk("guest_cs = %x\n", vmcs_read16(GUEST_CS_SELECTOR));
Avi Kivity Sept. 6, 2009, 1:52 p.m. UTC | #5
On 09/06/2009 04:36 PM, Orit Wasserman wrote:
>
>> When we support multiple nested guests, we'll run into a problem of
>> where to store shadow_vmcs.  I see these options:
>>
>> - maintain a cache of limited size of shadow_vmcs; when evicting, copy
>> the shadow_vmcs into the guest's vmptr]
>> - always put shadow_vmcs in the guest's vmptr, and write protect it so
>> the guest can't play with it
>> - always put shadow_vmcs in the guest's vmptr, and verify everything you
>> read (that's what nsvm does)
>>
>>      
> The second option looks a bit complicated I prefer one of the other two.
>    

I agree, the third option looks easiest, but I'm not sure how much 
verification is needed.

Note other things like the msr bitmaps may need write protection, 
otherwise you have to re-merge the bitmap on every guest entry, which 
can be very slow.  So we may be forced to add write protection anyway.

>>>         launched- launched flag
>>>
>>>        
>> Can be part of shadow_vmcs
>>      
> I prefer to keep the shadow_vmcs as a separate structure to store only VMCS
> fields.
>    

It is a vmcs field - it is manipulated by vmx instructions which operate 
on the vmcs.  You'll need to store it in guest memory when you support 
multiple nested guests.

You can put the vmcs fields in a sub-structure if you want to separate 
between explicit fields and implicit fields (I can only see one implicit 
field (launched), but maybe there are more).
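A hypothetical split along those lines (the names here are made up, not from the patch):

struct nested_vmcs_page {
	struct shadow_vmcs fields;	/* explicit state, visible to vmread/vmwrite */
	u32 launched;			/* implicit state kept alongside it */
};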

>>      
>>>         vpid - the vpid allocate by L0 for L2 (we need to store it
>>>        
> somewhere)
>    
>>>        
>> Note the guest can DoS the host by allocating a lot of vpids.  So we to
>> allocate host vpids on demand and be able to flush them out.
>>      
> The guest is not allocating the vpids the host(L0) does using
> allocate_vpid.
>    

I meant, the guest can force the host to allocate vpids if we don't 
protect against it.

> I agree that with nested the danger for them to run out is bigger.
>    


>> Sorry, don't understand.
>>      
> VMCS02 - the VMCS L0 uses to run L2.
> When we create/update VMCS02 we need to read fields from VMCS01 (host state
> is taken fully, control fields ).
> For L1 the shadow_vmcs is a copy of VMCS01 in a structure format, we used
> the same structure.
>    

I don't understand why you need it.  Host state shouldn't change.  Only 
the control fields are interesting, and things like exception_bitmap.

>> But why do it?  Your approach is to store the guest vmcs in the same
>> format as the processor (which we don't really know), so you have to use
>> vmread/vmwrite to maintain it.  Instead, you can choose that the guest
>> vmcs is a shadow_vmcs structure and then you can access it using normal
>> memory operations.
>>      
> I got it now.
> We will need a way to distinguish between processor format VMCS and
> structure based VMCS,
> we can use the revision id field (create a unique revision id for nested
> like 0xffff or 0x0).
>    

No, you can always store guest vmcs in software format, since we'll 
never load it with vmptrld.  We'll only load a real vmcs with vmptrld.

Note it also solves live migration, since now all guest vmcss are copied 
as part of normal guest memory (including their launched state).
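Under that scheme handle_vmptrld could shrink to something like this (untested sketch; no vmptrld is ever issued on guest memory, only on the vmcs01/vmcs02 pages L0 allocates itself):

static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 guest_vmcs_addr;

	if (!nested_vmx_check_permission(vcpu))
		return 1;
	if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
		return 1;

	/* remember where L1 keeps its (software-format) vmcs12 */
	vmx->nested.l1_cur_vmcs = guest_vmcs_addr;

	clear_rflags_cf_zf(vcpu);
	skip_emulated_instruction(vcpu);
	return 1;
}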
oritw@il.ibm.com Sept. 6, 2009, 4:55 p.m. UTC | #6
Avi Kivity <avi@redhat.com> wrote on 06/09/2009 16:52:56:

> On 09/06/2009 04:36 PM, Orit Wasserman wrote:
> >
> >> When we support multiple nested guests, we'll run into a problem of
> >> where to store shadow_vmcs.  I see these options:
> >>
> >> - maintain a cache of limited size of shadow_vmcs; when evicting, copy
> >> the shadow_vmcs into the guest's vmptr]
> >> - always put shadow_vmcs in the guest's vmptr, and write protect it so
> >> the guest can't play with it
> >> - always put shadow_vmcs in the guest's vmptr, and verify everything you
> >> read (that's what nsvm does)
> >>
> >>
> > The second option looks a bit complicated; I prefer one of the other two.
> >
>
> I agree, the third option looks easiest but not sure how much
> verification is needed.
>
> Note other things like the msr bitmaps may need write protection,
> otherwise you have to re-merge the bitmap on every guest entry, which
> can be very slow.  So we may be forced to add write protection anyway.
We will also need to write protect L1's EPT tables, to allow L1 to swap
out its guests.
>
> >>>         launched- launched flag
> >>>
> >>>
> >> Can be part of shadow_vmcs
> >>
> > I prefer to keep the shadow_vmcs as a separate structure to store only VMCS
> > fields.
> >
>
> It is a vmcs field - it is manipulated by vmx instructions which operate
> on the vmcs.  You'll need to store it in guest memory when you support
> multiple nested guests.
>
> You can put the vmcs fields in a sub-structure if you want to separate
> between explicit fields and implicit fields (I can only see one implicit
> field (launched), but maybe there are more).
OK.
>
> >>
> >>>         vpid - the vpid allocate by L0 for L2 (we need to store it
> >>>
> > somewhere)
> >
> >>>
> >> Note the guest can DoS the host by allocating a lot of vpids.  So we need
> >> to allocate host vpids on demand and be able to flush them out.
> >>
> > The guest is not allocating the vpids the host(L0) does using
> > allocate_vpid.
> >
>
> I meant, the guest can force the host to allocate vpids if we don't
> protect against it.
You mean by launching a lot of guests?
We can limit the number of guests as a very quick solution.
More complicated is limiting the number of vpids per L1 hypervisor and
reusing them.
This means we will sometimes need to invalidate the vpid when switching
between L2 guests.
>
> > I agree that with nested the danger for them to run out is bigger.
> >
>
>
> >> Sorry, don't understand.
> >>
> > VMCS02 - the VMCS L0 uses to run L2.
> > When we create/update VMCS02 we need to read fields from VMCS01 (host state
> > is taken fully, control fields ).
> > For L1 the shadow_vmcs is a copy of VMCS01 in a structure format, we used
> > the same structure.
> >
>
> I don't understand why you need it.  Host state shouldn't change.  Only
> the control fields are interesting, and things like exception_bitmap.
I think that when KVM switches to Qemu the host state can change (L0 host
state). If this happens between different runs of L2
we will need to update the VMCS02 host state. Of course we can optimize and
update it only then.
>
> >> But why do it?  Your approach is to store the guest vmcs in the same
> >> format as the processor (which we don't really know), so you have to use
> >> vmread/vmwrite to maintain it.  Instead, you can choose that the guest
> >> vmcs is a shadow_vmcs structure and then you can access it using normal
> >> memory operations.
> >>
> > I got it now.
> > We will need a way to distinguish between processor format VMCS and
> > structure based VMCS,
> > we can use the revision id field (create a unique revision id for nested
> > like 0xffff or 0x0).
> >
>
> No, you can always store guest vmcs in software format, since we'll
> never load it with vmptrld.  We'll only load a real vmcs with vmptrld.
You are right, a new VMCS will be zeroed.
>
> Note it also solves live migration, since now all guest vmcss are copied
> as part of normal guest memory (including their launched state).
Great.
Avi Kivity Sept. 6, 2009, 7:10 p.m. UTC | #7
On 09/06/2009 07:55 PM, Orit Wasserman wrote:
>> Note other things like the msr bitmaps may need write protection,
>> otherwise you have to re-merge the bitmap on every guest entry, which
>> can be very slow.  So we may be forced to add write protection anyway.
>>      
> We will also need to write protected L1's EPT tables , to allow L1 to swap
> out his guests.
>    

That comes naturally with the shadow mmu.  In the same way normal shadow 
mmu protects guest page tables, nested EPT shadow should protect the 
guest's EPT pages.

(unfortunately there is no INVEPT instruction that accepts a gpa 
operand; this would make write protection unnecessary).

>> I meant, the guest can force the host to allocate vpids if we don't
>> protect against it.
>>      
> You meant by launching a lot of guests ?
>    

Yes.

> We can limit the number of guests as a very quick solution.
>    

How?  There is no way to tell the guest not to launch more guests.

> More complicated is limiting the number of vpids per L1 hypervisor and
> reusing them.
>    

When the bitmap is full, clear it.  Use a generation count to tell vcpus 
to reload.  svm does that (svm only has 63 asids).
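A sketch of such a scheme (names are hypothetical and locking is omitted; VMX_NR_VPIDS and the bitmap helpers are assumed to match what vmx.c already uses for its own vpid allocation):

static DECLARE_BITMAP(nested_vpid_bitmap, VMX_NR_VPIDS);
static u64 nested_vpid_generation;

static u16 nested_alloc_vpid(struct vcpu_vmx *vmx)
{
	/* vpid 0 is reserved for the host, so search from 1 */
	int vpid = find_next_zero_bit(nested_vpid_bitmap, VMX_NR_VPIDS, 1);

	if (vpid >= VMX_NR_VPIDS) {
		/* bitmap full: recycle everything and bump the generation */
		bitmap_zero(nested_vpid_bitmap, VMX_NR_VPIDS);
		nested_vpid_generation++;
		vpid = find_next_zero_bit(nested_vpid_bitmap, VMX_NR_VPIDS, 1);
	}
	__set_bit(vpid, nested_vpid_bitmap);

	/*
	 * Each vcpu compares its cached generation on entry; if it is stale,
	 * it reallocates its vpid and flushes the old mappings.
	 */
	vmx->nested.l2_state->vpid = vpid;
	return vpid;
}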

> This means we will sometime need to invalidate the vpid when switching
> between L2 guests.
>    

Yes.

>> I don't understand why you need it.  Host state shouldn't change.  Only
>> the control fields are interesting, and things like exception_bitmap.
>>      
> I think that when KVM switches to Qemu the host state can change (L0 host
> state). If this happens between different runs of L2
> we will need to update VMCS02 host state. Of course we can optimize and
> update it only than.
>    

No, I don't think any host state changes, except for cr0.ts.

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2b1fc3b..5ab07a0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,6 +61,151 @@  module_param_named(unrestricted_guest,
 static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
+struct __attribute__ ((__packed__)) shadow_vmcs {
+	uint16_t virtual_processor_id;
+	uint16_t guest_es_selector;
+	uint16_t guest_cs_selector;
+	uint16_t guest_ss_selector;
+	uint16_t guest_ds_selector;
+	uint16_t guest_fs_selector;
+	uint16_t guest_gs_selector;
+	uint16_t guest_ldtr_selector;
+	uint16_t guest_tr_selector;
+	uint16_t host_es_selector;
+	uint16_t host_cs_selector;
+	uint16_t host_ss_selector;
+	uint16_t host_ds_selector;
+	uint16_t host_fs_selector;
+	uint16_t host_gs_selector;
+	uint16_t host_tr_selector;
+	uint64_t io_bitmap_a;
+	uint64_t io_bitmap_b;
+	uint64_t msr_bitmap;
+	uint64_t vm_exit_msr_store_addr;
+	uint64_t vm_exit_msr_load_addr;
+	uint64_t vm_entry_msr_load_addr;
+	uint64_t tsc_offset;
+	uint64_t virtual_apic_page_addr;
+	uint64_t apic_access_addr;
+	uint64_t ept_pointer;
+	uint64_t guest_physical_address;
+	uint64_t vmcs_link_pointer;
+	uint64_t guest_ia32_debugctl;
+	uint64_t guest_ia32_pat;
+	uint64_t guest_pdptr0;
+	uint64_t guest_pdptr1;
+	uint64_t guest_pdptr2;
+	uint64_t guest_pdptr3;
+	uint64_t host_ia32_pat;
+	uint32_t pin_based_vm_exec_control;
+	uint32_t cpu_based_vm_exec_control;
+	uint32_t exception_bitmap;
+	uint32_t page_fault_error_code_mask;
+	uint32_t page_fault_error_code_match;
+	uint32_t cr3_target_count;
+	uint32_t vm_exit_controls;
+	uint32_t vm_exit_msr_store_count;
+	uint32_t vm_exit_msr_load_count;
+	uint32_t vm_entry_controls;
+	uint32_t vm_entry_msr_load_count;
+	uint32_t vm_entry_intr_info_field;
+	uint32_t vm_entry_exception_error_code;
+	uint32_t vm_entry_instruction_len;
+	uint32_t tpr_threshold;
+	uint32_t secondary_vm_exec_control;
+	uint32_t vm_instruction_error;
+	uint32_t vm_exit_reason;
+	uint32_t vm_exit_intr_info;
+	uint32_t vm_exit_intr_error_code;
+	uint32_t idt_vectoring_info_field;
+	uint32_t idt_vectoring_error_code;
+	uint32_t vm_exit_instruction_len;
+	uint32_t vmx_instruction_info;
+	uint32_t guest_es_limit;
+	uint32_t guest_cs_limit;
+	uint32_t guest_ss_limit;
+	uint32_t guest_ds_limit;
+	uint32_t guest_fs_limit;
+	uint32_t guest_gs_limit;
+	uint32_t guest_ldtr_limit;
+	uint32_t guest_tr_limit;
+	uint32_t guest_gdtr_limit;
+	uint32_t guest_idtr_limit;
+	uint32_t guest_es_ar_bytes;
+	uint32_t guest_cs_ar_bytes;
+	uint32_t guest_ss_ar_bytes;
+	uint32_t guest_ds_ar_bytes;
+	uint32_t guest_fs_ar_bytes;
+	uint32_t guest_gs_ar_bytes;
+	uint32_t guest_ldtr_ar_bytes;
+	uint32_t guest_tr_ar_bytes;
+	uint32_t guest_interruptibility_info;
+	uint32_t guest_activity_state;
+	uint32_t guest_sysenter_cs;
+	uint32_t host_ia32_sysenter_cs;
+	unsigned long cr0_guest_host_mask;
+	unsigned long cr4_guest_host_mask;
+	unsigned long cr0_read_shadow;
+	unsigned long cr4_read_shadow;
+	unsigned long cr3_target_value0;
+	unsigned long cr3_target_value1;
+	unsigned long cr3_target_value2;
+	unsigned long cr3_target_value3;
+	unsigned long exit_qualification;
+	unsigned long guest_linear_address;
+	unsigned long guest_cr0;
+	unsigned long guest_cr3;
+	unsigned long guest_cr4;
+	unsigned long guest_es_base;
+	unsigned long guest_cs_base;
+	unsigned long guest_ss_base;
+	unsigned long guest_ds_base;
+	unsigned long guest_fs_base;
+	unsigned long guest_gs_base;
+	unsigned long guest_ldtr_base;
+	unsigned long guest_tr_base;
+	unsigned long guest_gdtr_base;
+	unsigned long guest_idtr_base;
+	unsigned long guest_dr7;
+	unsigned long guest_rsp;
+	unsigned long guest_rip;
+	unsigned long guest_rflags;
+	unsigned long guest_pending_dbg_exceptions;
+	unsigned long guest_sysenter_esp;
+	unsigned long guest_sysenter_eip;
+	unsigned long host_cr0;
+	unsigned long host_cr3;
+	unsigned long host_cr4;
+	unsigned long host_fs_base;
+	unsigned long host_gs_base;
+	unsigned long host_tr_base;
+	unsigned long host_gdtr_base;
+	unsigned long host_idtr_base;
+	unsigned long host_ia32_sysenter_esp;
+	unsigned long host_ia32_sysenter_eip;
+	unsigned long host_rsp;
+	unsigned long host_rip;
+};
+
+struct __attribute__ ((__packed__)) level_state {
+	struct shadow_vmcs *shadow_vmcs;
+
+	u16 vpid;
+	u64 shadow_efer;
+	unsigned long cr2;
+	unsigned long cr3;
+	unsigned long cr4;
+	unsigned long cr8;
+
+	u64 io_bitmap_a;
+	u64 io_bitmap_b;
+	u64 msr_bitmap;
+
+	struct vmcs *vmcs;
+	int cpu;
+	int launched;
+};
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -72,6 +217,17 @@  struct nested_vmx {
 	bool vmon;
 	/* Has the level1 guest done vmclear? */
 	bool vmclear;
+	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
+	u64 l1_cur_vmcs;
+	/*
+	 * Level 2 state : includes vmcs,registers and
+	 * a copy of vmcs12 for vmread/vmwrite
+	 */
+	struct level_state *l2_state;
+
+	/* Level 1 state for switching to level 2 and back */
+	struct level_state *l1_state;
+
 };
 
 struct vcpu_vmx {
@@ -131,6 +287,25 @@  static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static struct page *nested_get_page(struct kvm_vcpu *vcpu,
+				    u64 vmcs_addr)
+{
+	struct page *vmcs_page = NULL;
+
+	down_read(&current->mm->mmap_sem);
+	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	if (is_error_page(vmcs_page)) {
+		printk(KERN_ERR "%s error allocating page \n", __func__);
+		kvm_release_page_clean(vmcs_page);
+		return NULL;
+	}
+
+	return vmcs_page;
+
+}
+
 static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 
@@ -188,6 +363,10 @@  static struct kvm_vmx_segment_field {
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
+static int create_l1_state(struct kvm_vcpu *vcpu);
+static int create_l2_state(struct kvm_vcpu *vcpu);
+static int shadow_vmcs_load(struct kvm_vcpu *vcpu);
+
 /*
  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
  * away by decrementing the array size.
@@ -704,6 +883,24 @@  static void vmx_load_host_state(struct vcpu_vmx *vmx)
 	preempt_enable();
 }
 
+
+static int vmptrld(struct kvm_vcpu *vcpu,
+		   u64 phys_addr)
+{
+	u8 error;
+
+	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+		      : "cc");
+	if (error) {
+		printk(KERN_ERR "kvm: %s vmptrld %llx failed\n",
+		       __func__, phys_addr);
+		return 1;
+	}
+
+	return 0;
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -725,15 +922,8 @@  static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-		u8 error;
-
 		per_cpu(current_vmcs, cpu) = vmx->vmcs;
-		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
-			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
-			      : "cc");
-		if (error)
-			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
-			       vmx->vmcs, phys_addr);
+		vmptrld(vcpu, phys_addr);
 	}
 
 	if (vcpu->cpu != cpu) {
@@ -3252,6 +3442,115 @@  static int handle_vmclear(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
+{
+	gpa_t gpa;
+	struct page *page;
+	int r = 0;
+
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
+
+	/* checking guest gpa */
+	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (is_error_page(page)) {
+		printk(KERN_ERR "%s Invalid guest vmcs addr %llx\n",
+		       __func__, gpa);
+		r = 1;
+		goto out;
+	}
+
+	r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
+	if (r) {
+		printk(KERN_ERR "%s cannot read guest vmcs addr %llx : %d\n",
+		       __func__, gpa, r);
+		goto out;
+	}
+
+	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
+		printk(KERN_DEBUG "%s addr %llx not aligned\n",
+		       __func__, *gentry);
+		return 1;
+	}
+
+out:
+	kvm_release_page_clean(page);
+	return r;
+}
+
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct page *vmcs_page;
+	u64 guest_vmcs_addr;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
+		return 1;
+
+	if (create_l1_state(vcpu)) {
+		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
+		return 1;
+	}
+
+	if (create_l2_state(vcpu)) {
+		printk(KERN_ERR "%s create_l2_state failed\n", __func__);
+		return 1;
+	}
+
+	vmx->nested.l2_state->vmcs = alloc_vmcs();
+	if (!vmx->nested.l2_state->vmcs) {
+		printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
+		return 1;
+	}
+
+	if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
+		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
+		if (vmcs_page == NULL)
+			return 1;
+
+		/* load nested vmcs to processor */
+		if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
+			printk(KERN_INFO "%s error in vmptrld \n",
+			       __func__);
+			kvm_release_page_clean(vmcs_page);
+			return 1;
+		}
+
+		/* save nested vmcs in the shadow vmcs */
+		if (shadow_vmcs_load(vcpu)) {
+			kvm_release_page_clean(vmcs_page);
+			return 1;
+		}
+
+		vmx->nested.l1_cur_vmcs = guest_vmcs_addr;
+
+		/* load to previous vmcs */
+		if (vmptrld(vcpu, __pa(to_vmx(vcpu)->vmcs))) {
+			kvm_release_page_clean(vmcs_page);
+				return 1;
+		}
+
+		kvm_release_page_clean(vmcs_page);
+	}
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vcpu->arch.regs[VCPU_REGS_RAX] = to_vmx(vcpu)->nested.l1_cur_vmcs;
+
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
 	kvm_queue_exception(vcpu, UD_VECTOR);
@@ -3576,8 +3875,8 @@  static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
-	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
-	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
+	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
+	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
@@ -4115,6 +4414,220 @@  static bool vmx_gb_page_enable(void)
 	return false;
 }
 
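+/*
+ * Copy every field of the currently loaded hardware vmcs into the
+ * software shadow vmcs via vmread.  Optional fields (MSR bitmap, EPT,
+ * VPID, PAT) are read only when the corresponding feature is enabled.
+ */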
+void save_vmcs(struct shadow_vmcs *dst)
+{
+	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
+	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
+	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
+	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
+	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
+	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
+	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
+	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+	if (cpu_has_vmx_msr_bitmap())
+		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
+
+	dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
+	dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
+	dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
+	dst->tsc_offset = vmcs_read64(TSC_OFFSET);
+	dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
+	dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
+	if (enable_ept)
+		dst->ept_pointer = vmcs_read64(EPT_POINTER);
+
+	dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	if (enable_ept) {
+		dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+		dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+		dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+		dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+	}
+	dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+	dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
+	dst->page_fault_error_code_mask =
+		vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
+	dst->page_fault_error_code_match =
+		vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
+	dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
+	dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
+	dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
+	dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
+	dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
+	dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	dst->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
+	dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	if (enable_vpid && dst->secondary_vm_exec_control &
+	    SECONDARY_EXEC_ENABLE_VPID)
+		dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
+	dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+	dst->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
+	dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	dst->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+	dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+	dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
+	dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
+	dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
+	dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+	dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
+	dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
+	dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
+	dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
+	dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+	dst->guest_cr0 = vmcs_readl(GUEST_CR0);
+	dst->guest_cr3 = vmcs_readl(GUEST_CR3);
+	dst->guest_cr4 = vmcs_readl(GUEST_CR4);
+	dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	dst->guest_dr7 = vmcs_readl(GUEST_DR7);
+	dst->guest_rsp = vmcs_readl(GUEST_RSP);
+	dst->guest_rip = vmcs_readl(GUEST_RIP);
+	dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	dst->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+	dst->host_cr0 = vmcs_readl(HOST_CR0);
+	dst->host_cr3 = vmcs_readl(HOST_CR3);
+	dst->host_cr4 = vmcs_readl(HOST_CR4);
+	dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
+	dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
+	dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
+	dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
+	dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
+	dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
+	dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
+	dst->host_rsp = vmcs_readl(HOST_RSP);
+	dst->host_rip = vmcs_readl(HOST_RIP);
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
+}
+
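+/*
+ * Allocate the software shadow vmcs on first use and save the currently
+ * loaded vmcs into it.
+ */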
+static int shadow_vmcs_load(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		vmx->nested.l2_state->shadow_vmcs =
+			kzalloc(PAGE_SIZE, GFP_KERNEL);
+		if (!vmx->nested.l2_state->shadow_vmcs) {
+			printk(KERN_INFO "%s error allocating shadow vmcs\n",
+			       __func__);
+			return -ENOMEM;
+		}
+	}
+
+	save_vmcs(vmx->nested.l2_state->shadow_vmcs);
+
+	return 0;
+}
+
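+/*
+ * Allocate a level_state together with its page-sized shadow vmcs.
+ * Returns NULL if either allocation fails.
+ */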
+struct level_state *create_state(void)
+{
+	struct level_state *state = NULL;
+
+	state = kzalloc(sizeof(struct level_state), GFP_KERNEL);
+	if (!state) {
+		printk(KERN_INFO "Error creating level state\n");
+		return NULL;
+	}
+	state->shadow_vmcs = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!state->shadow_vmcs) {
+		printk(KERN_INFO "%s error creating shadow vmcs\n",
+		       __func__);
+		kfree(state);
+		return NULL;
+	}
+	return state;
+}
+
+int create_l1_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l1_state) {
+		vmx->nested.l1_state = create_state();
+		if (!vmx->nested.l1_state)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+int create_l2_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l2_state) {
+		vmx->nested.l2_state = create_state();
+		if (!vmx->nested.l2_state)
+			return -ENOMEM;
+	}
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx->nested.l2_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
+	else
+		vmx->nested.l2_state->msr_bitmap = 0;
+
+	vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+
+	return 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,