
[3/6] Nested VMX patch 3 implements vmptrld and vmptrst

Message ID 1251905916-2834-4-git-send-email-oritw@il.ibm.com (mailing list archive)
State New, archived

Commit Message

oritw@il.ibm.com Sept. 2, 2009, 3:38 p.m. UTC
From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |  533 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 523 insertions(+), 10 deletions(-)

Comments

Avi Kivity Sept. 2, 2009, 8:05 p.m. UTC | #1
On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> +struct __attribute__ ((__packed__)) level_state {
> +	struct shadow_vmcs *shadow_vmcs;
> +
> +	u16 vpid;
> +	u64 shadow_efer;
> +	unsigned long cr2;
> +	unsigned long cr3;
> +	unsigned long cr4;
> +	unsigned long cr8;
> +
> +	u64 io_bitmap_a;
> +	u64 io_bitmap_b;
> +	u64 msr_bitmap;
> +
> +	struct vmcs *vmcs;
> +	int cpu;
> +	int launched;
> +};
>    



> +
>   struct vmcs {
>   	u32 revision_id;
>   	u32 abort;
> @@ -72,6 +217,17 @@ struct nested_vmx {
>   	bool vmon;
>   	/* Has the level1 guest done vmclear? */
>   	bool vmclear;
> +	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
> +	u64 l1_cur_vmcs;
>    

This is the vmptr (exactly as loaded by vmptrld), right?  If so, please 
call it vmptr.

> +	/*
> +	 * Level 2 state : includes vmcs,registers and
> +	 * a copy of vmcs12 for vmread/vmwrite
> +	 */
> +	struct level_state *l2_state;
> +
> +	/* Level 1 state for switching to level 2 and back */
> +	struct level_state *l1_state;
>    

Can you explain why we need two of them?  in the guest vmcs we have host 
and guest values, and in l1_state and l2_state we have more copies, and 
in struct vcpu we have yet another set of copies.  We also have a couple 
of copies in the host vmcs.  I'm getting dizzy...


>   static int init_rmode(struct kvm *kvm);
>   static u64 construct_eptp(unsigned long root_hpa);
>
>
>
> +int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
> +{
> +	gpa_t gpa;
> +	struct page *page;
> +	int r = 0;
> +
> +	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
> +
> +	/* checking guest gpa */
> +	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
> +	if (is_error_page(page)) {
> +		printk(KERN_ERR "%s Invalid guest vmcs addr %llx\n",
> +		       __func__, gpa);
> +		r = 1;
> +		goto out;
> +	}
> +
> +	r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
> +	if (r) {
> +		printk(KERN_ERR "%s cannot read guest vmcs addr %llx : %d\n",
> +		       __func__, gpa, r);
> +		goto out;
> +	}
>    

You can use kvm_read_guest_virt() to simplify this.
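Something like this would do (untested sketch; it assumes the kvm_read_guest_virt(gva, data, bytes, vcpu) helper from x86.c, so treat the exact signature as an assumption rather than a quote of the final code):

static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
{
	int r;

	/* read the 64-bit vmcs pointer operand straight through the mmu */
	r = kvm_read_guest_virt(vcpu->arch.regs[VCPU_REGS_RAX], gentry,
				sizeof(u64), vcpu);
	if (r) {
		printk(KERN_ERR "%s cannot read guest vmcs addr: %d\n",
		       __func__, r);
		return 1;
	}

	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
		printk(KERN_DEBUG "%s addr %llx not aligned\n",
		       __func__, *gentry);
		return 1;
	}

	return 0;
}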

> +
> +	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> +		printk(KERN_DEBUG "%s addr %llx not aligned\n",
> +		       __func__, *gentry);
> +		return 1;
> +	}
> +
> +out:
> +	kvm_release_page_clean(page);
> +	return r;
> +}
> +
> +static int handle_vmptrld(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct page *vmcs_page;
> +	u64 guest_vmcs_addr;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
> +		return 1;
> +
> +	if (create_l1_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> +		return 1;
> +	}
> +
> +	if (create_l2_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l2_state failed\n", __func__);
> +		return 1;
> +	}
> +
> +	vmx->nested.l2_state->vmcs = alloc_vmcs();
> +	if (!vmx->nested.l2_state->vmcs) {
> +		printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
> +		return 1;
> +	}
> +
> +	if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
> +		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> +		if (vmcs_page == NULL)
> +			return 1;
> +
> +		/* load nested vmcs to processor */
> +		if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
>    

So, you're loading a guest page as the vmcs.  This is dangerous as the 
guest can play with it.  Much better to use inaccessible memory (and you 
do alloc_vmcs() earlier?)

> +
> +static int handle_vmptrst(struct kvm_vcpu *vcpu)
> +{
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	vcpu->arch.regs[VCPU_REGS_RAX] = to_vmx(vcpu)->nested.l1_cur_vmcs;
>    

Should store to mem64 according to the docs?

Better done through the emulator.
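For illustration, a minimal sketch that honours the mem64 operand without the emulator (untested; it keeps the patch's own assumption that RAX holds the operand's gva and reuses gva_to_gpa + kvm_write_guest, whereas going through the emulator would also decode the addressing mode exactly):

static int handle_vmptrst(struct kvm_vcpu *vcpu)
{
	u64 vmptr = to_vmx(vcpu)->nested.l1_cur_vmcs;
	gpa_t gpa;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/* resolve the mem64 destination operand and store the vmptr there */
	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
	if (kvm_write_guest(vcpu->kvm, gpa, &vmptr, sizeof(vmptr)))
		return 1;

	clear_rflags_cf_zf(vcpu);
	skip_emulated_instruction(vcpu);
	return 1;
}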

> +void save_vmcs(struct shadow_vmcs *dst)
> +{
> +	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> +	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> +	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> +	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> +	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> +	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> +	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> +	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> +	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> +	if (cpu_has_vmx_msr_bitmap())
> +		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
> +
> +	dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
> +	dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
> +	dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
> +	dst->tsc_offset = vmcs_read64(TSC_OFFSET);
> +	dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
> +	dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
> +	if (enable_ept)
> +		dst->ept_pointer = vmcs_read64(EPT_POINTER);
> +
> +	dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> +	dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> +	dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> +	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> +		dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> +	if (enable_ept) {
> +		dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
> +		dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
> +		dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
> +		dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
> +	}
> +	dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
> +	dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> +	dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
> +	dst->page_fault_error_code_mask =
> +		vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
> +	dst->page_fault_error_code_match =
> +		vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
> +	dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> +	dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
> +	dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
> +	dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
> +	dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
> +	dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
> +	dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> +	dst->vm_entry_exception_error_code =
> +		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> +	dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> +	dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
> +	dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> +	if (enable_vpid && dst->secondary_vm_exec_control &
> +	    SECONDARY_EXEC_ENABLE_VPID)
> +		dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
> +	dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
> +	dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> +	dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> +	dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +	dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
> +	dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> +	dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> +	dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> +	dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> +	dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> +	dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> +	dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> +	dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> +	dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> +	dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> +	dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> +	dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> +	dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> +	dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> +	dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> +	dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> +	dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> +	dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> +	dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> +	dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> +	dst->guest_interruptibility_info =
> +		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +	dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
> +	dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> +	dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
> +	dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
> +	dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
> +	dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> +	dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> +	dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
> +	dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
> +	dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
> +	dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
> +	dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> +	dst->guest_cr0 = vmcs_readl(GUEST_CR0);
> +	dst->guest_cr3 = vmcs_readl(GUEST_CR3);
> +	dst->guest_cr4 = vmcs_readl(GUEST_CR4);
> +	dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> +	dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> +	dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> +	dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> +	dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> +	dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> +	dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> +	dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> +	dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> +	dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> +	dst->guest_dr7 = vmcs_readl(GUEST_DR7);
> +	dst->guest_rsp = vmcs_readl(GUEST_RSP);
> +	dst->guest_rip = vmcs_readl(GUEST_RIP);
> +	dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> +	dst->guest_pending_dbg_exceptions =
> +		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> +	dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> +	dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> +	dst->host_cr0 = vmcs_readl(HOST_CR0);
> +	dst->host_cr3 = vmcs_readl(HOST_CR3);
> +	dst->host_cr4 = vmcs_readl(HOST_CR4);
> +	dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
> +	dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
> +	dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
> +	dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> +	dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> +	dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
> +	dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
> +	dst->host_rsp = vmcs_readl(HOST_RSP);
> +	dst->host_rip = vmcs_readl(HOST_RIP);
> +	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
> +		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
> +}
>    

I see.  You're using the processor's format when reading the guest 
vmcs.  But we don't have to do that, we can use the shadow_vmcs 
structure (and a memcpy).
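In other words, if L1's vmcs12 page is defined to hold a struct shadow_vmcs, the whole thing can become a bulk guest-memory copy instead of ~150 vmreads. A rough sketch of the idea, assuming the shadow_vmcs layout from this patch is what L1 keeps at its vmptr:

static int nested_copy_vmcs12(struct kvm_vcpu *vcpu, u64 vmptr,
			      struct shadow_vmcs *dst)
{
	/* one bulk copy from guest memory instead of vmptrld + vmreads */
	return kvm_read_guest(vcpu->kvm, vmptr, dst, sizeof(*dst));
}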
oritw@il.ibm.com Sept. 3, 2009, 2:25 p.m. UTC | #2
Avi Kivity <avi@redhat.com> wrote on 02/09/2009 23:05:09:

> On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> > +struct __attribute__ ((__packed__)) level_state {
> > +   struct shadow_vmcs *shadow_vmcs;
> > +
> > +   u16 vpid;
> > +   u64 shadow_efer;
> > +   unsigned long cr2;
> > +   unsigned long cr3;
> > +   unsigned long cr4;
> > +   unsigned long cr8;
> > +
> > +   u64 io_bitmap_a;
> > +   u64 io_bitmap_b;
> > +   u64 msr_bitmap;
> > +
> > +   struct vmcs *vmcs;
> > +   int cpu;
> > +   int launched;
> > +};
> >
>
>
>
> > +
> >   struct vmcs {
> >      u32 revision_id;
> >      u32 abort;
> > @@ -72,6 +217,17 @@ struct nested_vmx {
> >      bool vmon;
> >      /* Has the level1 guest done vmclear? */
> >      bool vmclear;
> > +   /* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
> > +   u64 l1_cur_vmcs;
> >
>
> This is the vmptr (exactly as loaded by vmptrld), right?  If so, please
> call it vmptr.
OK I will change it.
>
> > +   /*
> > +    * Level 2 state : includes vmcs,registers and
> > +    * a copy of vmcs12 for vmread/vmwrite
> > +    */
> > +   struct level_state *l2_state;
> > +
> > +   /* Level 1 state for switching to level 2 and back */
> > +   struct level_state *l1_state;
> >
>
> Can you explain why we need two of them?  in the guest vmcs we have host
> and guest values, and in l1_state and l2_state we have more copies, and
> in struct vcpu we have yet another set of copies.  We also have a couple
> of copies in the host vmcs.  I'm getting dizzy...
L2_state stores all the L2 guest state:
      vmcs - a pointer to VMCS02, the VMCS L0 uses to run L2.
      shadow vmcs - a structure storing the values of VMCS12 (the vmcs L1
creates to run L2).
      cpu - the cpu id
      launched - launched flag
      vpid - the vpid allocated by L0 for L2 (we need to store it somewhere)
      msr_bitmap - at the moment we use the L0 msr_bitmap (as we are running
kvm on kvm); in the future we will use a merge of both bitmaps.
      io_bitmaps - at the moment we use the L0 io_bitmaps (as we are running
kvm on kvm); in the future we will use a merge of both io_bitmaps.

L1_state stores the L1 state:
      vmcs - a pointer to VMCS01
      shadow vmcs - a structure storing the values of VMCS01. We use it
when updating VMCS02 in order to avoid the need to switch between VMCS02
and VMCS01.
      cpu - the cpu id
      launched - launched flag
      vpid - the vpid allocated by L0 for L1 (we need to store it somewhere)
      shadow_efer - until recently this wasn't part of the VMCS and it may be
needed for older processors.
      cr0 - not used, I will remove it
      cr2 - not used, I will remove it
      cr3
      cr4

We don't use the state stored in the vcpu for L1 because it can sometimes
change while L2 runs.
The vmcs pointer in the vcpu points to the active vmcs: VMCS01 when L1 is
running and VMCS02 when the L2 guest is running.

>
>
> >   static int init_rmode(struct kvm *kvm);
> >   static u64 construct_eptp(unsigned long root_hpa);
> >
> >
> >
> > +int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
> > +{
> > +   gpa_t gpa;
> > +   struct page *page;
> > +   int r = 0;
> > +
> > +   gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
> > +
> > +   /* checking guest gpa */
> > +   page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
> > +   if (is_error_page(page)) {
> > +      printk(KERN_ERR "%s Invalid guest vmcs addr %llx\n",
> > +             __func__, gpa);
> > +      r = 1;
> > +      goto out;
> > +   }
> > +
> > +   r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
> > +   if (r) {
> > +      printk(KERN_ERR "%s cannot read guest vmcs addr %llx : %d\n",
> > +             __func__, gpa, r);
> > +      goto out;
> > +   }
> >
>
> You can use kvm_read_guest_virt() to simplify this.
I will fix it.
>
> > +
> > +   if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> > +      printk(KERN_DEBUG "%s addr %llx not aligned\n",
> > +             __func__, *gentry);
> > +      return 1;
> > +   }
> > +
> > +out:
> > +   kvm_release_page_clean(page);
> > +   return r;
> > +}
> > +
> > +static int handle_vmptrld(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   struct page *vmcs_page;
> > +   u64 guest_vmcs_addr;
> > +
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
> > +      return 1;
> > +
> > +   if (create_l1_state(vcpu)) {
> > +      printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> > +      return 1;
> > +   }
> > +
> > +   if (create_l2_state(vcpu)) {
> > +      printk(KERN_ERR "%s create_l2_state failed\n", __func__);
> > +      return 1;
> > +   }
> > +
> > +   vmx->nested.l2_state->vmcs = alloc_vmcs();
> > +   if (!vmx->nested.l2_state->vmcs) {
> > +      printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
> > +      return 1;
> > +   }
> > +
> > +   if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
> > +      vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> > +      if (vmcs_page == NULL)
> > +         return 1;
> > +
> > +      /* load nested vmcs to processor */
> > +      if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
> >
>
> So, you're loading a guest page as the vmcs.  This is dangerous as the
> guest can play with it.  Much better to use inaccessible memory (and you
> do alloc_vmcs() earlier?)
We can copy the vmcs and then vmptrld it. As for the allocated vmcs, this is
a memory leak and I will fix it (it should be allocated only once).

> > +
> > +static int handle_vmptrst(struct kvm_vcpu *vcpu)
> > +{
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   vcpu->arch.regs[VCPU_REGS_RAX] = to_vmx(vcpu)->nested.l1_cur_vmcs;
> >
>
> Should store to mem64 according to the docs?
>
> Better done through the emulator.
Sure I will fix it.
>
> > +void save_vmcs(struct shadow_vmcs *dst)
> > +{
> > +   dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> > +   dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> > +   dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> > +   dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> > +   dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> > +   dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> > +   dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> > +   dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> > +   dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> > +   dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> > +   dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> > +   dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> > +   dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> > +   dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> > +   dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> > +   dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> > +   dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> > +   if (cpu_has_vmx_msr_bitmap())
> > +      dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
> > +
> > +   dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
> > +   dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
> > +   dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
> > +   dst->tsc_offset = vmcs_read64(TSC_OFFSET);
> > +   dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
> > +   dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
> > +   if (enable_ept)
> > +      dst->ept_pointer = vmcs_read64(EPT_POINTER);
> > +
> > +   dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> > +   dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> > +   dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> > +   if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> > +      dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> > +   if (enable_ept) {
> > +      dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
> > +      dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
> > +      dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
> > +      dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
> > +   }
> > +   dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
> > +   dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> > +   dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
> > +   dst->page_fault_error_code_mask =
> > +      vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
> > +   dst->page_fault_error_code_match =
> > +      vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
> > +   dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> > +   dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
> > +   dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
> > +   dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
> > +   dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
> > +   dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
> > +   dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> > +   dst->vm_entry_exception_error_code =
> > +      vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> > +   dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> > +   dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
> > +   dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> > +   if (enable_vpid && dst->secondary_vm_exec_control &
> > +       SECONDARY_EXEC_ENABLE_VPID)
> > +      dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
> > +   dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
> > +   dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> > +   dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > +   dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> > +   dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> > +   dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
> > +   dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> > +   dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> > +   dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> > +   dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> > +   dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> > +   dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> > +   dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> > +   dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> > +   dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> > +   dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> > +   dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> > +   dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> > +   dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> > +   dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> > +   dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> > +   dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> > +   dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> > +   dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> > +   dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> > +   dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> > +   dst->guest_interruptibility_info =
> > +      vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> > +   dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
> > +   dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> > +   dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
> > +   dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
> > +   dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
> > +   dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> > +   dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> > +   dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
> > +   dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
> > +   dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
> > +   dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
> > +   dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> > +   dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> > +   dst->guest_cr0 = vmcs_readl(GUEST_CR0);
> > +   dst->guest_cr3 = vmcs_readl(GUEST_CR3);
> > +   dst->guest_cr4 = vmcs_readl(GUEST_CR4);
> > +   dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> > +   dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> > +   dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> > +   dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> > +   dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> > +   dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> > +   dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> > +   dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> > +   dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> > +   dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> > +   dst->guest_dr7 = vmcs_readl(GUEST_DR7);
> > +   dst->guest_rsp = vmcs_readl(GUEST_RSP);
> > +   dst->guest_rip = vmcs_readl(GUEST_RIP);
> > +   dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> > +   dst->guest_pending_dbg_exceptions =
> > +      vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> > +   dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> > +   dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> > +   dst->host_cr0 = vmcs_readl(HOST_CR0);
> > +   dst->host_cr3 = vmcs_readl(HOST_CR3);
> > +   dst->host_cr4 = vmcs_readl(HOST_CR4);
> > +   dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
> > +   dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
> > +   dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
> > +   dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> > +   dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> > +   dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
> > +   dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
> > +   dst->host_rsp = vmcs_readl(HOST_RSP);
> > +   dst->host_rip = vmcs_readl(HOST_RIP);
> > +   if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
> > +      dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
> > +}
> >
>
> I see.  You're using the processor's format when reading the guest
> vmcs.  But we don't have to do that, we can use the shadow_vmcs
> structure (and a memcpy).
I'm sorry, I don't understand your comment, can you elaborate?
Avi Kivity Sept. 6, 2009, 9:25 a.m. UTC | #3
On 09/03/2009 05:25 PM, Orit Wasserman wrote:
>>
>>> +   /*
>>> +    * Level 2 state : includes vmcs,registers and
>>> +    * a copy of vmcs12 for vmread/vmwrite
>>> +    */
>>> +   struct level_state *l2_state;
>>> +
>>> +   /* Level 1 state for switching to level 2 and back */
>>> +   struct level_state *l1_state;
>>>
>>>        
>> Can you explain why we need two of them?  in the guest vmcs we have host
>> and guest values, and in l1_state and l2_state we have more copies, and
>> in struct vcpu we have yet another set of copies.  We also have a couple
>> of copies in the host vmcs.  I'm getting dizzy...
>>      
> L2_state stores all the L2 guest state:
>        vmcs - A pointer to VMCS02, the VMCS used to run it by L0.
>        shadow vmcs - a structure storing the values of VMCS12 (the vmcs L1
> create to run L2).
>    

When we support multiple nested guests, we'll run into a problem of 
where to store shadow_vmcs.  I see these options:

- maintain a cache of limited size of shadow_vmcs; when evicting, copy 
the shadow_vmcs into the guest's vmptr
- always put shadow_vmcs in the guest's vmptr, and write protect it so 
the guest can't play with it
- always put shadow_vmcs in the guest's vmptr, and verify everything you 
read (that's what nsvm does)

>        cpu - the cpu id
>    

Why is it needed?

>        launched- launched flag
>    

Can be part of shadow_vmcs

>        vpid - the vpid allocate by L0 for L2 (we need to store it somewhere)
>    

Note the guest can DoS the host by allocating a lot of vpids.  So we need to 
allocate host vpids on demand and be able to flush them out.

>        msr_bitmap - At the moment we use L0 msr_bitmap(as we are running kvm
> on kvm) in the future we will use a merge of both bitmaps.
>    

Note kvm uses two bitmaps (for long mode and legacy mode).

> L1 state stores the L1 state -
>        vmcs - pointer to VMCS01
>    

So it's the same as vmx->vmcs in normal operation?

>        shadow vmcs - a structure storing the values of VMCS01. we use it
> when updating VMCS02 in order to avoid the need to switch between VMCS02
> and VMCS01.
>    

Sorry, don't understand.

>        cpu - the cpu id
>        launched- launched flag
>    

This is a copy of vmx->launched?

>>> +
>>> +   if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
>>> +      vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
>>> +      if (vmcs_page == NULL)
>>> +         return 1;
>>> +
>>> +      /* load nested vmcs to processor */
>>> +      if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
>>>
>>>        
>> So, you're loading a guest page as the vmcs.  This is dangerous as the
>> guest can play with it.  Much better to use inaccessible memory (and you
>> do alloc_vmcs() earlier?)
>>      
> We can copy the vmcs and than vmptrld it. As for the allocate vmcs this is
> a memory leak and I will fix it (it should be allocated only once).
>    

But why do it?  Your approach is to store the guest vmcs in the same 
format as the processor (which we don't really know), so you have to use 
vmread/vmwrite to maintain it.  Instead, you can choose that the guest 
vmcs is a shadow_vmcs structure and then you can access it using normal 
memory operations.

>> I see.  You're using the processor's format when reading the guest
>> vmcs.  But we don't have to do that, we can use the shadow_vmcs
>> structure (and a memcpy).
>>      
> I'm sorry I don't understand your comment can u elaborate ?
>    
>

See previous comment.  Basically you can do

   struct shadow_vmcs *svmcs = kmap_atomic(gpa_to_page(vmx->vmptr));
   printk("guest_cs = %x\n", svmcs->guest_cs_selector);

instead of

   vmptrld(gpa_to_hpa(vmx->vmptr))
   printk("guest_cs = %x\n", vmcs_read16(GUEST_CS_SELECTOR));
oritw@il.ibm.com Sept. 6, 2009, 1:36 p.m. UTC | #4
Avi Kivity <avi@redhat.com> wrote on 06/09/2009 12:25:17:

> On 09/03/2009 05:25 PM, Orit Wasserman wrote:
> >>
> >>> +   /*
> >>> +    * Level 2 state : includes vmcs,registers and
> >>> +    * a copy of vmcs12 for vmread/vmwrite
> >>> +    */
> >>> +   struct level_state *l2_state;
> >>> +
> >>> +   /* Level 1 state for switching to level 2 and back */
> >>> +   struct level_state *l1_state;
> >>>
> >>>
> >> Can you explain why we need two of them?  in the guest vmcs we have host
> >> and guest values, and in l1_state and l2_state we have more copies, and
> >> in struct vcpu we have yet another set of copies.  We also have a couple
> >> of copies in the host vmcs.  I'm getting dizzy...
> >>
> > L2_state stores all the L2 guest state:
> >        vmcs - A pointer to VMCS02, the VMCS used to run it by L0.
> >        shadow vmcs - a structure storing the values of VMCS12 (the vmcs L1
> > create to run L2).
> >
>
> When we support multiple nested guests, we'll run into a problem of
> where to store shadow_vmcs.  I see these options:
>
> - maintain a cache of limited size of shadow_vmcs; when evicting, copy
> the shadow_vmcs into the guest's vmptr]
> - always put shadow_vmcs in the guest's vmptr, and write protect it so
> the guest can't play with it
> - always put shadow_vmcs in the guest's vmptr, and verify everything you
> read (that's what nsvm does)
>
The second option looks a bit complicated; I prefer one of the other two.
> >        cpu - the cpu id
> >
>
> Why is it needed?
This is a copy of the cpu id from the vcpu, to record the last cpu the L2
guest ran on.
>
> >        launched- launched flag
> >
>
> Can be part of shadow_vmcs
I prefer to keep the shadow_vmcs as a separate structure to store only VMCS
fields.
>
> >        vpid - the vpid allocate by L0 for L2 (we need to store it
somewhere)
> >
>
> Note the guest can DoS the host by allocating a lot of vpids.  So we to
> allocate host vpids on demand and be able to flush them out.
The guest is not allocating the vpids; the host (L0) does, using
allocate_vpid.
I agree that with nested the danger of running out is bigger.
>
> >        msr_bitmap - At the moment we use L0 msr_bitmap(as we are
running kvm
> > on kvm) in the future we will use a merge of both bitmaps.
> >
>
> Note kvm uses two bitmaps (for long mode and legacy mode).
OK.
>
> > L1 state stores the L1 state -
> >        vmcs - pointer to VMCS01
> >
>
> So it's the same as vmx->vmcs in normal operation?
Yes, but with nested the vmx->vmcs is changed when running an L2 (nested)
guest.
>
> >        shadow vmcs - a structure storing the values of VMCS01. we use it
> > when updating VMCS02 in order to avoid the need to switch between VMCS02
> > and VMCS01.
> >
>
> Sorry, don't understand.
VMCS02 is the VMCS L0 uses to run L2.
When we create/update VMCS02 we need to read fields from VMCS01 (host state
is taken fully, plus control fields).
For L1 the shadow_vmcs is a copy of VMCS01 in structure format; we reuse
the same structure.
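To make this concrete, here is an illustrative (and much simplified) sketch of how VMCS02 fields could be filled in; l1_shadow/l2_shadow stand for the shadow_vmcs copies of VMCS01 and VMCS12 described above:

static void sketch_prepare_vmcs02(struct shadow_vmcs *l1_shadow,
				  struct shadow_vmcs *l2_shadow)
{
	/* host state comes fully from what L0 programmed into VMCS01 */
	vmcs_writel(HOST_CR3, l1_shadow->host_cr3);
	vmcs_writel(HOST_RIP, l1_shadow->host_rip);

	/* guest state comes from VMCS12, i.e. what L1 wants L2 to see */
	vmcs_writel(GUEST_RIP, l2_shadow->guest_rip);
	vmcs_writel(GUEST_RSP, l2_shadow->guest_rsp);

	/* control fields are a combination of L0 and L1 requirements */
	vmcs_write32(EXCEPTION_BITMAP,
		     l1_shadow->exception_bitmap | l2_shadow->exception_bitmap);
}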
>
> >        cpu - the cpu id
> >        launched- launched flag
> >
>
> This is a copy of vmx->launched?
Exactly. The vmx->launched is updated when switching between L1/L2 and back,
so we need to store it here.
>
> >>> +
> >>> +   if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
> >>> +      vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> >>> +      if (vmcs_page == NULL)
> >>> +         return 1;
> >>> +
> >>> +      /* load nested vmcs to processor */
> >>> +      if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
> >>>
> >>>
> >> So, you're loading a guest page as the vmcs.  This is dangerous as the
> >> guest can play with it.  Much better to use inaccessible memory (and you
> >> do alloc_vmcs() earlier?)
> >>
> > We can copy the vmcs and then vmptrld it. As for the allocated vmcs, this is
> > a memory leak and I will fix it (it should be allocated only once).
> >
>
> But why do it?  Your approach is to store the guest vmcs in the same
> format as the processor (which we don't really know), so you have to use
> vmread/vmwrite to maintain it.  Instead, you can choose that the guest
> vmcs is a shadow_vmcs structure and then you can access it using normal
> memory operations.
I got it now.
We will need a way to distinguish between a processor-format VMCS and a
structure-based VMCS; we can use the revision id field (create a unique
revision id for nested, like 0xffff or 0x0).
>
> >> I see.  You're using the processor's format when reading the guest
> >> vmcs.  But we don't have to do that, we can use the shadow_vmcs
> >> structure (and a memcpy).
> >>
> > I'm sorry I don't understand your comment can u elaborate ?
> >
> >
>
> See previous comment.  Basically you can do
>
>    struct shadow_vmcs *svmcs = kmap_atomic(gpa_to_page(vmx->vmptr));
>    printk("guest_cs = %x\n", svmcs->guest_cs_selector);
See above.
>
> instead of
>
>    vmptrld(gpa_to_hpa(vmx->vmptr))
>    printk("guest_cs = %x\n", vmcs_read16(GUEST_CS_SELECTOR));
Avi Kivity Sept. 6, 2009, 1:52 p.m. UTC | #5
On 09/06/2009 04:36 PM, Orit Wasserman wrote:
>
>> When we support multiple nested guests, we'll run into a problem of
>> where to store shadow_vmcs.  I see these options:
>>
>> - maintain a cache of limited size of shadow_vmcs; when evicting, copy
>> the shadow_vmcs into the guest's vmptr]
>> - always put shadow_vmcs in the guest's vmptr, and write protect it so
>> the guest can't play with it
>> - always put shadow_vmcs in the guest's vmptr, and verify everything you
>> read (that's what nsvm does)
>>
>>      
> The second option looks a bit complicated I prefer one of the other two.
>    

I agree, the third option looks easiest, but I'm not sure how much 
verification is needed.

Note other things like the msr bitmaps may need write protection, 
otherwise you have to re-merge the bitmap on every guest entry, which 
can be very slow.  So we may be forced to add write protection anyway.

>>>         launched- launched flag
>>>
>>>        
>> Can be part of shadow_vmcs
>>      
> I prefer to keep the shadow_vmcs as a separate structure to store only VMCS
> fields.
>    

It is a vmcs field - it is manipulated by vmx instructions which operate 
on the vmcs.  You'll need to store it in guest memory when you support 
multiple nested guests.

You can put the vmcs fields in a sub-structure if you want to separate 
between explicit fields and implicit fields (I can only see one implicit 
field (launched), but maybe there are more).
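A hypothetical split along those lines (the names here are made up, not from the patch):

struct nested_vmcs_page {
	struct shadow_vmcs fields;	/* explicit state, visible to vmread/vmwrite */
	u32 launched;			/* implicit state kept alongside it */
};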

>>      
>>>         vpid - the vpid allocate by L0 for L2 (we need to store it
>>>        
> somewhere)
>    
>>>        
>> Note the guest can DoS the host by allocating a lot of vpids.  So we to
>> allocate host vpids on demand and be able to flush them out.
>>      
> The guest is not allocating the vpids the host(L0) does using
> allocate_vpid.
>    

I meant, the guest can force the host to allocate vpids if we don't 
protect against it.

> I agree that with nested the danger for them to run out is bigger.
>    


>> Sorry, don't understand.
>>      
> VMCS02 - the VMCS L0 uses to run L2.
> When we create/update VMCS02 we need to read fields from VMCS01 (host state
> is taken fully, control fields ).
> For L1 the shadow_vmcs is a copy of VMCS01 in a structure format, we used
> the same structure.
>    

I don't understand why you need it.  Host state shouldn't change.  Only 
the control fields are interesting, and things like exception_bitmap.

>> But why do it?  Your approach is to store the guest vmcs in the same
>> format as the processor (which we don't really know), so you have to use
>> vmread/vmwrite to maintain it.  Instead, you can choose that the guest
>> vmcs is a shadow_vmcs structure and then you can access it using normal
>> memory operations.
>>      
> I got it now.
> We will need a way to distinguish between processor format VMCS and
> structure based VMCS,
> we can use the revision id field (create a unique revision id for nested
> like 0xffff or 0x0).
>    

No, you can always store guest vmcs in software format, since we'll 
never load it with vmptrld.  We'll only load a real vmcs with vmptrld.

Note it also solves live migration, since now all guest vmcss are copied 
as part of normal guest memory (including their launched state).
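Under that scheme handle_vmptrld could shrink to something like this (untested sketch; no vmptrld is ever issued on guest memory, only on the vmcs01/vmcs02 pages L0 allocates itself):

static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 guest_vmcs_addr;

	if (!nested_vmx_check_permission(vcpu))
		return 1;
	if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
		return 1;

	/* remember where L1 keeps its (software-format) vmcs12 */
	vmx->nested.l1_cur_vmcs = guest_vmcs_addr;

	clear_rflags_cf_zf(vcpu);
	skip_emulated_instruction(vcpu);
	return 1;
}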
oritw@il.ibm.com Sept. 6, 2009, 4:55 p.m. UTC | #6
Avi Kivity <avi@redhat.com> wrote on 06/09/2009 16:52:56:

> On 09/06/2009 04:36 PM, Orit Wasserman wrote:
> >
> >> When we support multiple nested guests, we'll run into a problem of
> >> where to store shadow_vmcs.  I see these options:
> >>
> >> - maintain a cache of limited size of shadow_vmcs; when evicting, copy
> >> the shadow_vmcs into the guest's vmptr]
> >> - always put shadow_vmcs in the guest's vmptr, and write protect it so
> >> the guest can't play with it
> >> - always put shadow_vmcs in the guest's vmptr, and verify everything you
> >> read (that's what nsvm does)
> >>
> >>
> > The second option looks a bit complicated; I prefer one of the other two.
> >
>
> I agree, the third option looks easiest but not sure how much
> verification is needed.
>
> Note other things like the msr bitmaps may need write protection,
> otherwise you have to re-merge the bitmap on every guest entry, which
> can be very slow.  So we may be forced to add write protection anyway.
We will also need to write protect L1's EPT tables, to allow L1 to swap
out its guests.
>
> >>>         launched- launched flag
> >>>
> >>>
> >> Can be part of shadow_vmcs
> >>
> > I prefer to keep the shadow_vmcs as a separate structure to store only VMCS
> > fields.
> >
>
> It is a vmcs field - it is manipulated by vmx instructions which operate
> on the vmcs.  You'll need to store it in guest memory when you support
> multiple nested guests.
>
> You can put the vmcs fields in a sub-structure if you want to separate
> between explicit fields and implicit fields (I can only see one implicit
> field (launched), but maybe there are more).
OK.
>
> >>
> >>>         vpid - the vpid allocate by L0 for L2 (we need to store it
> >>>
> > somewhere)
> >
> >>>
> >> Note the guest can DoS the host by allocating a lot of vpids.  So we need
> >> to allocate host vpids on demand and be able to flush them out.
> >>
> > The guest is not allocating the vpids the host(L0) does using
> > allocate_vpid.
> >
>
> I meant, the guest can force the host to allocate vpids if we don't
> protect against it.
You mean by launching a lot of guests?
We can limit the number of guests as a very quick solution.
More complicated is limiting the number of vpids per L1 hypervisor and
reusing them.
This means we will sometimes need to invalidate the vpid when switching
between L2 guests.
>
> > I agree that with nested the danger for them to run out is bigger.
> >
>
>
> >> Sorry, don't understand.
> >>
> > VMCS02 - the VMCS L0 uses to run L2.
> > When we create/update VMCS02 we need to read fields from VMCS01 (host state
> > is taken fully, control fields ).
> > For L1 the shadow_vmcs is a copy of VMCS01 in a structure format, we used
> > the same structure.
> >
>
> I don't understand why you need it.  Host state shouldn't change.  Only
> the control fields are interesting, and things like exception_bitmap.
I think that when KVM switches to Qemu the host state can change (L0 host
state). If this happens between different runs of L2
we will need to update the VMCS02 host state. Of course we can optimize and
update it only then.
>
> >> But why do it?  Your approach is to store the guest vmcs in the same
> >> format as the processor (which we don't really know), so you have to use
> >> vmread/vmwrite to maintain it.  Instead, you can choose that the guest
> >> vmcs is a shadow_vmcs structure and then you can access it using normal
> >> memory operations.
> >>
> > I got it now.
> > We will need a way to distinguish between processor format VMCS and
> > structure based VMCS,
> > we can use the revision id field (create a unique revision id for nested
> > like 0xffff or 0x0).
> >
>
> No, you can always store guest vmcs in software format, since we'll
> never load it with vmptrld.  We'll only load a real vmcs with vmptrld.
You are right, a new VMCS will be zeroed.
>
> Note it also solves live migration, since now all guest vmcss are copied
> as part of normal guest memory (including their launched state).
Great.
Avi Kivity Sept. 6, 2009, 7:10 p.m. UTC | #7
On 09/06/2009 07:55 PM, Orit Wasserman wrote:
>> Note other things like the msr bitmaps may need write protection,
>> otherwise you have to re-merge the bitmap on every guest entry, which
>> can be very slow.  So we may be forced to add write protection anyway.
>>      
> We will also need to write protected L1's EPT tables , to allow L1 to swap
> out his guests.
>    

That comes naturally with the shadow mmu.  In the same way normal shadow 
mmu protects guest page tables, nested EPT shadow should protect the 
guest's EPT pages.

(unfortunately there is no INVEPT instruction that accepts a gpa 
operand; this would make write protection unnecessary).

>> I meant, the guest can force the host to allocate vpids if we don't
>> protect against it.
>>      
> You meant by launching a lot of guests ?
>    

Yes.

> We can limit the number of guests as a very quick solution.
>    

How?  There is no way to tell the guest not to launch more guests.

> More complicated is limiting the number of vpids per L1 hypervisor and
> reusing them.
>    

When the bitmap is full, clear it.  Use a generation count to tell vcpus 
to reload.  svm does that (svm only has 63 asids).
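A sketch of such a scheme (names are hypothetical and locking is omitted; VMX_NR_VPIDS and the bitmap helpers are assumed to match what vmx.c already uses for its own vpid allocation):

static DECLARE_BITMAP(nested_vpid_bitmap, VMX_NR_VPIDS);
static u64 nested_vpid_generation;

static u16 nested_alloc_vpid(struct vcpu_vmx *vmx)
{
	/* vpid 0 is reserved for the host, so search from 1 */
	int vpid = find_next_zero_bit(nested_vpid_bitmap, VMX_NR_VPIDS, 1);

	if (vpid >= VMX_NR_VPIDS) {
		/* bitmap full: recycle everything and bump the generation */
		bitmap_zero(nested_vpid_bitmap, VMX_NR_VPIDS);
		nested_vpid_generation++;
		vpid = find_next_zero_bit(nested_vpid_bitmap, VMX_NR_VPIDS, 1);
	}
	__set_bit(vpid, nested_vpid_bitmap);

	/*
	 * Each vcpu compares its cached generation on entry; if it is stale,
	 * it reallocates its vpid and flushes the old mappings.
	 */
	vmx->nested.l2_state->vpid = vpid;
	return vpid;
}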

> This means we will sometime need to invalidate the vpid when switching
> between L2 guests.
>    

Yes.

>> I don't understand why you need it.  Host state shouldn't change.  Only
>> the control fields are interesting, and things like exception_bitmap.
>>      
> I think that when KVM switches to Qemu the host state can change (L0 host
> state). If this happens between different runs of L2
> we will need to update VMCS02 host state. Of course we can optimize and
> update it only than.
>    

No, I don't think any host state changes, except for cr0.ts.

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2b1fc3b..5ab07a0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,6 +61,151 @@  module_param_named(unrestricted_guest,
 static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
+struct __attribute__ ((__packed__)) shadow_vmcs {
+	uint16_t virtual_processor_id;
+	uint16_t guest_es_selector;
+	uint16_t guest_cs_selector;
+	uint16_t guest_ss_selector;
+	uint16_t guest_ds_selector;
+	uint16_t guest_fs_selector;
+	uint16_t guest_gs_selector;
+	uint16_t guest_ldtr_selector;
+	uint16_t guest_tr_selector;
+	uint16_t host_es_selector;
+	uint16_t host_cs_selector;
+	uint16_t host_ss_selector;
+	uint16_t host_ds_selector;
+	uint16_t host_fs_selector;
+	uint16_t host_gs_selector;
+	uint16_t host_tr_selector;
+	uint64_t io_bitmap_a;
+	uint64_t io_bitmap_b;
+	uint64_t msr_bitmap;
+	uint64_t vm_exit_msr_store_addr;
+	uint64_t vm_exit_msr_load_addr;
+	uint64_t vm_entry_msr_load_addr;
+	uint64_t tsc_offset;
+	uint64_t virtual_apic_page_addr;
+	uint64_t apic_access_addr;
+	uint64_t ept_pointer;
+	uint64_t guest_physical_address;
+	uint64_t vmcs_link_pointer;
+	uint64_t guest_ia32_debugctl;
+	uint64_t guest_ia32_pat;
+	uint64_t guest_pdptr0;
+	uint64_t guest_pdptr1;
+	uint64_t guest_pdptr2;
+	uint64_t guest_pdptr3;
+	uint64_t host_ia32_pat;
+	uint32_t pin_based_vm_exec_control;
+	uint32_t cpu_based_vm_exec_control;
+	uint32_t exception_bitmap;
+	uint32_t page_fault_error_code_mask;
+	uint32_t page_fault_error_code_match;
+	uint32_t cr3_target_count;
+	uint32_t vm_exit_controls;
+	uint32_t vm_exit_msr_store_count;
+	uint32_t vm_exit_msr_load_count;
+	uint32_t vm_entry_controls;
+	uint32_t vm_entry_msr_load_count;
+	uint32_t vm_entry_intr_info_field;
+	uint32_t vm_entry_exception_error_code;
+	uint32_t vm_entry_instruction_len;
+	uint32_t tpr_threshold;
+	uint32_t secondary_vm_exec_control;
+	uint32_t vm_instruction_error;
+	uint32_t vm_exit_reason;
+	uint32_t vm_exit_intr_info;
+	uint32_t vm_exit_intr_error_code;
+	uint32_t idt_vectoring_info_field;
+	uint32_t idt_vectoring_error_code;
+	uint32_t vm_exit_instruction_len;
+	uint32_t vmx_instruction_info;
+	uint32_t guest_es_limit;
+	uint32_t guest_cs_limit;
+	uint32_t guest_ss_limit;
+	uint32_t guest_ds_limit;
+	uint32_t guest_fs_limit;
+	uint32_t guest_gs_limit;
+	uint32_t guest_ldtr_limit;
+	uint32_t guest_tr_limit;
+	uint32_t guest_gdtr_limit;
+	uint32_t guest_idtr_limit;
+	uint32_t guest_es_ar_bytes;
+	uint32_t guest_cs_ar_bytes;
+	uint32_t guest_ss_ar_bytes;
+	uint32_t guest_ds_ar_bytes;
+	uint32_t guest_fs_ar_bytes;
+	uint32_t guest_gs_ar_bytes;
+	uint32_t guest_ldtr_ar_bytes;
+	uint32_t guest_tr_ar_bytes;
+	uint32_t guest_interruptibility_info;
+	uint32_t guest_activity_state;
+	uint32_t guest_sysenter_cs;
+	uint32_t host_ia32_sysenter_cs;
+	unsigned long cr0_guest_host_mask;
+	unsigned long cr4_guest_host_mask;
+	unsigned long cr0_read_shadow;
+	unsigned long cr4_read_shadow;
+	unsigned long cr3_target_value0;
+	unsigned long cr3_target_value1;
+	unsigned long cr3_target_value2;
+	unsigned long cr3_target_value3;
+	unsigned long exit_qualification;
+	unsigned long guest_linear_address;
+	unsigned long guest_cr0;
+	unsigned long guest_cr3;
+	unsigned long guest_cr4;
+	unsigned long guest_es_base;
+	unsigned long guest_cs_base;
+	unsigned long guest_ss_base;
+	unsigned long guest_ds_base;
+	unsigned long guest_fs_base;
+	unsigned long guest_gs_base;
+	unsigned long guest_ldtr_base;
+	unsigned long guest_tr_base;
+	unsigned long guest_gdtr_base;
+	unsigned long guest_idtr_base;
+	unsigned long guest_dr7;
+	unsigned long guest_rsp;
+	unsigned long guest_rip;
+	unsigned long guest_rflags;
+	unsigned long guest_pending_dbg_exceptions;
+	unsigned long guest_sysenter_esp;
+	unsigned long guest_sysenter_eip;
+	unsigned long host_cr0;
+	unsigned long host_cr3;
+	unsigned long host_cr4;
+	unsigned long host_fs_base;
+	unsigned long host_gs_base;
+	unsigned long host_tr_base;
+	unsigned long host_gdtr_base;
+	unsigned long host_idtr_base;
+	unsigned long host_ia32_sysenter_esp;
+	unsigned long host_ia32_sysenter_eip;
+	unsigned long host_rsp;
+	unsigned long host_rip;
+};
+
+struct __attribute__ ((__packed__)) level_state {
+	struct shadow_vmcs *shadow_vmcs;
+
+	u16 vpid;
+	u64 shadow_efer;
+	unsigned long cr2;
+	unsigned long cr3;
+	unsigned long cr4;
+	unsigned long cr8;
+
+	u64 io_bitmap_a;
+	u64 io_bitmap_b;
+	u64 msr_bitmap;
+
+	struct vmcs *vmcs;
+	int cpu;
+	int launched;
+};
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -72,6 +217,17 @@  struct nested_vmx {
 	bool vmon;
 	/* Has the level1 guest done vmclear? */
 	bool vmclear;
+	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
+	u64 l1_cur_vmcs;
+	/*
+	 * Level 2 state : includes vmcs,registers and
+	 * a copy of vmcs12 for vmread/vmwrite
+	 */
+	struct level_state *l2_state;
+
+	/* Level 1 state for switching to level 2 and back */
+	struct level_state *l1_state;
+
 };
 
 struct vcpu_vmx {
@@ -131,6 +287,25 @@  static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static struct page *nested_get_page(struct kvm_vcpu *vcpu,
+				    u64 vmcs_addr)
+{
+	struct page *vmcs_page = NULL;
+
+	down_read(&current->mm->mmap_sem);
+	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	if (is_error_page(vmcs_page)) {
+		printk(KERN_ERR "%s error allocating page \n", __func__);
+		kvm_release_page_clean(vmcs_page);
+		return NULL;
+	}
+
+	return vmcs_page;
+
+}
+
 static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 
@@ -188,6 +363,10 @@  static struct kvm_vmx_segment_field {
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
+static int create_l1_state(struct kvm_vcpu *vcpu);
+static int create_l2_state(struct kvm_vcpu *vcpu);
+static int shadow_vmcs_load(struct kvm_vcpu *vcpu);
+
 /*
  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
  * away by decrementing the array size.
@@ -704,6 +883,24 @@  static void vmx_load_host_state(struct vcpu_vmx *vmx)
 	preempt_enable();
 }
 
+
+static int vmptrld(struct kvm_vcpu *vcpu,
+		   u64 phys_addr)
+{
+	u8 error;
+
+	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+		      : "cc");
+	if (error) {
+		printk(KERN_ERR "kvm: %s vmptrld %llx failed\n",
+		       __func__, phys_addr);
+		return 1;
+	}
+
+	return 0;
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -725,15 +922,8 @@  static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-		u8 error;
-
 		per_cpu(current_vmcs, cpu) = vmx->vmcs;
-		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
-			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
-			      : "cc");
-		if (error)
-			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
-			       vmx->vmcs, phys_addr);
+		vmptrld(vcpu, phys_addr);
 	}
 
 	if (vcpu->cpu != cpu) {
@@ -3252,6 +3442,115 @@  static int handle_vmclear(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
+{
+	gpa_t gpa;
+	struct page *page;
+	int r = 0;
+
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
+
+	/* checking guest gpa */
+	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (is_error_page(page)) {
+		printk(KERN_ERR "%s Invalid guest vmcs addr %llx\n",
+		       __func__, gpa);
+		r = 1;
+		goto out;
+	}
+
+	r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
+	if (r) {
+		printk(KERN_ERR "%s cannot read guest vmcs addr %llx : %d\n",
+		       __func__, gpa, r);
+		goto out;
+	}
+
+	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
+		printk(KERN_DEBUG "%s addr %llx not aligned\n",
+		       __func__, *gentry);
+		return 1;
+	}
+
+out:
+	kvm_release_page_clean(page);
+	return r;
+}
+
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct page *vmcs_page;
+	u64 guest_vmcs_addr;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
+		return 1;
+
+	if (create_l1_state(vcpu)) {
+		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
+		return 1;
+	}
+
+	if (create_l2_state(vcpu)) {
+		printk(KERN_ERR "%s create_l2_state failed\n", __func__);
+		return 1;
+	}
+
+	vmx->nested.l2_state->vmcs = alloc_vmcs();
+	if (!vmx->nested.l2_state->vmcs) {
+		printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
+		return 1;
+	}
+
+	if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
+		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
+		if (vmcs_page == NULL)
+			return 1;
+
+		/* load nested vmcs to processor */
+		if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
+			printk(KERN_INFO "%s error in vmptrld \n",
+			       __func__);
+			kvm_release_page_clean(vmcs_page);
+			return 1;
+		}
+
+		/* save nested vmcs in the shadow vmcs */
+		if (shadow_vmcs_load(vcpu)) {
+			kvm_release_page_clean(vmcs_page);
+			return 1;
+		}
+
+		vmx->nested.l1_cur_vmcs = guest_vmcs_addr;
+
+		/* load to previous vmcs */
+		if (vmptrld(vcpu, __pa(to_vmx(vcpu)->vmcs))) {
+			kvm_release_page_clean(vmcs_page);
+				return 1;
+		}
+
+		kvm_release_page_clean(vmcs_page);
+	}
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vcpu->arch.regs[VCPU_REGS_RAX] = to_vmx(vcpu)->nested.l1_cur_vmcs;
+
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
 	kvm_queue_exception(vcpu, UD_VECTOR);
@@ -3576,8 +3875,8 @@  static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
-	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
-	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
+	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
+	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
@@ -4115,6 +4414,220 @@  static bool vmx_gb_page_enable(void)
 	return false;
 }
 
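+/*
+ * Copy every field of the currently loaded hardware vmcs into the
+ * software shadow vmcs via vmread.  Optional fields (MSR bitmap, EPT,
+ * VPID, PAT) are read only when the corresponding feature is enabled.
+ */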
+void save_vmcs(struct shadow_vmcs *dst)
+{
+	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
+	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
+	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
+	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
+	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
+	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
+	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
+	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+	if (cpu_has_vmx_msr_bitmap())
+		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
+
+	dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
+	dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
+	dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
+	dst->tsc_offset = vmcs_read64(TSC_OFFSET);
+	dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
+	dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
+	if (enable_ept)
+		dst->ept_pointer = vmcs_read64(EPT_POINTER);
+
+	dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	if (enable_ept) {
+		dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+		dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+		dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+		dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+	}
+	dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+	dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
+	dst->page_fault_error_code_mask =
+		vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
+	dst->page_fault_error_code_match =
+		vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
+	dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
+	dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
+	dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
+	dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
+	dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
+	dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	dst->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
+	dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	if (enable_vpid && dst->secondary_vm_exec_control &
+	    SECONDARY_EXEC_ENABLE_VPID)
+		dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
+	dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+	dst->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
+	dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	dst->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+	dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+	dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
+	dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
+	dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
+	dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+	dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
+	dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
+	dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
+	dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
+	dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+	dst->guest_cr0 = vmcs_readl(GUEST_CR0);
+	dst->guest_cr3 = vmcs_readl(GUEST_CR3);
+	dst->guest_cr4 = vmcs_readl(GUEST_CR4);
+	dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	dst->guest_dr7 = vmcs_readl(GUEST_DR7);
+	dst->guest_rsp = vmcs_readl(GUEST_RSP);
+	dst->guest_rip = vmcs_readl(GUEST_RIP);
+	dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	dst->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+	dst->host_cr0 = vmcs_readl(HOST_CR0);
+	dst->host_cr3 = vmcs_readl(HOST_CR3);
+	dst->host_cr4 = vmcs_readl(HOST_CR4);
+	dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
+	dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
+	dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
+	dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
+	dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
+	dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
+	dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
+	dst->host_rsp = vmcs_readl(HOST_RSP);
+	dst->host_rip = vmcs_readl(HOST_RIP);
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
+}
+
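+/*
+ * Allocate the software shadow vmcs on first use and save the currently
+ * loaded vmcs into it.
+ */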
+static int shadow_vmcs_load(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		vmx->nested.l2_state->shadow_vmcs =
+			kzalloc(PAGE_SIZE, GFP_KERNEL);
+		if (!vmx->nested.l2_state->shadow_vmcs) {
+			printk(KERN_INFO "%s error allocating shadow vmcs\n",
+			       __func__);
+			return -ENOMEM;
+		}
+	}
+
+	save_vmcs(vmx->nested.l2_state->shadow_vmcs);
+
+	return 0;
+}
+
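+/*
+ * Allocate a level_state together with its page-sized shadow vmcs.
+ * Returns NULL if either allocation fails.
+ */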
+struct level_state *create_state(void)
+{
+	struct level_state *state = NULL;
+
+	state = kzalloc(sizeof(struct level_state), GFP_KERNEL);
+	if (!state) {
+		printk(KERN_INFO "Error creating level state\n");
+		return NULL;
+	}
+	state->shadow_vmcs = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!state->shadow_vmcs) {
+		printk(KERN_INFO "%s error creating shadow vmcs\n",
+		       __func__);
+		kfree(state);
+		return NULL;
+	}
+	return state;
+}
+
+int create_l1_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l1_state) {
+		vmx->nested.l1_state = create_state();
+		if (!vmx->nested.l1_state)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+int create_l2_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l2_state) {
+		vmx->nested.l2_state = create_state();
+		if (!vmx->nested.l2_state)
+			return -ENOMEM;
+	}
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx->nested.l2_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
+	else
+		vmx->nested.l2_state->msr_bitmap = 0;
+
+	vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+
+	return 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,