
[17/31] nVMX: Prepare vmcs02 from vmcs01 and vmcs12

Message ID 201105161952.p4GJqbek001851@rice.haifa.ibm.com (mailing list archive)
State New, archived

Commit Message

Nadav Har'El May 16, 2011, 7:52 p.m. UTC
This patch contains code to prepare the VMCS which can be used to actually
run the L2 guest, vmcs02. prepare_vmcs02 appropriately merges the information
in vmcs12 (the vmcs that L1 built for L2) and in vmcs01 (our desires for our
own guests).

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
---
 arch/x86/kvm/vmx.c |  269 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 269 insertions(+)


Comments

Tian, Kevin May 24, 2011, 8:02 a.m. UTC | #1
> From: Nadav Har'El
> Sent: Tuesday, May 17, 2011 3:53 AM
>
> This patch contains code to prepare the VMCS which can be used to actually
> run the L2 guest, vmcs02. prepare_vmcs02 appropriately merges the
> information
> in vmcs12 (the vmcs that L1 built for L2) and in vmcs01 (our desires for our
> own guests).
>
> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> ---
>  arch/x86/kvm/vmx.c |  269
> +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 269 insertions(+)
>
> --- .before/arch/x86/kvm/vmx.c        2011-05-16 22:36:48.000000000 +0300
> +++ .after/arch/x86/kvm/vmx.c 2011-05-16 22:36:48.000000000 +0300
> @@ -347,6 +347,12 @@ struct nested_vmx {
>       /* vmcs02_list cache of VMCSs recently used to run L2 guests */
>       struct list_head vmcs02_pool;
>       int vmcs02_num;
> +     u64 vmcs01_tsc_offset;
> +     /*
> +      * Guest pages referred to in vmcs02 with host-physical pointers, so
> +      * we must keep them pinned while L2 runs.
> +      */
> +     struct page *apic_access_page;
>  };
>
>  struct vcpu_vmx {
> @@ -849,6 +855,18 @@ static inline bool report_flexpriority(v
>       return flexpriority_enabled;
>  }
>
> +static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
> +{
> +     return vmcs12->cpu_based_vm_exec_control & bit;
> +}
> +
> +static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
> +{
> +     return (vmcs12->cpu_based_vm_exec_control &
> +                     CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
> +             (vmcs12->secondary_vm_exec_control & bit);
> +}
> +
>  static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
>  {
>       int i;
> @@ -1435,6 +1453,22 @@ static void vmx_fpu_activate(struct kvm_
>
>  static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
>
> +/*
> + * Return the cr0 value that a nested guest would read. This is a combination
> + * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
> + * its hypervisor (cr0_read_shadow).
> + */
> +static inline unsigned long guest_readable_cr0(struct vmcs12 *fields)
> +{
> +     return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
> +             (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
> +}
> +static inline unsigned long guest_readable_cr4(struct vmcs12 *fields)
> +{
> +     return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
> +             (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
> +}
> +

Will the guest_ prefix look confusing here? 'guest' has a broad meaning, which
makes the two functions above look like they could be used in the non-nested
case as well. Should we stick to a nested_ prefix for nested-specific facilities?

>  static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
>  {
>       vmx_decache_cr0_guest_bits(vcpu);
> @@ -3423,6 +3457,9 @@ static void set_cr4_guest_host_mask(stru
>       vmx->vcpu.arch.cr4_guest_owned_bits =
> KVM_CR4_GUEST_OWNED_BITS;
>       if (enable_ept)
>               vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
> +     if (is_guest_mode(&vmx->vcpu))
> +             vmx->vcpu.arch.cr4_guest_owned_bits &=
> +                     ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;

why not is_nested_mode()? :-P

>       vmcs_writel(CR4_GUEST_HOST_MASK,
> ~vmx->vcpu.arch.cr4_guest_owned_bits);
>  }
>
> @@ -4760,6 +4797,11 @@ static void free_nested(struct vcpu_vmx
>               vmx->nested.current_vmptr = -1ull;
>               vmx->nested.current_vmcs12 = NULL;
>       }
> +     /* Unpin physical memory we referred to in current vmcs02 */
> +     if (vmx->nested.apic_access_page) {
> +             nested_release_page(vmx->nested.apic_access_page);
> +             vmx->nested.apic_access_page = 0;
> +     }
>
>       nested_free_all_saved_vmcss(vmx);
>  }
> @@ -5829,6 +5871,233 @@ static void vmx_set_supported_cpuid(u32
>  }
>
>  /*
> + * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
> + * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
> + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
> + * guest in a way that will both be appropriate to L1's requests, and our
> + * needs. In addition to modifying the active vmcs (which is vmcs02), this
> + * function also has additional necessary side-effects, like setting various
> + * vcpu->arch fields.
> + */
> +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
> +{
> +     struct vcpu_vmx *vmx = to_vmx(vcpu);
> +     u32 exec_control;
> +
> +     vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
> +     vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
> +     vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
> +     vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
> +     vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
> +     vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
> +     vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
> +     vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
> +     vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
> +     vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
> +     vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
> +     vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
> +     vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
> +     vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
> +     vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
> +     vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
> +     vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
> +     vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
> +     vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
> +     vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
> +     vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
> +     vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
> +     vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
> +     vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
> +     vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
> +     vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
> +     vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
> +     vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
> +     vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
> +     vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
> +     vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
> +     vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
> +     vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
> +     vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
> +     vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
> +     vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
> +
> +     vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
> +     vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
> +             vmcs12->vm_entry_intr_info_field);
> +     vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
> +             vmcs12->vm_entry_exception_error_code);
> +     vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
> +             vmcs12->vm_entry_instruction_len);
> +     vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
> +             vmcs12->guest_interruptibility_info);
> +     vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
> +     vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
> +     vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
> +     vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
> +     vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
> +             vmcs12->guest_pending_dbg_exceptions);
> +     vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
> +     vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
> +
> +     vmcs_write64(VMCS_LINK_POINTER, -1ull);
> +
> +     if (nested_cpu_has2(vmcs12,
> SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
> +             struct page *page =
> +                     nested_get_page(vcpu, vmcs12->apic_access_addr);
> +             if (!page)
> +                     return 1;
> +             vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
> +             /*
> +              * Keep the page pinned, so its physical address we just wrote
> +              * remains valid. We keep a reference to it so we can release
> +              * it later.
> +              */
> +             if (vmx->nested.apic_access_page) /* shouldn't happen... */
> +                     nested_release_page(vmx->nested.apic_access_page);
> +             vmx->nested.apic_access_page = page;
> +     }
> +
> +     vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
> +             (vmcs_config.pin_based_exec_ctrl |
> +              vmcs12->pin_based_vm_exec_control));
> +
> +     /*
> +      * Whether page-faults are trapped is determined by a combination of
> +      * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
> +      * If enable_ept, L0 doesn't care about page faults and we should
> +      * set all of these to L1's desires. However, if !enable_ept, L0 does
> +      * care about (at least some) page faults, and because it is not easy
> +      * (if at all possible?) to merge L0 and L1's desires, we simply ask
> +      * to exit on each and every L2 page fault. This is done by setting
> +      * MASK=MATCH=0 and (see below) EB.PF=1.
> +      * Note that below we don't need special code to set EB.PF beyond the
> +      * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
> +      * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
> +      * !enable_ept, EB.PF is 1, so the "or" will always be 1.
> +      *
> +      * A problem with this approach (when !enable_ept) is that L1 may be
> +      * injected with more page faults than it asked for. This could have
> +      * caused problems, but in practice existing hypervisors don't care.
> +      * To fix this, we will need to emulate the PFEC checking (on the L1
> +      * page tables), using walk_addr(), when injecting PFs to L1.
> +      */
> +     vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
> +             enable_ept ? vmcs12->page_fault_error_code_mask : 0);
> +     vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
> +             enable_ept ? vmcs12->page_fault_error_code_match : 0);
> +
> +     if (cpu_has_secondary_exec_ctrls()) {
> +             u32 exec_control = vmx_secondary_exec_control(vmx);
> +             if (!vmx->rdtscp_enabled)
> +                     exec_control &= ~SECONDARY_EXEC_RDTSCP;
> +             /* Take the following fields only from vmcs12 */
> +             exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +             if (nested_cpu_has(vmcs12,
> +                             CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
> +                     exec_control |= vmcs12->secondary_vm_exec_control;

Should this 2nd exec_control be merged in a clear, case-by-case fashion?

What about the case where L0 sets the "virtualize x2APIC" bit while L1 doesn't?

Or the case where L0 disables EPT while L1 sets it?

I think it's better to scrutinize every 2nd exec_control feature with a
clear policy:
- whether we want the stricter policy, where a bit is set only when both L0
and L1 set it
- whether we want to take L1's setting unconditionally, regardless of L0's
setting, as you did for virtualize APIC accesses
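
To illustrate (purely a sketch, not code from this patch, and the helper name
is invented), such a case-by-case merge might look like:

static u32 nested_merge_secondary_exec_control(u32 l0_ctl, u32 l1_ctl)
{
	u32 merged = 0;

	/* exiting controls: exit whenever either L0 or L1 wants the exit */
	merged |= (l0_ctl | l1_ctl) & SECONDARY_EXEC_WBINVD_EXITING;

	/* features L2 may use only if both L0 and L1 enabled them */
	merged |= (l0_ctl & l1_ctl) & SECONDARY_EXEC_RDTSCP;

	/* taken from vmcs12 alone: L1 decides, regardless of L0 */
	merged |= l1_ctl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	return merged;
}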

> +             vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
> +     }
> +
> +     /*
> +      * Set host-state according to L0's settings (vmcs12 is irrelevant here)
> +      * Some constant fields are set here by vmx_set_constant_host_state().
> +      * Other fields are different per CPU, and will be set later when
> +      * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
> +      */
> +     vmx_set_constant_host_state();
> +
> +     /*
> +      * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
> +      * entry, but only if the current (host) sp changed from the value
> +      * we wrote last (vmx->host_rsp). This cache is no longer relevant
> +      * if we switch vmcs, and rather than hold a separate cache per vmcs,
> +      * here we just force the write to happen on entry.
> +      */
> +     vmx->host_rsp = 0;
> +
> +     exec_control = vmx_exec_control(vmx); /* L0's desires */
> +     exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> +     exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> +     exec_control &= ~CPU_BASED_TPR_SHADOW;
> +     exec_control |= vmcs12->cpu_based_vm_exec_control;

Similarly, I think a default OR for the other features may be risky. Most of
the time we want to choose the more conservative of the L0/L1 settings, or
simply borrow L1's setting. A default OR here doesn't make that policy clear.
Better to spell out every control bit explicitly with its desired merge policy.


Thanks
Kevin

> +     /*
> +      * Merging of IO and MSR bitmaps not currently supported.
> +      * Rather, exit every time.
> +      */
> +     exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
> +     exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
> +     exec_control |= CPU_BASED_UNCOND_IO_EXITING;
> +
> +     vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
> +
> +     /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be
> the
> +      * bitwise-or of what L1 wants to trap for L2, and what we want to
> +      * trap. Note that CR0.TS also needs updating - we do this later.
> +      */
> +     update_exception_bitmap(vcpu);
> +     vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
> +     vmcs_writel(CR0_GUEST_HOST_MASK,
> ~vcpu->arch.cr0_guest_owned_bits);
> +
> +     /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer
> below */
> +     vmcs_write32(VM_EXIT_CONTROLS,
> +             vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
> +     vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
> +             (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
> +
> +     if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
> +             vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
> +     else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> +             vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
> +
> +
> +     set_cr4_guest_host_mask(vmx);
> +
> +     vmcs_write64(TSC_OFFSET,
> +             vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
> +
> +     if (enable_vpid) {
> +             /*
> +              * Trivially support vpid by letting L2s share their parent
> +              * L1's vpid. TODO: move to a more elaborate solution, giving
> +              * each L2 its own vpid and exposing the vpid feature to L1.
> +              */
> +             vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
> +             vmx_flush_tlb(vcpu);
> +     }
> +
> +     if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
> +             vcpu->arch.efer = vmcs12->guest_ia32_efer;
> +     if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
> +             vcpu->arch.efer |= (EFER_LMA | EFER_LME);
> +     else
> +             vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
> +     /* Note: modifies VM_ENTRY/EXIT_CONTROLS and
> GUEST/HOST_IA32_EFER */
> +     vmx_set_efer(vcpu, vcpu->arch.efer);
> +
> +     /*
> +      * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
> +      * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
> +      * The CR0_READ_SHADOW is what L2 should have expected to read
> given
> +      * the specifications by L1; It's not enough to take
> +      * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
> +      * have more bits than L1 expected.
> +      */
> +     vmx_set_cr0(vcpu, vmcs12->guest_cr0);
> +     vmcs_writel(CR0_READ_SHADOW, guest_readable_cr0(vmcs12));
> +
> +     vmx_set_cr4(vcpu, vmcs12->guest_cr4);
> +     vmcs_writel(CR4_READ_SHADOW, guest_readable_cr4(vmcs12));
> +
> +     /* shadow page tables on either EPT or shadow page tables */
> +     kvm_set_cr3(vcpu, vmcs12->guest_cr3);
> +     kvm_mmu_reset_context(vcpu);
> +
> +     kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
> +     kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
> +     return 0;
> +}
> +
> +/*
>   * Maintain the vcpus_on_cpu and saved_vmcss_on_cpu lists of vcpus and
>   * inactive saved_vmcss on nested entry (L1->L2) or nested exit (L2->L1).
>   *
Nadav Har'El May 24, 2011, 9:19 a.m. UTC | #2
On Tue, May 24, 2011, Tian, Kevin wrote about "RE: [PATCH 17/31] nVMX: Prepare vmcs02 from vmcs01 and vmcs12":
> > +static inline unsigned long guest_readable_cr4(struct vmcs12 *fields)
> > +{
> > +     return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
> > +             (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
> > +}
> > +
> 
> will guest_ prefix look confusing here? The 'guest' has a broad range which makes
> above two functions look like they can be used in non-nested case. Should we stick
> to nested_prefix for nested specific facilities?

I don't know, I thought it made calls like

	vmcs_writel(CR0_READ_SHADOW, guest_readable_cr0(vmcs12));

readable, and the comments (and the parameters) make it obvious it's for
nested only.

I now renamed these functions nested_read_cr0(), nested_read_cr4() - I hope
you like these names better.
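
For reference, the renamed helpers keep exactly the same bodies as in the
patch; only the names change:

static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
{
	return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
		(fields->cr0_read_shadow & fields->cr0_guest_host_mask);
}
static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
{
	return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
		(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
}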
	
> > +     if (is_guest_mode(&vmx->vcpu))
> > +             vmx->vcpu.arch.cr4_guest_owned_bits &=
> > +                     ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
> 
> why not is_nested_mode()? :-P

I assume you're wondering why the function is called is_guest_mode(), and
not is_nested_mode()?

This name was chosen by Avi Kivity in November last year, for the function
previously introduced by Joerg Roedel. My original code (before Joerg added
this function to x86.c) indeed used the term "nested_mode", not "guest_mode".

In January, I pointed to the possibility of confusion between the new
is_guest_mode() and other things called "guest mode", and Avi Kivity said
he will rename it to is_nested_guest() - see
http://lkml.indiana.edu/hypermail/linux/kernel/1101.1/01418.html
But as you can see, he never did this renaming.

That being said, after half a year, I got used to the name is_guest_mode(),
and am no longer convinced it should be changed. It checks whether the vcpu
(not the underlying CPU) is in Intel-SDM-terminology "guest mode". Just like
is_long_mode() checks if the vcpu is in long mode. So I'm fine with leaving
its current name.

> > +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
> > +{
>...
> > +             if (!vmx->rdtscp_enabled)
> > +                     exec_control &= ~SECONDARY_EXEC_RDTSCP;
> > +             /* Take the following fields only from vmcs12 */
> > +             exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> > +             if (nested_cpu_has(vmcs12,
> > +                             CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
> > +                     exec_control |= vmcs12->secondary_vm_exec_control;
> 
> should this 2nd exec_control be merged in clear case-by-case flavor?
> 
> what about L0 sets "virtualize x2APIC" bit while L1 doesn't?
> 
> Or what about L0 disables EPT while L1 sets it?
> 
> I think it's better to scrutinize every 2nd exec_control feature with a
> clear policy:
> - whether we want to use the stricter policy which is only set when both L0 and
> L1 set it
> - whether we want to use L1 setting absolutely regardless of L0 setting like
> what you did for virtualize APIC access

Please note that most of the examples you give cannot happen in practice,
because we tell L1 (via MSR) which features it is allowed to use, and we
fail entry if it tries to use disallowed features (before ever reaching
the merge code you're commenting on). So we don't allow L1, for example,
to use the EPT feature (and when nested-EPT support is added, we won't
allow L1 to use EPT if L0 didn't). The general thinking was that for most
fields that we do explicitly allow, "OR" is the right choice.
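
To make this concrete, the entry-time check is roughly the following sketch
(the helper and the MSR-derived variable names here are illustrative and may
differ in the actual series):

/*
 * "low" holds the bits we require to be 1, "high" the bits we allow to be 1,
 * as advertised to L1 in the corresponding VMX capability MSR pair.
 */
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return ((control & high) | low) == control;
}

/*
 * Before the merge, each vmcs12 control field is checked with something like
 * vmx_control_verify(vmcs12->secondary_vm_exec_control, ctls_low, ctls_high),
 * and the nested entry fails if the check does not pass.
 */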

I'll add this to my bugzilla, and think about it again later.

Thanks,
Nadav
Tian, Kevin May 24, 2011, 10:52 a.m. UTC | #3
> From: Nadav Har'El
> Sent: Tuesday, May 24, 2011 5:19 PM
> 
> On Tue, May 24, 2011, Tian, Kevin wrote about "RE: [PATCH 17/31] nVMX:
> Prepare vmcs02 from vmcs01 and vmcs12":
> > > +static inline unsigned long guest_readable_cr4(struct vmcs12 *fields)
> > > +{
> > > +     return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
> > > +             (fields->cr4_read_shadow &
> fields->cr4_guest_host_mask);
> > > +}
> > > +
> >
> > will guest_ prefix look confusing here? The 'guest' has a broad range which
> makes
> > above two functions look like they can be used in non-nested case. Should we
> stick
> > to nested_prefix for nested specific facilities?
> 
> I don't know, I thought it made calls like
> 
> 	vmcs_writel(CR0_READ_SHADOW, guest_readable_cr0(vmcs12));
> 
> readable, and the comments (and the parameters) make it obvious it's for
> nested only.
> 
> I now renamed these functions nested_read_cr0(), nested_read_cr4() - I hope
> you like these names better.

yes.

> 
> > > +     if (is_guest_mode(&vmx->vcpu))
> > > +             vmx->vcpu.arch.cr4_guest_owned_bits &=
> > > +
> ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
> >
> > why not is_nested_mode()? :-P
> 
> I assume you're wondering why the function is called is_guest_mode(), and
> not is_nested_mode()?

yes

> 
> This name was chosen by Avi Kivity in November last year, for the function
> previously introduced by Joerg Roedel. My original code (before Joerg added
> this function to x86.c) indeed used the term "nested_mode", not
> "guest_mode".
> 
> In January, I pointed to the possibility of confusion between the new
> is_guest_mode() and other things called "guest mode", and Avi Kivity said
> he will rename it to is_nested_guest() - see
> http://lkml.indiana.edu/hypermail/linux/kernel/1101.1/01418.html
> But as you can see, he never did this renaming.
> 
> That being said, after half a year, I got used to the name is_guest_mode(),
> and am no longer convinced it should be changed. It checks whether the vcpu
> (not the underlying CPU) is in Intel-SDM-terminology "guest mode". Just like
> is_long_mode() checks if the vcpu is in long mode. So I'm fine with leaving
> its current name.

well, it's a small issue, and I'm fine with leaving it though I don't like 'guest' here. :-)

> 
> > > +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12
> *vmcs12)
> > > +{
> >...
> > > +             if (!vmx->rdtscp_enabled)
> > > +                     exec_control &= ~SECONDARY_EXEC_RDTSCP;
> > > +             /* Take the following fields only from vmcs12 */
> > > +             exec_control &=
> ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> > > +             if (nested_cpu_has(vmcs12,
> > > +
> CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
> > > +                     exec_control |=
> vmcs12->secondary_vm_exec_control;
> >
> > should this 2nd exec_control be merged in clear case-by-case flavor?
> >
> > what about L0 sets "virtualize x2APIC" bit while L1 doesn't?
> >
> > Or what about L0 disables EPT while L1 sets it?
> >
> > I think it's better to scrutinize every 2nd exec_control feature with a
> > clear policy:
> > - whether we want to use the stricter policy which is only set when both L0
> and
> > L1 set it
> > - whether we want to use L1 setting absolutely regardless of L0 setting like
> > what you did for virtualize APIC access
> 
> Please note that most of the examples you give cannot happen in practice,
> because we tell L1 (via MSR) which features it is allowed to use, and we
> fail entry if it tries to use disallowed features (before ever reaching
> the merge code you're commenting on). So we don't allow L1, for example,
> to use the EPT feature (and when nested-EPT support is added, we won't
> allow L1 to use EPT if L0 didn't). The general thinking was that for most
> fields that we do explicitly allow, "OR" is the right choice.

This really depends on the meaning of each control bit. To achieve the
strictest setting between L0/L1, sometimes you want to use AND and sometimes
you want to use OR.

From a design p.o.v., it's better not to rely on such an implicit assumption
made elsewhere; just make it clean and correct here. Also, your argument
doesn't cover the case where L0 sets bits which are not exposed to L1 via the
MSR. For example, as I said earlier, what if L0 sets "virtualize x2APIC mode"
while it's not enabled by, or not exposed to, L1? With OR, you then
unconditionally enable this mode for L2, while L1 has no logic to handle it.

I'd like to see a clean policy for the known control bits here, even if a
strict policy incurs more VM exits; those can be optimized in the future.

> 
> I'll add this to my bugzilla, and think about it again later.
>

Thanks
Kevin

Patch

--- .before/arch/x86/kvm/vmx.c	2011-05-16 22:36:48.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c	2011-05-16 22:36:48.000000000 +0300
@@ -347,6 +347,12 @@  struct nested_vmx {
 	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
 	struct list_head vmcs02_pool;
 	int vmcs02_num;
+	u64 vmcs01_tsc_offset;
+	/*
+	 * Guest pages referred to in vmcs02 with host-physical pointers, so
+	 * we must keep them pinned while L2 runs.
+	 */
+	struct page *apic_access_page;
 };
 
 struct vcpu_vmx {
@@ -849,6 +855,18 @@  static inline bool report_flexpriority(v
 	return flexpriority_enabled;
 }
 
+static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+{
+	return vmcs12->cpu_based_vm_exec_control & bit;
+}
+
+static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+{
+	return (vmcs12->cpu_based_vm_exec_control &
+			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+		(vmcs12->secondary_vm_exec_control & bit);
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -1435,6 +1453,22 @@  static void vmx_fpu_activate(struct kvm_
 
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
 
+/*
+ * Return the cr0 value that a nested guest would read. This is a combination
+ * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
+ * its hypervisor (cr0_read_shadow).
+ */
+static inline unsigned long guest_readable_cr0(struct vmcs12 *fields)
+{
+	return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
+		(fields->cr0_read_shadow & fields->cr0_guest_host_mask);
+}
+static inline unsigned long guest_readable_cr4(struct vmcs12 *fields)
+{
+	return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
+		(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
+}
+
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
 	vmx_decache_cr0_guest_bits(vcpu);
@@ -3423,6 +3457,9 @@  static void set_cr4_guest_host_mask(stru
 	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
 	if (enable_ept)
 		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+	if (is_guest_mode(&vmx->vcpu))
+		vmx->vcpu.arch.cr4_guest_owned_bits &=
+			~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
 	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 }
 
@@ -4760,6 +4797,11 @@  static void free_nested(struct vcpu_vmx 
 		vmx->nested.current_vmptr = -1ull;
 		vmx->nested.current_vmcs12 = NULL;
 	}
+	/* Unpin physical memory we referred to in current vmcs02 */
+	if (vmx->nested.apic_access_page) {
+		nested_release_page(vmx->nested.apic_access_page);
+		vmx->nested.apic_access_page = 0;
+	}
 
 	nested_free_all_saved_vmcss(vmx);
 }
@@ -5829,6 +5871,233 @@  static void vmx_set_supported_cpuid(u32 
 }
 
 /*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ */
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 exec_control;
+
+	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+	vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+	vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+	vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+	vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+	vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+	vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+	vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+	vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+	vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+	vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+	vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+	vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+	vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+	vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+	vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+	vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+	vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+	vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+	vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+	vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+	vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+	vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+	vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+	vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+	vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+	vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+	vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+	vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+	vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+	vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+		vmcs12->vm_entry_intr_info_field);
+	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+		vmcs12->vm_entry_exception_error_code);
+	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+		vmcs12->vm_entry_instruction_len);
+	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+		vmcs12->guest_interruptibility_info);
+	vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
+	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+	vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
+	vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
+	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+		vmcs12->guest_pending_dbg_exceptions);
+	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+	vmcs_write64(VMCS_LINK_POINTER, -1ull);
+
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+		struct page *page =
+			nested_get_page(vcpu, vmcs12->apic_access_addr);
+		if (!page)
+			return 1;
+		vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+		/*
+		 * Keep the page pinned, so its physical address we just wrote
+		 * remains valid. We keep a reference to it so we can release
+		 * it later.
+		 */
+		if (vmx->nested.apic_access_page) /* shouldn't happen... */
+			nested_release_page(vmx->nested.apic_access_page);
+		vmx->nested.apic_access_page = page;
+	}
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+		(vmcs_config.pin_based_exec_ctrl |
+		 vmcs12->pin_based_vm_exec_control));
+
+	/*
+	 * Whether page-faults are trapped is determined by a combination of
+	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+	 * If enable_ept, L0 doesn't care about page faults and we should
+	 * set all of these to L1's desires. However, if !enable_ept, L0 does
+	 * care about (at least some) page faults, and because it is not easy
+	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
+	 * to exit on each and every L2 page fault. This is done by setting
+	 * MASK=MATCH=0 and (see below) EB.PF=1.
+	 * Note that below we don't need special code to set EB.PF beyond the
+	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+	 *
+	 * A problem with this approach (when !enable_ept) is that L1 may be
+	 * injected with more page faults than it asked for. This could have
+	 * caused problems, but in practice existing hypervisors don't care.
+	 * To fix this, we will need to emulate the PFEC checking (on the L1
+	 * page tables), using walk_addr(), when injecting PFs to L1.
+	 */
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		enable_ept ? vmcs12->page_fault_error_code_match : 0);
+
+	if (cpu_has_secondary_exec_ctrls()) {
+		u32 exec_control = vmx_secondary_exec_control(vmx);
+		if (!vmx->rdtscp_enabled)
+			exec_control &= ~SECONDARY_EXEC_RDTSCP;
+		/* Take the following fields only from vmcs12 */
+		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		if (nested_cpu_has(vmcs12,
+				CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
+			exec_control |= vmcs12->secondary_vm_exec_control;
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+	}
+
+	/*
+	 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
+	 * Some constant fields are set here by vmx_set_constant_host_state().
+	 * Other fields are different per CPU, and will be set later when
+	 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
+	 */
+	vmx_set_constant_host_state();
+
+	/*
+	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
+	 * entry, but only if the current (host) sp changed from the value
+	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
+	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
+	 * here we just force the write to happen on entry.
+	 */
+	vmx->host_rsp = 0;
+
+	exec_control = vmx_exec_control(vmx); /* L0's desires */
+	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+	exec_control &= ~CPU_BASED_TPR_SHADOW;
+	exec_control |= vmcs12->cpu_based_vm_exec_control;
+	/*
+	 * Merging of IO and MSR bitmaps not currently supported.
+	 * Rather, exit every time.
+	 */
+	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
+	 * bitwise-or of what L1 wants to trap for L2, and what we want to
+	 * trap. Note that CR0.TS also needs updating - we do this later.
+	 */
+	update_exception_bitmap(vcpu);
+	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
+	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+	/* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
+	vmcs_write32(VM_EXIT_CONTROLS,
+		vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
+	vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
+
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
+	else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+
+
+	set_cr4_guest_host_mask(vmx);
+
+	vmcs_write64(TSC_OFFSET,
+		vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+
+	if (enable_vpid) {
+		/*
+		 * Trivially support vpid by letting L2s share their parent
+		 * L1's vpid. TODO: move to a more elaborate solution, giving
+		 * each L2 its own vpid and exposing the vpid feature to L1.
+		 */
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+		vmx_flush_tlb(vcpu);
+	}
+
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
+		vcpu->arch.efer = vmcs12->guest_ia32_efer;
+	if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+	else
+		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+	/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+	vmx_set_efer(vcpu, vcpu->arch.efer);
+
+	/*
+	 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
+	 * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
+	 * The CR0_READ_SHADOW is what L2 should have expected to read given
+	 * the specifications by L1; It's not enough to take
+	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
+	 * have more bits than L1 expected.
+	 */
+	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
+	vmcs_writel(CR0_READ_SHADOW, guest_readable_cr0(vmcs12));
+
+	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
+	vmcs_writel(CR4_READ_SHADOW, guest_readable_cr4(vmcs12));
+
+	/* shadow page tables on either EPT or shadow page tables */
+	kvm_set_cr3(vcpu, vmcs12->guest_cr3);
+	kvm_mmu_reset_context(vcpu);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
+	return 0;
+}
+
+/*
  * Maintain the vcpus_on_cpu and saved_vmcss_on_cpu lists of vcpus and
  * inactive saved_vmcss on nested entry (L1->L2) or nested exit (L2->L1).
  *