diff mbox

[v4,3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor

Message ID 20170710204936.4001-4-bsd@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Bandan Das July 10, 2017, 8:49 p.m. UTC
When L2 uses vmfunc, L0 utilizes the associated vmexit to
emulate a switching of the ept pointer by reloading the
guest MMU.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Bandan Das <bsd@redhat.com>
---
 arch/x86/include/asm/vmx.h |  6 +++++
 arch/x86/kvm/vmx.c         | 58 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 61 insertions(+), 3 deletions(-)

Comments

David Hildenbrand July 11, 2017, 7:51 a.m. UTC | #1
On 10.07.2017 22:49, Bandan Das wrote:
> When L2 uses vmfunc, L0 utilizes the associated vmexit to
> emulate a switching of the ept pointer by reloading the
> guest MMU.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Bandan Das <bsd@redhat.com>
> ---
>  arch/x86/include/asm/vmx.h |  6 +++++
>  arch/x86/kvm/vmx.c         | 58 +++++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 61 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index da5375e..5f63a2e 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -115,6 +115,10 @@
>  #define VMX_MISC_SAVE_EFER_LMA			0x00000020
>  #define VMX_MISC_ACTIVITY_HLT			0x00000040
>  
> +/* VMFUNC functions */
> +#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
> +#define VMFUNC_EPTP_ENTRIES  512
> +
>  static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
>  {
>  	return vmx_basic & GENMASK_ULL(30, 0);
> @@ -200,6 +204,8 @@ enum vmcs_field {
>  	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
>  	EOI_EXIT_BITMAP3                = 0x00002022,
>  	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
> +	EPTP_LIST_ADDRESS               = 0x00002024,
> +	EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
>  	VMREAD_BITMAP                   = 0x00002026,
>  	VMWRITE_BITMAP                  = 0x00002028,
>  	XSS_EXIT_BITMAP                 = 0x0000202C,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index fe8f5fc..0a969fb 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -246,6 +246,7 @@ struct __packed vmcs12 {
>  	u64 eoi_exit_bitmap1;
>  	u64 eoi_exit_bitmap2;
>  	u64 eoi_exit_bitmap3;
> +	u64 eptp_list_address;
>  	u64 xss_exit_bitmap;
>  	u64 guest_physical_address;
>  	u64 vmcs_link_pointer;
> @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
>  	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
>  	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
>  	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
> +	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
>  	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
>  	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
>  	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
> @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
>  	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
>  }
>  
> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
> +{
> +	return nested_cpu_has_vmfunc(vmcs12) &&
> +		(vmcs12->vm_function_control &

I wonder if it makes sense to rename vm_function_control to
- vmfunc_control
- vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
- vmfunc_ctrl

> +		 VMX_VMFUNC_EPTP_SWITCHING);
> +}
> +
>  static inline bool is_nmi(u32 intr_info)
>  {
>  	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>  	if (cpu_has_vmx_vmfunc()) {
>  		vmx->nested.nested_vmx_secondary_ctls_high |=
>  			SECONDARY_EXEC_ENABLE_VMFUNC;
> -		vmx->nested.nested_vmx_vmfunc_controls = 0;
> +		/*
> +		 * Advertise EPTP switching unconditionally
> +		 * since we emulate it
> +		 */
> +		vmx->nested.nested_vmx_vmfunc_controls =
> +			VMX_VMFUNC_EPTP_SWITCHING;>  	}
>  
>  	/*
> @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	struct vmcs12 *vmcs12;
>  	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
> +	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
> +	struct page *page = NULL;
> +	u64 *l1_eptp_list, address;
>  
>  	/*
>  	 * VMFUNC is only supported for nested guests, but we always enable the
> @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>  	}
>  
>  	vmcs12 = get_vmcs12(vcpu);
> -	if ((vmcs12->vm_function_control & (1 << function)) == 0)
> +	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
> +	    WARN_ON_ONCE(function))

"... instruction causes a VM exit if the bit at position EAX is 0 in the
VM-function controls (the selected VM function is
not enabled)."

So L2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
completely.

> +		goto fail;
> +
> +	if (!nested_cpu_has_ept(vmcs12) ||
> +	    !nested_cpu_has_eptp_switching(vmcs12))
> +		goto fail;
> +
> +	if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
> +		goto fail;

I can find the definition for a vmexit in case of index >=
VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.

Can you give me a hint?

> +
> +	page = nested_get_page(vcpu, vmcs12->eptp_list_address);
> +	if (!page)
>  		goto fail;
> -	WARN_ONCE(1, "VMCS12 VM function control should have been zero");
> +
> +	l1_eptp_list = kmap(page);
> +	address = l1_eptp_list[index];
> +	if (!address)
> +		goto fail;

Can you move that check to the other address checks below? (or rework if
this make sense, see below)

> +	/*
> +	 * If the (L2) guest does a vmfunc to the currently
> +	 * active ept pointer, we don't have to do anything else
> +	 */
> +	if (vmcs12->ept_pointer != address) {
> +		if (address >> cpuid_maxphyaddr(vcpu) ||
> +		    !IS_ALIGNED(address, 4096))

Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
(triggering a KVM_REQ_TRIPLE_FAULT)

> +			goto fail;
> +		kvm_mmu_unload(vcpu);
> +		vmcs12->ept_pointer = address;
> +		kvm_mmu_reload(vcpu);

I was thinking about something like this:

kvm_mmu_unload(vcpu);
old = vmcs12->ept_pointer;
vmcs12->ept_pointer = address;
if (kvm_mmu_reload(vcpu)) {
	/* pointer invalid, restore previous state */
	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	vmcs12->ept_pointer = old;
	kvm_mmu_reload(vcpu);
	goto fail;
}

Then you can inherit the checks from mmu_check_root().


Wonder why I can't spot checks for cpuid_maxphyaddr() /
IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The
checks should be identical.


> +		kunmap(page);
> +		nested_release_page_clean(page);

shouldn't the kunmap + nested_release_page_clean go outside the if clause?

> +	}
> +	return kvm_skip_emulated_instruction(vcpu);
>  
>  fail:
> +	if (page) {
> +		kunmap(page);
> +		nested_release_page_clean(page);
> +	}
>  	nested_vmx_vmexit(vcpu, vmx->exit_reason,
>  			  vmcs_read32(VM_EXIT_INTR_INFO),
>  			  vmcs_readl(EXIT_QUALIFICATION));
> 

David and mmu code are not yet best friends. So sorry if I am missing
something.
Paolo Bonzini July 11, 2017, 8:39 a.m. UTC | #2
On 11/07/2017 09:51, David Hildenbrand wrote:
>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>> +{
>> +	return nested_cpu_has_vmfunc(vmcs12) &&
>> +		(vmcs12->vm_function_control &
>
> I wonder if it makes sense to rename vm_function_control to
> - vmfunc_control
> - vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
> - vmfunc_ctrl

Blame Intel for this. :) They use full English names for VMCS fields (so
"VM-function control") and uppercase names for MSRs (so "IA32_VMX_VMFUNC").

"nested_vmx_vmfunc_controls" could be renamed to "nested_vmx_vmfunc" for
consistency, but I like the longer name too.

Paolo
Radim Krčmář July 11, 2017, 1:52 p.m. UTC | #3
[David did a great review, so I'll just point out things I noticed.]

2017-07-11 09:51+0200, David Hildenbrand:
> On 10.07.2017 22:49, Bandan Das wrote:
> > When L2 uses vmfunc, L0 utilizes the associated vmexit to
> > emulate a switching of the ept pointer by reloading the
> > guest MMU.
> > 
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > Signed-off-by: Bandan Das <bsd@redhat.com>
> > ---
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
> >  	}
> >  
> >  	vmcs12 = get_vmcs12(vcpu);
> > -	if ((vmcs12->vm_function_control & (1 << function)) == 0)
> > +	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
> > +	    WARN_ON_ONCE(function))
> 
> "... instruction causes a VM exit if the bit at position EAX is 0 in the
> VM-function controls (the selected VM function is
> not enabled)."
> 
> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
> completely.

It assumes that vm_function_control is not > 1, which is (should be)
guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR
is 1.

> > +		goto fail;

The rest of the code assumes that the function is
VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable.

Writing it as

  WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING)

would be clearer and I'd prefer to move the part that handles
VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is
going to add more than one VM FUNC. :])

> > +	if (!nested_cpu_has_ept(vmcs12) ||
> > +	    !nested_cpu_has_eptp_switching(vmcs12))
> > +		goto fail;

This brings me to a missing vm-entry check:

 If “EPTP switching” VM-function control is 1, the “enable EPT”
 VM-execution control must also be 1. In addition, the EPTP-list address
 must satisfy the following checks:
 • Bits 11:0 of the address must be 0.
 • The address must not set any bits beyond the processor’s
   physical-address width.

so this one could be

  if (!nested_cpu_has_eptp_switching(vmcs12) ||
      WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))

after adding the check.
Bandan Das July 11, 2017, 5:58 p.m. UTC | #4
David Hildenbrand <david@redhat.com> writes:

> On 10.07.2017 22:49, Bandan Das wrote:
>> When L2 uses vmfunc, L0 utilizes the associated vmexit to
>> emulate a switching of the ept pointer by reloading the
>> guest MMU.
>> 
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> Signed-off-by: Bandan Das <bsd@redhat.com>
>> ---
>>  arch/x86/include/asm/vmx.h |  6 +++++
>>  arch/x86/kvm/vmx.c         | 58 +++++++++++++++++++++++++++++++++++++++++++---
>>  2 files changed, 61 insertions(+), 3 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index da5375e..5f63a2e 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -115,6 +115,10 @@
>>  #define VMX_MISC_SAVE_EFER_LMA			0x00000020
>>  #define VMX_MISC_ACTIVITY_HLT			0x00000040
>>  
>> +/* VMFUNC functions */
>> +#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
>> +#define VMFUNC_EPTP_ENTRIES  512
>> +
>>  static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
>>  {
>>  	return vmx_basic & GENMASK_ULL(30, 0);
>> @@ -200,6 +204,8 @@ enum vmcs_field {
>>  	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
>>  	EOI_EXIT_BITMAP3                = 0x00002022,
>>  	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
>> +	EPTP_LIST_ADDRESS               = 0x00002024,
>> +	EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
>>  	VMREAD_BITMAP                   = 0x00002026,
>>  	VMWRITE_BITMAP                  = 0x00002028,
>>  	XSS_EXIT_BITMAP                 = 0x0000202C,
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index fe8f5fc..0a969fb 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -246,6 +246,7 @@ struct __packed vmcs12 {
>>  	u64 eoi_exit_bitmap1;
>>  	u64 eoi_exit_bitmap2;
>>  	u64 eoi_exit_bitmap3;
>> +	u64 eptp_list_address;
>>  	u64 xss_exit_bitmap;
>>  	u64 guest_physical_address;
>>  	u64 vmcs_link_pointer;
>> @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
>>  	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
>>  	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
>>  	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
>> +	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
>>  	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
>>  	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
>>  	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
>> @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
>>  	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
>>  }
>>  
>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>> +{
>> +	return nested_cpu_has_vmfunc(vmcs12) &&
>> +		(vmcs12->vm_function_control &
>
> I wonder if it makes sense to rename vm_function_control to
> - vmfunc_control
> - vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
> - vmfunc_ctrl

I tend to follow the SDM names because it's easy to look for them.

>> +		 VMX_VMFUNC_EPTP_SWITCHING);
>> +}
>> +
>>  static inline bool is_nmi(u32 intr_info)
>>  {
>>  	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>>  	if (cpu_has_vmx_vmfunc()) {
>>  		vmx->nested.nested_vmx_secondary_ctls_high |=
>>  			SECONDARY_EXEC_ENABLE_VMFUNC;
>> -		vmx->nested.nested_vmx_vmfunc_controls = 0;
>> +		/*
>> +		 * Advertise EPTP switching unconditionally
>> +		 * since we emulate it
>> +		 */
>> +		vmx->nested.nested_vmx_vmfunc_controls =
>> +			VMX_VMFUNC_EPTP_SWITCHING;>  	}
>>  
>>  	/*
>> @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>>  	struct vmcs12 *vmcs12;
>>  	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
>> +	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>> +	struct page *page = NULL;
>> +	u64 *l1_eptp_list, address;
>>  
>>  	/*
>>  	 * VMFUNC is only supported for nested guests, but we always enable the
>> @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>  	}
>>  
>>  	vmcs12 = get_vmcs12(vcpu);
>> -	if ((vmcs12->vm_function_control & (1 << function)) == 0)
>> +	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>> +	    WARN_ON_ONCE(function))
>
> "... instruction causes a VM exit if the bit at position EAX is 0 in the
> VM-function controls (the selected VM function is
> not enabled)."
>
> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
> completely.

It's a good hint to see if L2 misbehaved and WARN_ON_ONCE makes sure it's
not misused.

>> +		goto fail;
>> +
>> +	if (!nested_cpu_has_ept(vmcs12) ||
>> +	    !nested_cpu_has_eptp_switching(vmcs12))
>> +		goto fail;
>> +
>> +	if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
>> +		goto fail;
>
> I can find the definition for an vmexit in case of index >=
> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>
> Can you give me a hint?

I don't think there is. Since, we are basically emulating eptp switching
for L2, this is a good check to have.

>> +
>> +	page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>> +	if (!page)
>>  		goto fail;
>> -	WARN_ONCE(1, "VMCS12 VM function control should have been zero");
>> +
>> +	l1_eptp_list = kmap(page);
>> +	address = l1_eptp_list[index];
>> +	if (!address)
>> +		goto fail;
>
> Can you move that check to the other address checks below? (or rework if
> this make sense, see below)
>
>> +	/*
>> +	 * If the (L2) guest does a vmfunc to the currently
>> +	 * active ept pointer, we don't have to do anything else
>> +	 */
>> +	if (vmcs12->ept_pointer != address) {
>> +		if (address >> cpuid_maxphyaddr(vcpu) ||
>> +		    !IS_ALIGNED(address, 4096))
>
> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
> (triggering a KVM_REQ_TRIPLE_FAULT)

If there's a triple fault, I think it's a good idea to inject it
back. Basically, there's no need to take care of damage control
that L1 is intentionally doing.

>> +			goto fail;
>> +		kvm_mmu_unload(vcpu);
>> +		vmcs12->ept_pointer = address;
>> +		kvm_mmu_reload(vcpu);
>
> I was thinking about something like this:
>
> kvm_mmu_unload(vcpu);
> old = vmcs12->ept_pointer;
> vmcs12->ept_pointer = address;
> if (kvm_mmu_reload(vcpu)) {
> 	/* pointer invalid, restore previous state */
> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> 	vmcs12->ept_pointer = old;
> 	kvm_mmu_reload(vcpu);
> 	goto fail;
> }
>
> The you can inherit the checks from mmu_check_root().
>
>
> Wonder why I can't spot checks for cpuid_maxphyaddr() /
> IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The
> checks should be identical.

I think the reason is vmcs12->ept_pointer is never used directly. It's
used to create a shadow table but nevertheless, the check should be there.

>
>> +		kunmap(page);
>> +		nested_release_page_clean(page);
>
> shouldn't the kunmap + nested_release_page_clean go outside the if clause?

:) Indeed, thanks for the catch.

Bandan

>> +	}
>> +	return kvm_skip_emulated_instruction(vcpu);
>>  
>>  fail:
>> +	if (page) {
>> +		kunmap(page);
>> +		nested_release_page_clean(page);
>> +	}
>>  	nested_vmx_vmexit(vcpu, vmx->exit_reason,
>>  			  vmcs_read32(VM_EXIT_INTR_INFO),
>>  			  vmcs_readl(EXIT_QUALIFICATION));
>> 
>
> David and mmu code are not yet best friends. So sorry if I am missing
> something.
Bandan Das July 11, 2017, 6:05 p.m. UTC | #5
Radim Krčmář <rkrcmar@redhat.com> writes:

> [David did a great review, so I'll just point out things I noticed.]
>
> 2017-07-11 09:51+0200, David Hildenbrand:
>> On 10.07.2017 22:49, Bandan Das wrote:
>> > When L2 uses vmfunc, L0 utilizes the associated vmexit to
>> > emulate a switching of the ept pointer by reloading the
>> > guest MMU.
>> > 
>> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> > Signed-off-by: Bandan Das <bsd@redhat.com>
>> > ---
>> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>> >  	}
>> >  
>> >  	vmcs12 = get_vmcs12(vcpu);
>> > -	if ((vmcs12->vm_function_control & (1 << function)) == 0)
>> > +	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>> > +	    WARN_ON_ONCE(function))
>> 
>> "... instruction causes a VM exit if the bit at position EAX is 0 in the
>> VM-function controls (the selected VM function is
>> not enabled)."
>> 
>> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
>> completely.
>
> It assumes that vm_function_control is not > 1, which is (should be)
> guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR
> is 1.
>
>> > +		goto fail;
>
> The rest of the code assumes that the function is
> VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable.
>
> Writing it as
>
>   WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING)
>
> would be cleared and I'd prefer to move the part that handles
> VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is
> going to add more than one VM FUNC. :])

IMO, for now, this should be fine because we are not even passing through the
hardware's eptp switching. Even if there are other vm functions, they
won't be available for the nested case and cause any conflict.

>> > +	if (!nested_cpu_has_ept(vmcs12) ||
>> > +	    !nested_cpu_has_eptp_switching(vmcs12))
>> > +		goto fail;
>
> This brings me to a missing vm-entry check:
>
>  If “EPTP switching” VM-function control is 1, the “enable EPT”
>  VM-execution control must also be 1. In addition, the EPTP-list address
>  must satisfy the following checks:
>  • Bits 11:0 of the address must be 0.
>  • The address must not set any bits beyond the processor’s
>    physical-address width.
>
> so this one could be
>
>   if (!nested_cpu_has_eptp_switching(vmcs12) ||
>       WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))

I will reverse the order here but the vm entry check is unnecessary because
the check on the list address is already being done in this function.

> after adding the check.
Jim Mattson July 11, 2017, 6:22 p.m. UTC | #6
On Tue, Jul 11, 2017 at 10:58 AM, Bandan Das <bsd@redhat.com> wrote:
> David Hildenbrand <david@redhat.com> writes:
>
>> On 10.07.2017 22:49, Bandan Das wrote:
>>> When L2 uses vmfunc, L0 utilizes the associated vmexit to
>>> emulate a switching of the ept pointer by reloading the
>>> guest MMU.
>>>
>>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>>> Signed-off-by: Bandan Das <bsd@redhat.com>
>>> ---
>>>  arch/x86/include/asm/vmx.h |  6 +++++
>>>  arch/x86/kvm/vmx.c         | 58 +++++++++++++++++++++++++++++++++++++++++++---
>>>  2 files changed, 61 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>>> index da5375e..5f63a2e 100644
>>> --- a/arch/x86/include/asm/vmx.h
>>> +++ b/arch/x86/include/asm/vmx.h
>>> @@ -115,6 +115,10 @@
>>>  #define VMX_MISC_SAVE_EFER_LMA                      0x00000020
>>>  #define VMX_MISC_ACTIVITY_HLT                       0x00000040
>>>
>>> +/* VMFUNC functions */
>>> +#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
>>> +#define VMFUNC_EPTP_ENTRIES  512
>>> +
>>>  static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
>>>  {
>>>      return vmx_basic & GENMASK_ULL(30, 0);
>>> @@ -200,6 +204,8 @@ enum vmcs_field {
>>>      EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
>>>      EOI_EXIT_BITMAP3                = 0x00002022,
>>>      EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
>>> +    EPTP_LIST_ADDRESS               = 0x00002024,
>>> +    EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
>>>      VMREAD_BITMAP                   = 0x00002026,
>>>      VMWRITE_BITMAP                  = 0x00002028,
>>>      XSS_EXIT_BITMAP                 = 0x0000202C,
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index fe8f5fc..0a969fb 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -246,6 +246,7 @@ struct __packed vmcs12 {
>>>      u64 eoi_exit_bitmap1;
>>>      u64 eoi_exit_bitmap2;
>>>      u64 eoi_exit_bitmap3;
>>> +    u64 eptp_list_address;
>>>      u64 xss_exit_bitmap;
>>>      u64 guest_physical_address;
>>>      u64 vmcs_link_pointer;
>>> @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
>>>      FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
>>>      FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
>>>      FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
>>> +    FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
>>>      FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
>>>      FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
>>>      FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
>>> @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
>>>      return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
>>>  }
>>>
>>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>>> +{
>>> +    return nested_cpu_has_vmfunc(vmcs12) &&
>>> +            (vmcs12->vm_function_control &
>>
>> I wonder if it makes sense to rename vm_function_control to
>> - vmfunc_control
>> - vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
>> - vmfunc_ctrl
>
> I tend to follow the SDM names because it's easy to look for them.
>
>>> +             VMX_VMFUNC_EPTP_SWITCHING);
>>> +}
>>> +
>>>  static inline bool is_nmi(u32 intr_info)
>>>  {
>>>      return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>>> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>>>      if (cpu_has_vmx_vmfunc()) {
>>>              vmx->nested.nested_vmx_secondary_ctls_high |=
>>>                      SECONDARY_EXEC_ENABLE_VMFUNC;
>>> -            vmx->nested.nested_vmx_vmfunc_controls = 0;
>>> +            /*
>>> +             * Advertise EPTP switching unconditionally
>>> +             * since we emulate it
>>> +             */
>>> +            vmx->nested.nested_vmx_vmfunc_controls =
>>> +                    VMX_VMFUNC_EPTP_SWITCHING;>     }
>>>
>>>      /*
>>> @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>>      struct vcpu_vmx *vmx = to_vmx(vcpu);
>>>      struct vmcs12 *vmcs12;
>>>      u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
>>> +    u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>>> +    struct page *page = NULL;
>>> +    u64 *l1_eptp_list, address;
>>>
>>>      /*
>>>       * VMFUNC is only supported for nested guests, but we always enable the
>>> @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>>      }
>>>
>>>      vmcs12 = get_vmcs12(vcpu);
>>> -    if ((vmcs12->vm_function_control & (1 << function)) == 0)
>>> +    if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>>> +        WARN_ON_ONCE(function))
>>
>> "... instruction causes a VM exit if the bit at position EAX is 0 in the
>> VM-function controls (the selected VM function is
>> not enabled)."
>>
>> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
>> completely.
>
> It's a good hint to see if L2 misbehaved and WARN_ON_ONCE makes sure it's
> not misused.
>
>>> +            goto fail;
>>> +
>>> +    if (!nested_cpu_has_ept(vmcs12) ||
>>> +        !nested_cpu_has_eptp_switching(vmcs12))
>>> +            goto fail;
>>> +
>>> +    if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
>>> +            goto fail;
>>
>> I can find the definition for an vmexit in case of index >=
>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>>
>> Can you give me a hint?
>
> I don't think there is. Since, we are basically emulating eptp switching
> for L2, this is a good check to have.

There is nothing wrong with a hypervisor using physical page 0 for
whatever purpose it likes, including an EPTP list.

>>> +
>>> +    page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>>> +    if (!page)
>>>              goto fail;
>>> -    WARN_ONCE(1, "VMCS12 VM function control should have been zero");
>>> +
>>> +    l1_eptp_list = kmap(page);
>>> +    address = l1_eptp_list[index];
>>> +    if (!address)
>>> +            goto fail;
>>
>> Can you move that check to the other address checks below? (or rework if
>> this make sense, see below)
>>
>>> +    /*
>>> +     * If the (L2) guest does a vmfunc to the currently
>>> +     * active ept pointer, we don't have to do anything else
>>> +     */
>>> +    if (vmcs12->ept_pointer != address) {
>>> +            if (address >> cpuid_maxphyaddr(vcpu) ||
>>> +                !IS_ALIGNED(address, 4096))
>>
>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>> (triggering a KVM_REQ_TRIPLE_FAULT)
>
> If there's a triple fault, I think it's a good idea to inject it
> back. Basically, there's no need to take care of damage control
> that L1 is intentionally doing.
>
>>> +                    goto fail;
>>> +            kvm_mmu_unload(vcpu);
>>> +            vmcs12->ept_pointer = address;
>>> +            kvm_mmu_reload(vcpu);
>>
>> I was thinking about something like this:
>>
>> kvm_mmu_unload(vcpu);
>> old = vmcs12->ept_pointer;
>> vmcs12->ept_pointer = address;
>> if (kvm_mmu_reload(vcpu)) {
>>       /* pointer invalid, restore previous state */
>>       kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>>       vmcs12->ept_pointer = old;
>>       kvm_mmu_reload(vcpu);
>>       goto fail;
>> }
>>
>> The you can inherit the checks from mmu_check_root().
>>
>>
>> Wonder why I can't spot checks for cpuid_maxphyaddr() /
>> IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The
>> checks should be identical.
>
> I think the reason is vmcs12->ept_pointer is never used directly. It's
> used to create a shadow table but nevertheless, the check should be there.
>
>>
>>> +            kunmap(page);
>>> +            nested_release_page_clean(page);
>>
>> shouldn't the kunmap + nested_release_page_clean go outside the if clause?
>
> :) Indeed, thanks for the catch.
>
> Bandan
>
>>> +    }
>>> +    return kvm_skip_emulated_instruction(vcpu);
>>>
>>>  fail:
>>> +    if (page) {
>>> +            kunmap(page);
>>> +            nested_release_page_clean(page);
>>> +    }
>>>      nested_vmx_vmexit(vcpu, vmx->exit_reason,
>>>                        vmcs_read32(VM_EXIT_INTR_INFO),
>>>                        vmcs_readl(EXIT_QUALIFICATION));
>>>
>>
>> David and mmu code are not yet best friends. So sorry if I am missing
>> something.
Bandan Das July 11, 2017, 6:24 p.m. UTC | #7
Bandan Das <bsd@redhat.com> writes:
....
>>> +	/*
>>> +	 * If the (L2) guest does a vmfunc to the currently
>>> +	 * active ept pointer, we don't have to do anything else
>>> +	 */
>>> +	if (vmcs12->ept_pointer != address) {
>>> +		if (address >> cpuid_maxphyaddr(vcpu) ||
>>> +		    !IS_ALIGNED(address, 4096))
>>
>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>> (triggering a KVM_REQ_TRIPLE_FAULT)
>
> If there's a triple fault, I think it's a good idea to inject it
> back. Basically, there's no need to take care of damage control
> that L1 is intentionally doing.
>
>>> +			goto fail;
>>> +		kvm_mmu_unload(vcpu);
>>> +		vmcs12->ept_pointer = address;
>>> +		kvm_mmu_reload(vcpu);
>>
>> I was thinking about something like this:
>>
>> kvm_mmu_unload(vcpu);
>> old = vmcs12->ept_pointer;
>> vmcs12->ept_pointer = address;
>> if (kvm_mmu_reload(vcpu)) {
>> 	/* pointer invalid, restore previous state */
>> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> 	vmcs12->ept_pointer = old;
>> 	kvm_mmu_reload(vcpu);
>> 	goto fail;
>> }
>>
>> The you can inherit the checks from mmu_check_root().

Actually, thinking about this a bit more, I agree with you. Any fault
with a vmfunc operation should end with a vmfunc vmexit, so this
is a good thing to have. Thank you for this idea! :)

Bandan
Bandan Das July 11, 2017, 6:35 p.m. UTC | #8
Jim Mattson <jmattson@google.com> writes:
...
>>> I can find the definition for an vmexit in case of index >=
>>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>>>
>>> Can you give me a hint?
>>
>> I don't think there is. Since, we are basically emulating eptp switching
>> for L2, this is a good check to have.
>
> There is nothing wrong with a hypervisor using physical page 0 for
> whatever purpose it likes, including an EPTP list.

Right, but of all the things, an L1 hypervisor wanting page 0 for an eptp list
address most likely means it forgot to initialize it. Whatever damage it does will
still end up with vmfunc vmexit anyway.

Bandan
Radim Krčmář July 11, 2017, 7:12 p.m. UTC | #9
2017-07-11 14:05-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> 
> > [David did a great review, so I'll just point out things I noticed.]
> >
> > 2017-07-11 09:51+0200, David Hildenbrand:
> >> On 10.07.2017 22:49, Bandan Das wrote:
> >> > When L2 uses vmfunc, L0 utilizes the associated vmexit to
> >> > emulate a switching of the ept pointer by reloading the
> >> > guest MMU.
> >> > 
> >> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> >> > Signed-off-by: Bandan Das <bsd@redhat.com>
> >> > ---
> >> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
> >> >  	}
> >> >  
> >> >  	vmcs12 = get_vmcs12(vcpu);
> >> > -	if ((vmcs12->vm_function_control & (1 << function)) == 0)
> >> > +	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
> >> > +	    WARN_ON_ONCE(function))
> >> 
> >> "... instruction causes a VM exit if the bit at position EAX is 0 in the
> >> VM-function controls (the selected VM function is
> >> not enabled)."
> >> 
> >> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
> >> completely.
> >
> > It assumes that vm_function_control is not > 1, which is (should be)
> > guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR
> > is 1.
> >
> >> > +		goto fail;
> >
> > The rest of the code assumes that the function is
> > VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable.
> >
> > Writing it as
> >
> >   WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING)
> >
> > would be cleared and I'd prefer to move the part that handles
> > VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is
> > going to add more than one VM FUNC. :])
> 
> IMO, for now, this should be fine because we are not even passing through the
> hardware's eptp switching. Even if there are other vm functions, they
> won't be available for the nested case and cause any conflict.

Yeah, it is fine function-wise, I was just pointing out that it looks
ugly to me.

Btw. have you looked at what we'd need to do for the hardware pass-through?
I'd expect big changes to the MMU. :)

> >> > +	if (!nested_cpu_has_ept(vmcs12) ||
> >> > +	    !nested_cpu_has_eptp_switching(vmcs12))
> >> > +		goto fail;
> >
> > This brings me to a missing vm-entry check:
> >
> >  If “EPTP switching” VM-function control is 1, the “enable EPT”
> >  VM-execution control must also be 1. In addition, the EPTP-list address
> >  must satisfy the following checks:
> >  • Bits 11:0 of the address must be 0.
> >  • The address must not set any bits beyond the processor’s
> >    physical-address width.
> >
> > so this one could be
> >
> >   if (!nested_cpu_has_eptp_switching(vmcs12) ||
> >       WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
> 
> I will reverse the order here but the vm entry check is unnecessary because
> the check on the list address is already being done in this function.

Here it is too late; the nested VM-entry should have failed, never letting
this situation happen.  We want an equivalent of

  if (nested_cpu_has_eptp_switching(vmcs12) && !nested_cpu_has_ept(vmcs12))
  	return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

in nested controls checks, right next to the reserved fields check.
And then also the EPTP-list check.  All of them only checked when
nested_cpu_has_vmfunc(vmcs12).
Radim Krčmář July 11, 2017, 7:13 p.m. UTC | #10
2017-07-11 14:35-0400, Bandan Das:
> Jim Mattson <jmattson@google.com> writes:
> ...
> >>> I can find the definition for an vmexit in case of index >=
> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
> >>>
> >>> Can you give me a hint?
> >>
> >> I don't think there is. Since, we are basically emulating eptp switching
> >> for L2, this is a good check to have.
> >
> > There is nothing wrong with a hypervisor using physical page 0 for
> > whatever purpose it likes, including an EPTP list.
> 
> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list
> address most likely means it forgot to initialize it. Whatever damage it does will
> still end up with vmfunc vmexit anyway.

Most likely, but not certainly.  I also don't see a reason to diverge from the
spec here.
Radim Krčmář July 11, 2017, 7:32 p.m. UTC | #11
2017-07-11 14:24-0400, Bandan Das:
> Bandan Das <bsd@redhat.com> writes:
> > If there's a triple fault, I think it's a good idea to inject it
> > back. Basically, there's no need to take care of damage control
> > that L1 is intentionally doing.
> >
> >>> +			goto fail;
> >>> +		kvm_mmu_unload(vcpu);
> >>> +		vmcs12->ept_pointer = address;
> >>> +		kvm_mmu_reload(vcpu);
> >>
> >> I was thinking about something like this:
> >>
> >> kvm_mmu_unload(vcpu);
> >> old = vmcs12->ept_pointer;
> >> vmcs12->ept_pointer = address;
> >> if (kvm_mmu_reload(vcpu)) {
> >> 	/* pointer invalid, restore previous state */
> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> >> 	vmcs12->ept_pointer = old;
> >> 	kvm_mmu_reload(vcpu);
> >> 	goto fail;
> >> }
> >>
> >> The you can inherit the checks from mmu_check_root().
> 
> Actually, thinking about this a bit more, I agree with you. Any fault
> with a vmfunc operation should end with a vmfunc vmexit, so this
> is a good thing to have. Thank you for this idea! :)

SDM says

  IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
  if in EPTP) THEN VMexit;

and no other mentions of a VM exit, so I think that the VM exit happens
only under these conditions:

  — The EPT memory type (bits 2:0) must be a value supported by the
    processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
    Appendix A.10).
  — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
    an EPT page-walk length of 4; see Section 28.2.2.
  — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
    bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
    as 0, indicating that the processor does not support accessed and
    dirty flags for EPT.
  — Reserved bits 11:7 and 63:N (where N is the processor’s
    physical-address width) must all be 0.

And it looks like we need parts of nested_ept_init_mmu_context() to
properly handle VMX_EPT_AD_ENABLE_BIT.

The KVM_REQ_TRIPLE_FAULT can be handled by kvm_mmu_reload in vcpu_run if
we just invalidate the MMU.
Bandan Das July 11, 2017, 7:34 p.m. UTC | #12
Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-11 14:05-0400, Bandan Das:
>> Radim Krčmář <rkrcmar@redhat.com> writes:
>> 
>> > [David did a great review, so I'll just point out things I noticed.]
>> >
>> > 2017-07-11 09:51+0200, David Hildenbrand:
>> >> On 10.07.2017 22:49, Bandan Das wrote:
>> >> > When L2 uses vmfunc, L0 utilizes the associated vmexit to
>> >> > emulate a switching of the ept pointer by reloading the
>> >> > guest MMU.
>> >> > 
>> >> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> >> > Signed-off-by: Bandan Das <bsd@redhat.com>
>> >> > ---
>> >> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> >> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>> >> >  	}
>> >> >  
>> >> >  	vmcs12 = get_vmcs12(vcpu);
>> >> > -	if ((vmcs12->vm_function_control & (1 << function)) == 0)
>> >> > +	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>> >> > +	    WARN_ON_ONCE(function))
>> >> 
>> >> "... instruction causes a VM exit if the bit at position EAX is 0 in the
>> >> VM-function controls (the selected VM function is
>> >> not enabled)."
>> >> 
>> >> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
>> >> completely.
>> >
>> > It assumes that vm_function_control is not > 1, which is (should be)
>> > guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR
>> > is 1.
>> >
>> >> > +		goto fail;
>> >
>> > The rest of the code assumes that the function is
>> > VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable.
>> >
>> > Writing it as
>> >
>> >   WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING)
>> >
>> > would be cleared and I'd prefer to move the part that handles
>> > VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is
>> > going to add more than one VM FUNC. :])
>> 
>> IMO, for now, this should be fine because we are not even passing through the
>> hardware's eptp switching. Even if there are other vm functions, they
>> won't be available for the nested case and cause any conflict.
>
> Yeah, it is fine function-wise, I was just pointing out that it looks
> ugly to me.

Ok, lemme switch this to a switch statement style handler function. That way,
future additions would be easier.

> Btw. have you looked what we'd need to do for the hardware pass-through?
> I'd expect big changes to MMU. :)

Yes, the first version was actually using vmfunc 0 directly, well not exactly, the
first time would go through this path and then the next time the processor would
handle it directly. Paolo pointed out an issue that I was ready to fix but he wasn't
comfortable with the idea. I actually agree with him, it's too much untested code
for something that would be rarely used.

>> >> > +	if (!nested_cpu_has_ept(vmcs12) ||
>> >> > +	    !nested_cpu_has_eptp_switching(vmcs12))
>> >> > +		goto fail;
>> >
>> > This brings me to a missing vm-entry check:
>> >
>> >  If “EPTP switching” VM-function control is 1, the “enable EPT”
>> >  VM-execution control must also be 1. In addition, the EPTP-list address
>> >  must satisfy the following checks:
>> >  • Bits 11:0 of the address must be 0.
>> >  • The address must not set any bits beyond the processor’s
>> >    physical-address width.
>> >
>> > so this one could be
>> >
>> >   if (!nested_cpu_has_eptp_switching(vmcs12) ||
>> >       WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
>> 
>> I will reverse the order here but the vm entry check is unnecessary because
>> the check on the list address is already being done in this function.
>
> Here is too late, the nested VM-entry should have failed, never letting
> this situation happen.  We want an equivalent of
>
>   if (nested_cpu_has_eptp_switching(vmcs12) && !nested_cpu_has_ept(vmcs12))
>   	return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>
> in nested controls checks, right next to the reserved fields check.
> And then also the check EPTP-list check.  All of them only checked when
> nested_cpu_has_vmfunc(vmcs12).

Actually, I misread 25.5.5.3. There are two checks. Here, the list entry
needs to be checked so that eptp won't cause a vmentry failure. The vmentry
needs to check the eptp list address itself. I will add that check for the
list address in the next version.

Bandan
Bandan Das July 11, 2017, 7:38 p.m. UTC | #13
Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-11 14:35-0400, Bandan Das:
>> Jim Mattson <jmattson@google.com> writes:
>> ...
>> >>> I can find the definition for an vmexit in case of index >=
>> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>> >>>
>> >>> Can you give me a hint?
>> >>
>> >> I don't think there is. Since, we are basically emulating eptp switching
>> >> for L2, this is a good check to have.
>> >
>> > There is nothing wrong with a hypervisor using physical page 0 for
>> > whatever purpose it likes, including an EPTP list.
>> 
>> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list
>> address most likely means it forgot to initialize it. Whatever damage it does will
>> still end up with vmfunc vmexit anyway.
>
> Most likely, but not certainly.  I also don't see a to diverge from the
> spec here.

Actually, this is a specific case where I would like to diverge from the spec.
But then again, it's L1 shooting itself in the foot and this would be a rarely
used code path, so, I am fine removing it.
Bandan Das July 11, 2017, 7:50 p.m. UTC | #14
Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-11 14:24-0400, Bandan Das:
>> Bandan Das <bsd@redhat.com> writes:
>> > If there's a triple fault, I think it's a good idea to inject it
>> > back. Basically, there's no need to take care of damage control
>> > that L1 is intentionally doing.
>> >
>> >>> +			goto fail;
>> >>> +		kvm_mmu_unload(vcpu);
>> >>> +		vmcs12->ept_pointer = address;
>> >>> +		kvm_mmu_reload(vcpu);
>> >>
>> >> I was thinking about something like this:
>> >>
>> >> kvm_mmu_unload(vcpu);
>> >> old = vmcs12->ept_pointer;
>> >> vmcs12->ept_pointer = address;
>> >> if (kvm_mmu_reload(vcpu)) {
>> >> 	/* pointer invalid, restore previous state */
>> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> >> 	vmcs12->ept_pointer = old;
>> >> 	kvm_mmu_reload(vcpu);
>> >> 	goto fail;
>> >> }
>> >>
>> >> The you can inherit the checks from mmu_check_root().
>> 
>> Actually, thinking about this a bit more, I agree with you. Any fault
>> with a vmfunc operation should end with a vmfunc vmexit, so this
>> is a good thing to have. Thank you for this idea! :)
>
> SDM says
>
>   IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
>   if in EPTP) THEN VMexit;

This section here:
As noted in Section 25.5.5.2, an execution of the
EPTP-switching VM function that causes a VM exit (as specified
above), uses the basic exit reason 59, indicating “VMFUNC”.
The length of the VMFUNC instruction is saved into the
VM-exit instruction-length field. No additional VM-exit
information is provided.

Although it adds "(as specified above)", from testing, any vmexit that
happens as a result of the execution of the vmfunc instruction always
has exit reason 59.

IMO, the case David pointed out comes under "as a result of the
execution of the vmfunc instruction", so I would prefer exiting
with reason 59.

> and no other mentions of a VM exit, so I think that the VM exit happens
> only under these conditions:
>
>   — The EPT memory type (bits 2:0) must be a value supported by the
>     processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>     Appendix A.10).
>   — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>     an EPT page-walk length of 4; see Section 28.2.2.
>   — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>     bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>     as 0, indicating that the processor does not support accessed and
>     dirty flags for EPT.
>   — Reserved bits 11:7 and 63:N (where N is the processor’s
>     physical-address width) must all be 0.
>
> And it looks like we need parts of nested_ept_init_mmu_context() to
> properly handle VMX_EPT_AD_ENABLE_BIT.

I completely ignored AD and the #VE sections. I will add a TODO item
in the comment section.

> The KVM_REQ_TRIPLE_FAULT can be handled by kvm_mmu_reload in vcpu_run if
> we just invalidate the MMU.
Radim Krčmář July 11, 2017, 8:21 p.m. UTC | #15
2017-07-11 15:50-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> > 2017-07-11 14:24-0400, Bandan Das:
> >> Bandan Das <bsd@redhat.com> writes:
> >> > If there's a triple fault, I think it's a good idea to inject it
> >> > back. Basically, there's no need to take care of damage control
> >> > that L1 is intentionally doing.
> >> >
> >> >>> +			goto fail;
> >> >>> +		kvm_mmu_unload(vcpu);
> >> >>> +		vmcs12->ept_pointer = address;
> >> >>> +		kvm_mmu_reload(vcpu);
> >> >>
> >> >> I was thinking about something like this:
> >> >>
> >> >> kvm_mmu_unload(vcpu);
> >> >> old = vmcs12->ept_pointer;
> >> >> vmcs12->ept_pointer = address;
> >> >> if (kvm_mmu_reload(vcpu)) {
> >> >> 	/* pointer invalid, restore previous state */
> >> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> >> >> 	vmcs12->ept_pointer = old;
> >> >> 	kvm_mmu_reload(vcpu);
> >> >> 	goto fail;
> >> >> }
> >> >>
> >> >> The you can inherit the checks from mmu_check_root().
> >> 
> >> Actually, thinking about this a bit more, I agree with you. Any fault
> >> with a vmfunc operation should end with a vmfunc vmexit, so this
> >> is a good thing to have. Thank you for this idea! :)
> >
> > SDM says
> >
> >   IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
> >   if in EPTP) THEN VMexit;
> 
> This section here:
> As noted in Section 25.5.5.2, an execution of the
> EPTP-switching VM function that causes a VM exit (as specified
> above), uses the basic exit reason 59, indicating “VMFUNC”.
> The length of the VMFUNC instruction is saved into the
> VM-exit instruction-length field. No additional VM-exit
> information is provided.
> 
> Although, it adds (as specified above), from testing, any vmexit that
> happens as a result of the execution of the vmfunc instruction always
> has exit reason 59.
> 
> IMO, the case David pointed out comes under "as a result of the
> execution of the vmfunc instruction", so I would prefer exiting
> with reason 59.

Right, the exit reason is 59 for reasons that trigger a VM exit
(i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
unrelated stuff.

If the EPTP value is correct, then the switch should succeed.
If the EPTP is correct, but bogus, then the guest should get
EPT_MISCONFIG VM exit on its first access (when reading the
instruction).  Source: I added

  vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));

shortly before a VMLAUNCH on L0. :)

I think that we might be emulating this case incorrectly and throwing
triple faults when it should be VM exits in vcpu_run().

> > and no other mentions of a VM exit, so I think that the VM exit happens
> > only under these conditions:
> >
> >   — The EPT memory type (bits 2:0) must be a value supported by the
> >     processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
> >     Appendix A.10).
> >   — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
> >     an EPT page-walk length of 4; see Section 28.2.2.
> >   — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
> >     bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
> >     as 0, indicating that the processor does not support accessed and
> >     dirty flags for EPT.
> >   — Reserved bits 11:7 and 63:N (where N is the processor’s
> >     physical-address width) must all be 0.
> >
> > And it looks like we need parts of nested_ept_init_mmu_context() to
> > properly handle VMX_EPT_AD_ENABLE_BIT.
> 
> I completely ignored AD and the #VE sections. I will add a TODO item
> in the comment section.

AFAIK, we don't support #VE, but AD would be nice to handle from the
beginning.  (I think that calling nested_ept_init_mmu_context() as-is
isn't that bad.)
Radim Krčmář July 11, 2017, 8:22 p.m. UTC | #16
2017-07-11 15:38-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> 
> > 2017-07-11 14:35-0400, Bandan Das:
> >> Jim Mattson <jmattson@google.com> writes:
> >> ...
> >> >>> I can find the definition for an vmexit in case of index >=
> >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
> >> >>>
> >> >>> Can you give me a hint?
> >> >>
> >> >> I don't think there is. Since, we are basically emulating eptp switching
> >> >> for L2, this is a good check to have.
> >> >
> >> > There is nothing wrong with a hypervisor using physical page 0 for
> >> > whatever purpose it likes, including an EPTP list.
> >> 
> >> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list
> >> address most likely means it forgot to initialize it. Whatever damage it does will
> >> still end up with vmfunc vmexit anyway.
> >
> > Most likely, but not certainly.  I also don't see a to diverge from the
> > spec here.
> 
> Actually, this is a specific case where I would like to diverge from the spec.
> But then again, it's L1 shooting itself in the foot and this would be a rarely
> used code path, so, I am fine removing it.

Thanks, we're not here to judge the guest, but to provide a bare-metal
experience. :)
Bandan Das July 11, 2017, 8:34 p.m. UTC | #17
Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-11 15:50-0400, Bandan Das:
>> Radim Krčmář <rkrcmar@redhat.com> writes:
>> > 2017-07-11 14:24-0400, Bandan Das:
>> >> Bandan Das <bsd@redhat.com> writes:
>> >> > If there's a triple fault, I think it's a good idea to inject it
>> >> > back. Basically, there's no need to take care of damage control
>> >> > that L1 is intentionally doing.
>> >> >
>> >> >>> +			goto fail;
>> >> >>> +		kvm_mmu_unload(vcpu);
>> >> >>> +		vmcs12->ept_pointer = address;
>> >> >>> +		kvm_mmu_reload(vcpu);
>> >> >>
>> >> >> I was thinking about something like this:
>> >> >>
>> >> >> kvm_mmu_unload(vcpu);
>> >> >> old = vmcs12->ept_pointer;
>> >> >> vmcs12->ept_pointer = address;
>> >> >> if (kvm_mmu_reload(vcpu)) {
>> >> >> 	/* pointer invalid, restore previous state */
>> >> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> >> >> 	vmcs12->ept_pointer = old;
>> >> >> 	kvm_mmu_reload(vcpu);
>> >> >> 	goto fail;
>> >> >> }
>> >> >>
>> >> >> The you can inherit the checks from mmu_check_root().
>> >> 
>> >> Actually, thinking about this a bit more, I agree with you. Any fault
>> >> with a vmfunc operation should end with a vmfunc vmexit, so this
>> >> is a good thing to have. Thank you for this idea! :)
>> >
>> > SDM says
>> >
>> >   IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
>> >   if in EPTP) THEN VMexit;
>> 
>> This section here:
>> As noted in Section 25.5.5.2, an execution of the
>> EPTP-switching VM function that causes a VM exit (as specified
>> above), uses the basic exit reason 59, indicating “VMFUNC”.
>> The length of the VMFUNC instruction is saved into the
>> VM-exit instruction-length field. No additional VM-exit
>> information is provided.
>> 
>> Although, it adds (as specified above), from testing, any vmexit that
>> happens as a result of the execution of the vmfunc instruction always
>> has exit reason 59.
>> 
>> IMO, the case David pointed out comes under "as a result of the
>> execution of the vmfunc instruction", so I would prefer exiting
>> with reason 59.
>
> Right, the exit reason is 59 for reasons that trigger a VM exit
> (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
> unrelated stuff.
>
> If the EPTP value is correct, then the switch should succeed.
> If the EPTP is correct, but bogus, then the guest should get
> EPT_MISCONFIG VM exit on its first access (when reading the
> instruction).  Source: I added

My point is that we are using kvm_mmu_reload() to emulate eptp
switching. If that emulation of vmfunc fails, it should exit with reason
59.

>   vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
>
> shortly before a VMLAUNCH on L0. :)

What happens if this ept pointer is actually in the eptp list and the guest
switches to it using vmfunc ? I think it will exit with reason 59.

> I think that we might be emulating this case incorrectly and throwing
> triple faults when it should be VM exits in vcpu_run().

No, I agree with not throwing a triple fault. We should clear it out.
But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails.

>> > and no other mentions of a VM exit, so I think that the VM exit happens
>> > only under these conditions:
>> >
>> >   — The EPT memory type (bits 2:0) must be a value supported by the
>> >     processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>> >     Appendix A.10).
>> >   — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>> >     an EPT page-walk length of 4; see Section 28.2.2.
>> >   — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>> >     bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>> >     as 0, indicating that the processor does not support accessed and
>> >     dirty flags for EPT.
>> >   — Reserved bits 11:7 and 63:N (where N is the processor’s
>> >     physical-address width) must all be 0.
>> >
>> > And it looks like we need parts of nested_ept_init_mmu_context() to
>> > properly handle VMX_EPT_AD_ENABLE_BIT.
>> 
>> I completely ignored AD and the #VE sections. I will add a TODO item
>> in the comment section.
>
> AFAIK, we don't support #VE, but AD would be nice to handle from the

Nevertheless, it's good to have the nested hypervisor be able to use it
just like vmfunc.

> beginning.  (I think that caling nested_ept_init_mmu_context() as-is
> isn't that bad.)

Ok, I will take a look.
Radim Krčmář July 11, 2017, 8:45 p.m. UTC | #18
2017-07-11 16:34-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> 
> > 2017-07-11 15:50-0400, Bandan Das:
> >> Radim Krčmář <rkrcmar@redhat.com> writes:
> >> > 2017-07-11 14:24-0400, Bandan Das:
> >> >> Bandan Das <bsd@redhat.com> writes:
> >> >> > If there's a triple fault, I think it's a good idea to inject it
> >> >> > back. Basically, there's no need to take care of damage control
> >> >> > that L1 is intentionally doing.
> >> >> >
> >> >> >>> +			goto fail;
> >> >> >>> +		kvm_mmu_unload(vcpu);
> >> >> >>> +		vmcs12->ept_pointer = address;
> >> >> >>> +		kvm_mmu_reload(vcpu);
> >> >> >>
> >> >> >> I was thinking about something like this:
> >> >> >>
> >> >> >> kvm_mmu_unload(vcpu);
> >> >> >> old = vmcs12->ept_pointer;
> >> >> >> vmcs12->ept_pointer = address;
> >> >> >> if (kvm_mmu_reload(vcpu)) {
> >> >> >> 	/* pointer invalid, restore previous state */
> >> >> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> >> >> >> 	vmcs12->ept_pointer = old;
> >> >> >> 	kvm_mmu_reload(vcpu);
> >> >> >> 	goto fail;
> >> >> >> }
> >> >> >>
> >> >> >> The you can inherit the checks from mmu_check_root().
> >> >> 
> >> >> Actually, thinking about this a bit more, I agree with you. Any fault
> >> >> with a vmfunc operation should end with a vmfunc vmexit, so this
> >> >> is a good thing to have. Thank you for this idea! :)
> >> >
> >> > SDM says
> >> >
> >> >   IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
> >> >   if in EPTP) THEN VMexit;
> >> 
> >> This section here:
> >> As noted in Section 25.5.5.2, an execution of the
> >> EPTP-switching VM function that causes a VM exit (as specified
> >> above), uses the basic exit reason 59, indicating “VMFUNC”.
> >> The length of the VMFUNC instruction is saved into the
> >> VM-exit instruction-length field. No additional VM-exit
> >> information is provided.
> >> 
> >> Although, it adds (as specified above), from testing, any vmexit that
> >> happens as a result of the execution of the vmfunc instruction always
> >> has exit reason 59.
> >> 
> >> IMO, the case David pointed out comes under "as a result of the
> >> execution of the vmfunc instruction", so I would prefer exiting
> >> with reason 59.
> >
> > Right, the exit reason is 59 for reasons that trigger a VM exit
> > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
> > unrelated stuff.
> >
> > If the EPTP value is correct, then the switch should succeed.
> > If the EPTP is correct, but bogus, then the guest should get
> > EPT_MISCONFIG VM exit on its first access (when reading the
> > instruction).  Source: I added
> 
> My point is that we are using kvm_mmu_reload() to emulate eptp
> switching. If that emulation of vmfunc fails, it should exit with reason
> 59.

Yeah, we just disagree on what is a vmfunc failure.

> >   vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
> >
> > shortly before a VMLAUNCH on L0. :)
> 
> What happens if this ept pointer is actually in the eptp list and the guest
> switches to it using vmfunc ? I think it will exit with reason 59.

I think otherwise, because it doesn't cause a VM entry failure on
bare-metal (and SDM says that we get a VM exit only if there would be a
VM entry failure).
I expect the vmfunc to succeed and to get a EPT_MISCONFIG right after.
(Experiment pending :])

> > I think that we might be emulating this case incorrectly and throwing
> > triple faults when it should be VM exits in vcpu_run().
> 
> No, I agree with not throwing a triple fault. We should clear it out.
> But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails.

Here we disagree.  I think that it's a bug to do the VM exit, so we can
just keep the original bug -- we want to eventually fix it and it's no
worse till then.
Bandan Das July 11, 2017, 8:45 p.m. UTC | #19
Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-11 15:38-0400, Bandan Das:
>> Radim Krčmář <rkrcmar@redhat.com> writes:
>> 
>> > 2017-07-11 14:35-0400, Bandan Das:
>> >> Jim Mattson <jmattson@google.com> writes:
>> >> ...
>> >> >>> I can find the definition for an vmexit in case of index >=
>> >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>> >> >>>
>> >> >>> Can you give me a hint?
>> >> >>
>> >> >> I don't think there is. Since, we are basically emulating eptp switching
>> >> >> for L2, this is a good check to have.
>> >> >
>> >> > There is nothing wrong with a hypervisor using physical page 0 for
>> >> > whatever purpose it likes, including an EPTP list.
>> >> 
>> >> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list
>> >> address most likely means it forgot to initialize it. Whatever damage it does will
>> >> still end up with vmfunc vmexit anyway.
>> >
>> > Most likely, but not certainly.  I also don't see a to diverge from the
>> > spec here.
>> 
>> Actually, this is a specific case where I would like to diverge from the spec.
>> But then again, it's L1 shooting itself in the foot and this would be a rarely
>> used code path, so, I am fine removing it.
>
> Thanks, we're not here to judge the guest, but to provide a bare-metal
> experience. :)

There are certain cases where we do. For example, when L2 instruction emulation
fails, we decide to kill L2 instead of injecting the error into L1 and letting it
handle that. Anyway, that's a different topic; I was just trying to point out that
there are cases where kvm makes a somewhat policy-based decision...
Bandan Das July 11, 2017, 9:08 p.m. UTC | #20
Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-11 16:34-0400, Bandan Das:
>> Radim Krčmář <rkrcmar@redhat.com> writes:
>> 
>> > 2017-07-11 15:50-0400, Bandan Das:
>> >> Radim Krčmář <rkrcmar@redhat.com> writes:
>> >> > 2017-07-11 14:24-0400, Bandan Das:
>> >> >> Bandan Das <bsd@redhat.com> writes:
>> >> >> > If there's a triple fault, I think it's a good idea to inject it
>> >> >> > back. Basically, there's no need to take care of damage control
>> >> >> > that L1 is intentionally doing.
>> >> >> >
>> >> >> >>> +			goto fail;
>> >> >> >>> +		kvm_mmu_unload(vcpu);
>> >> >> >>> +		vmcs12->ept_pointer = address;
>> >> >> >>> +		kvm_mmu_reload(vcpu);
>> >> >> >>
>> >> >> >> I was thinking about something like this:
>> >> >> >>
>> >> >> >> kvm_mmu_unload(vcpu);
>> >> >> >> old = vmcs12->ept_pointer;
>> >> >> >> vmcs12->ept_pointer = address;
>> >> >> >> if (kvm_mmu_reload(vcpu)) {
>> >> >> >> 	/* pointer invalid, restore previous state */
>> >> >> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> >> >> >> 	vmcs12->ept_pointer = old;
>> >> >> >> 	kvm_mmu_reload(vcpu);
>> >> >> >> 	goto fail;
>> >> >> >> }
>> >> >> >>
>> >> >> >> The you can inherit the checks from mmu_check_root().
>> >> >> 
>> >> >> Actually, thinking about this a bit more, I agree with you. Any fault
>> >> >> with a vmfunc operation should end with a vmfunc vmexit, so this
>> >> >> is a good thing to have. Thank you for this idea! :)
>> >> >
>> >> > SDM says
>> >> >
>> >> >   IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
>> >> >   if in EPTP) THEN VMexit;
>> >> 
>> >> This section here:
>> >> As noted in Section 25.5.5.2, an execution of the
>> >> EPTP-switching VM function that causes a VM exit (as specified
>> >> above), uses the basic exit reason 59, indicating “VMFUNC”.
>> >> The length of the VMFUNC instruction is saved into the
>> >> VM-exit instruction-length field. No additional VM-exit
>> >> information is provided.
>> >> 
>> >> Although, it adds (as specified above), from testing, any vmexit that
>> >> happens as a result of the execution of the vmfunc instruction always
>> >> has exit reason 59.
>> >> 
>> >> IMO, the case David pointed out comes under "as a result of the
>> >> execution of the vmfunc instruction", so I would prefer exiting
>> >> with reason 59.
>> >
>> > Right, the exit reason is 59 for reasons that trigger a VM exit
>> > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
>> > unrelated stuff.
>> >
>> > If the EPTP value is correct, then the switch should succeed.
>> > If the EPTP is correct, but bogus, then the guest should get
>> > EPT_MISCONFIG VM exit on its first access (when reading the
>> > instruction).  Source: I added
>> 
>> My point is that we are using kvm_mmu_reload() to emulate eptp
>> switching. If that emulation of vmfunc fails, it should exit with reason
>> 59.
>
> Yeah, we just disagree on what is a vmfunc failure.
>
>> >   vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
>> >
>> > shortly before a VMLAUNCH on L0. :)
>> 
>> What happens if this ept pointer is actually in the eptp list and the guest
>> switches to it using vmfunc ? I think it will exit with reason 59.
>
> I think otherwise, because it doesn't cause a VM entry failure on
> bare-metal (and SDM says that we get a VM exit only if there would be a
> VM entry failure).
> I expect the vmfunc to succeed and to get a EPT_MISCONFIG right after.
> (Experiment pending :])
>
>> > I think that we might be emulating this case incorrectly and throwing
>> > triple faults when it should be VM exits in vcpu_run().
>> 
>> No, I agree with not throwing a triple fault. We should clear it out.
>> But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails.
>
> Here we disagree.  I think that it's a bug to do the VM exit, so we can

Why do you think it's a bug ? The eptp switching function really didn't
succeed as far as our emulation goes when kvm_mmu_reload() fails.
And as such, the generic vmexit failure event should be a vmfunc vmexit.
We cannot strictly follow the spec here, the spec doesn't even mention a way
to emulate eptp switching. If setting up the switching succeeded and the
new root pointer is invalid or whatever, I really don't care what happens
next but this is not the case. We fail to get a new root pointer and without
that, we can't even make a switch!

> just keep the original bug -- we want to eventually fix it and it's no
> worse till then.

Anyway, can you please confirm again what is the behavior that you
are expecting if kvm_mmu_reload fails ? This would be a rarely used
branch and I am actually fine diverging from what I think is right if
I can get the reviewers to agree on a common thing.

(Thanks for giving this a closer look, Radim. I really appreciate it.)

Bandan
Radim Krčmář July 12, 2017, 1:24 p.m. UTC | #21
2017-07-11 17:08-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> 
> > 2017-07-11 16:34-0400, Bandan Das:
> >> Radim Krčmář <rkrcmar@redhat.com> writes:
> >> 
> >> > 2017-07-11 15:50-0400, Bandan Das:
> >> >> Radim Krčmář <rkrcmar@redhat.com> writes:
> >> >> > 2017-07-11 14:24-0400, Bandan Das:
> >> >> >> Bandan Das <bsd@redhat.com> writes:
> >> >> >> > If there's a triple fault, I think it's a good idea to inject it
> >> >> >> > back. Basically, there's no need to take care of damage control
> >> >> >> > that L1 is intentionally doing.
> >> >> >> >
> >> >> >> >>> +			goto fail;
> >> >> >> >>> +		kvm_mmu_unload(vcpu);
> >> >> >> >>> +		vmcs12->ept_pointer = address;
> >> >> >> >>> +		kvm_mmu_reload(vcpu);
> >> >> >> >>
> >> >> >> >> I was thinking about something like this:
> >> >> >> >>
> >> >> >> >> kvm_mmu_unload(vcpu);
> >> >> >> >> old = vmcs12->ept_pointer;
> >> >> >> >> vmcs12->ept_pointer = address;
> >> >> >> >> if (kvm_mmu_reload(vcpu)) {
> >> >> >> >> 	/* pointer invalid, restore previous state */
> >> >> >> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> >> >> >> >> 	vmcs12->ept_pointer = old;
> >> >> >> >> 	kvm_mmu_reload(vcpu);
> >> >> >> >> 	goto fail;
> >> >> >> >> }
> >> >> >> >>
> >> >> >> >> The you can inherit the checks from mmu_check_root().
> >> >> >> 
> >> >> >> Actually, thinking about this a bit more, I agree with you. Any fault
> >> >> >> with a vmfunc operation should end with a vmfunc vmexit, so this
> >> >> >> is a good thing to have. Thank you for this idea! :)
> >> >> >
> >> >> > SDM says
> >> >> >
> >> >> >   IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
> >> >> >   if in EPTP) THEN VMexit;
> >> >> 
> >> >> This section here:
> >> >> As noted in Section 25.5.5.2, an execution of the
> >> >> EPTP-switching VM function that causes a VM exit (as specified
> >> >> above), uses the basic exit reason 59, indicating “VMFUNC”.
> >> >> The length of the VMFUNC instruction is saved into the
> >> >> VM-exit instruction-length field. No additional VM-exit
> >> >> information is provided.
> >> >> 
> >> >> Although, it adds (as specified above), from testing, any vmexit that
> >> >> happens as a result of the execution of the vmfunc instruction always
> >> >> has exit reason 59.
> >> >> 
> >> >> IMO, the case David pointed out comes under "as a result of the
> >> >> execution of the vmfunc instruction", so I would prefer exiting
> >> >> with reason 59.
> >> >
> >> > Right, the exit reason is 59 for reasons that trigger a VM exit
> >> > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
> >> > unrelated stuff.
> >> >
> >> > If the EPTP value is correct, then the switch should succeed.
> >> > If the EPTP is correct, but bogus, then the guest should get
> >> > EPT_MISCONFIG VM exit on its first access (when reading the
> >> > instruction).  Source: I added
> >> 
> >> My point is that we are using kvm_mmu_reload() to emulate eptp
> >> switching. If that emulation of vmfunc fails, it should exit with reason
> >> 59.
>>
>> Yeah, we just disagree on what is a vmfunc failure.
>>
>>> >   vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
>>> >
>>> > shortly before a VMLAUNCH on L0. :)
>>> 
>>> What happens if this ept pointer is actually in the eptp list and the guest
>>> switches to it using vmfunc ? I think it will exit with reason 59.
>>
>> I think otherwise, because it doesn't cause a VM entry failure on
>> bare-metal (and SDM says that we get a VM exit only if there would be a
>> VM entry failure).
>> I expect the vmfunc to succeed and to get a EPT_MISCONFIG right after.
>> (Experiment pending :])
>>
>>> > I think that we might be emulating this case incorrectly and throwing
>>> > triple faults when it should be VM exits in vcpu_run().
>>> 
>>> No, I agree with not throwing a triple fault. We should clear it out.
>>> But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails.
>>
>> Here we disagree.  I think that it's a bug to do the VM exit, so we can
> 
> Why do you think it's a bug ?

SDM defines a different behavior and hardware doesn't do that either.
There are only two reasons for a VMFUNC VM exit from EPTP switching:

 1) ECX > 0
 2) EPTP would cause VM entry to fail if in VMCS.EPT_POINTER

KVM can fail for other reasons because of its bugs, but that should be
notified to the guest in another way.  Rebooting the guest is kind of
acceptable in that case.

>                               The eptp switching function really didn't
> succeed as far as our emulation goes when kvm_mmu_reload() fails.
> And as such, the generic vmexit failure event should be a vmfunc vmexit.

I interpret it as two separate events -- at first, the vmfunc succeeds
and when it later tries to access memory through the new EPTP (valid,
but not pointing into backed memory), it results in a EPT_MISCONFIG VM
exit.

> We cannot strictly follow the spec here, the spec doesn't even mention a way
> to emulate eptp switching.  If setting up the switching succeeded and the
> new root pointer is invalid or whatever, I really don't care what happens
> next but this is not the case. We fail to get a new root pointer and without
> that, we can't even make a switch!

We just make it behave exactly how the spec says that it behaves.  We do
have a value (we read 'address') to put into VMCS.EPT_POINTER, which is
all we need for the emulation.
The function doesn't dereference that pointer, it just looks at its
value to decide whether it is valid or not.  (btw. we should check that
properly, because we cannot depend on VM entry failure pass-through like
the normal case.)

The dereference done in kvm_mmu_reload() should happen after EPTP
switching finishes, because the spec doesn't mention a VM exit for other
reason than invalid EPT_POINTER value.

>> just keep the original bug -- we want to eventually fix it and it's no
>> worse till then.
> 
> Anyway, can you please confirm again what is the behavior that you
> are expecting if kvm_mmu_reload fails ? This would be a rarely used
> branch and I am actually fine diverging from what I think is right if
> I can get the reviewers to agree on a common thing.

kvm_mmu_reload() fails when mmu_check_root() is false, which means that
the pointed physical address is not backed.  We've hit this corner-case
in the past -- Jim said that the chipset returns all 1s if a read is not
claimed.

So in theory, KVM should not fail kvm_mmu_reload(), but behave as if the
pointer pointed to a memory of all 1s, which would likely result in
EPT_MISCONFIG when the guest does a memory access.

It is a mishandled corner case, but turning it into VM exit would only
confuse an OS that receives the impossible VM exit and potentially
confuse reader of the KVM logic.

I think that not using kvm_mmu_reload() directly in EPTP switching is
best.  The bug is not really something we care about.
Radim Krčmář July 12, 2017, 1:41 p.m. UTC | #22
2017-07-11 16:45-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> 
> > 2017-07-11 15:38-0400, Bandan Das:
> >> Radim Krčmář <rkrcmar@redhat.com> writes:
> >> 
> >> > 2017-07-11 14:35-0400, Bandan Das:
> >> >> Jim Mattson <jmattson@google.com> writes:
> >> >> ...
> >> >> >>> I can find the definition for an vmexit in case of index >=
> >> >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
> >> >> >>>
> >> >> >>> Can you give me a hint?
> >> >> >>
> >> >> >> I don't think there is. Since, we are basically emulating eptp switching
> >> >> >> for L2, this is a good check to have.
> >> >> >
> >> >> > There is nothing wrong with a hypervisor using physical page 0 for
> >> >> > whatever purpose it likes, including an EPTP list.
> >> >> 
> >> >> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list
> >> >> address most likely means it forgot to initialize it. Whatever damage it does will
> >> >> still end up with vmfunc vmexit anyway.
> >> >
> >> > Most likely, but not certainly.  I also don't see a reason to diverge from the
> >> > spec here.
> >> 
> >> Actually, this is a specific case where I would like to diverge from the spec.
> >> But then again, it's L1 shooting itself in the foot and this would be a rarely
> >> used code path, so, I am fine removing it.
> >
> > Thanks, we're not here to judge the guest, but to provide a bare-metal
> > experience. :)
> 
> There are certain cases where we do. For example, when L2 instruction emulation
> fails we decide to kill L2 instead of injecting the error to L1 and let it handle
> that. Anyway, that's a different topic, I was just trying to point out there
> are cases kvm does a somewhat policy decision...

Emulation failure is a KVM bug and we are too lazy to implement the
bare-metal behavior correctly, but avoiding the EPTP list bug is
actually easier than introducing it.  You can make KVM simpler and
improve bare-metal emulation at the same time.
Bandan Das July 12, 2017, 6:04 p.m. UTC | #23
Radim Krčmář <rkrcmar@redhat.com> writes:
...
>> > Thanks, we're not here to judge the guest, but to provide a bare-metal
>> > experience. :)
>> 
>> There are certain cases where we do. For example, when L2 instruction emulation
>> fails we decide to kill L2 instead of injecting the error to L1 and let it handle
>> that. Anyway, that's a different topic, I was just trying to point out there
>> are cases kvm does a somewhat policy decision...
>
> Emulation failure is a KVM bug and we are too lazy to implement the
> bare-metal behavior correctly, but avoiding the EPTP list bug is
> actually easier than introducing it.  You can make KVM simpler and
> improve bare-metal emulation at the same time.

We are just talking past each other here trying to impose point of views.
Checking for 0 makes KVM simpler. As I said before, a 0 list_address means
that the hypervisor forgot to initialize it. Feel free to show me examples
where the hypervisor does indeed use a 0 address for eptp list address or
anything vm specific. You disagreed and I am fine with it.
Bandan Das July 12, 2017, 6:11 p.m. UTC | #24
Radim Krčmář <rkrcmar@redhat.com> writes:
...
>> Why do you think it's a bug ?
>
> SDM defines a different behavior and hardware doesn't do that either.
> There are only two reasons for a VMFUNC VM exit from EPTP switching:
>
>  1) ECX > 0
>  2) EPTP would cause VM entry to fail if in VMCS.EPT_POINTER
>
> KVM can fail for other reasons because of its bugs, but that should be
> notified to the guest in another way.  Rebooting the guest is kind of
> acceptable in that case.
>
>>                               The eptp switching function really didn't
>> succeed as far as our emulation goes when kvm_mmu_reload() fails.
>> And as such, the generic vmexit failure event should be a vmfunc vmexit.
>
> I interpret it as two separate events -- at first, the vmfunc succeeds
> and when it later tries to access memory through the new EPTP (valid,
> but not pointing into backed memory), it results in a EPT_MISCONFIG VM
> exit.
>
>> We cannot strictly follow the spec here, the spec doesn't even mention a way
>> to emulate eptp switching.  If setting up the switching succeeded and the
>> new root pointer is invalid or whatever, I really don't care what happens
>> next but this is not the case. We fail to get a new root pointer and without
>> that, we can't even make a switch!
>
> We just make it behave exactly how the spec says that it behaves.  We do
> have a value (we read 'address') to put into VMCS.EPT_POINTER, which is
> all we need for the emulation.
> The function doesn't dereference that pointer, it just looks at its
> value to decide whether it is valid or not.  (btw. we should check that
> properly, because we cannot depend on VM entry failure pass-through like
> the normal case.)
>
> The dereference done in kvm_mmu_reload() should happen after EPTP
> switching finishes, because the spec doesn't mention a VM exit for other
> reason than invalid EPT_POINTER value.
>
>>> just keep the original bug -- we want to eventually fix it and it's no
>>> worse till then.
>> 
>> Anyway, can you please confirm again what is the behavior that you
>> are expecting if kvm_mmu_reload fails ? This would be a rarely used
>> branch and I am actually fine diverging from what I think is right if
>> I can get the reviewers to agree on a common thing.
>
> kvm_mmu_reload() fails when mmu_check_root() is false, which means that
> the pointed physical address is not backed.  We've hit this corner-case
> in the past -- Jim said that the chipset returns all 1s if a read is not
> claimed.
>
> So in theory, KVM should not fail kvm_mmu_reload(), but behave as if the
> pointer pointed to a memory of all 1s, which would likely result in
> EPT_MISCONFIG when the guest does a memory access.

As much as I would like to disagree with you, I have already spent way more
time on this than I want. Please let's just leave it here, then ? The mmu unload
will make sure there's an invalid root hpa and whatever happens next, happens.

> It is a mishandled corner case, but turning it into VM exit would only
> confuse an OS that receives the impossible VM exit and potentially
> confuse reader of the KVM logic.
>
> I think that not using kvm_mmu_reload() directly in EPTP switching is
> best.  The bug is not really something we care about.
Radim Krčmář July 12, 2017, 7:18 p.m. UTC | #25
2017-07-12 14:11-0400, Bandan Das:
> As much as I would like to disagree with you, I have already spent way more
> time on this then I want. Please let's just leave it here, then ? The mmu unload
> will make sure there's an invalid root hpa and whatever happens next, happens.

Sure;  let's discuss the subtleties of hardware emulation over a beer.
David Hildenbrand July 13, 2017, 3:39 p.m. UTC | #26
>>> +	/*
>>> +	 * If the (L2) guest does a vmfunc to the currently
>>> +	 * active ept pointer, we don't have to do anything else
>>> +	 */
>>> +	if (vmcs12->ept_pointer != address) {
>>> +		if (address >> cpuid_maxphyaddr(vcpu) ||
>>> +		    !IS_ALIGNED(address, 4096))
>>
>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>> (triggering a KVM_REQ_TRIPLE_FAULT)
> 
> If there's a triple fault, I think it's a good idea to inject it
> back. Basically, there's no need to take care of damage control
> that L1 is intentionally doing.

I quickly rushed over the massive amount of comments. Sounds like you'll
be preparing a v5. Would be great if you could add some comments that
were the result of this discussion (for parts that are not that obvious
- triple faults) - thanks!
Bandan Das July 13, 2017, 5:08 p.m. UTC | #27
David Hildenbrand <david@redhat.com> writes:

>>>> +	/*
>>>> +	 * If the (L2) guest does a vmfunc to the currently
>>>> +	 * active ept pointer, we don't have to do anything else
>>>> +	 */
>>>> +	if (vmcs12->ept_pointer != address) {
>>>> +		if (address >> cpuid_maxphyaddr(vcpu) ||
>>>> +		    !IS_ALIGNED(address, 4096))
>>>
>>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>>> (triggering a KVM_REQ_TRIPLE_FAULT)
>> 
>> If there's a triple fault, I think it's a good idea to inject it
>> back. Basically, there's no need to take care of damage control
>> that L1 is intentionally doing.
>
> I quickly rushed over the massive amount of comments. Sounds like you'll
> be preparing a v5. Would be great if you could add some comments that
> were the result of this discussion (for parts that are not that obvious
> - triple faults) - thanks!

Will do. Basically, we agreed that we don't need to do anything with mmu_reload() failures
because the invalid eptp that mmu_unload will write to root_hpa will result in an ept
violation.

Bandan
Bandan Das July 17, 2017, 5:58 p.m. UTC | #28
Radim Krčmář <rkrcmar@redhat.com> writes:
...
>> > and no other mentions of a VM exit, so I think that the VM exit happens
>> > only under these conditions:
>> >
>> >   — The EPT memory type (bits 2:0) must be a value supported by the
>> >     processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>> >     Appendix A.10).
>> >   — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>> >     an EPT page-walk length of 4; see Section 28.2.2.
>> >   — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>> >     bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>> >     as 0, indicating that the processor does not support accessed and
>> >     dirty flags for EPT.
>> >   — Reserved bits 11:7 and 63:N (where N is the processor’s
>> >     physical-address width) must all be 0.
>> >
>> > And it looks like we need parts of nested_ept_init_mmu_context() to
>> > properly handle VMX_EPT_AD_ENABLE_BIT.
>> 
>> I completely ignored AD and the #VE sections. I will add a TODO item
>> in the comment section.
>
> AFAIK, we don't support #VE, but AD would be nice to handle from the
> beginning.  (I think that caling nested_ept_init_mmu_context() as-is
> isn't that bad.)

I went back to the spec to take a look at the AD handling. It doesn't look
like anything needs to be done since nested_ept_init_mmu_context() is already
being called with the correct eptp in prepare_vmcs02 ? Anything else that
needs to be done for AD handling in vmfunc context ?

Thanks,
Bandan
Radim Krčmář July 19, 2017, 9:30 a.m. UTC | #29
2017-07-17 13:58-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> ...
>>> > and no other mentions of a VM exit, so I think that the VM exit happens
>>> > only under these conditions:
>>> >
>>> >   — The EPT memory type (bits 2:0) must be a value supported by the
>>> >     processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>>> >     Appendix A.10).
>>> >   — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>>> >     an EPT page-walk length of 4; see Section 28.2.2.
>>> >   — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>>> >     bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>>> >     as 0, indicating that the processor does not support accessed and
>>> >     dirty flags for EPT.
>>> >   — Reserved bits 11:7 and 63:N (where N is the processor’s
>>> >     physical-address width) must all be 0.
>>> >
>>> > And it looks like we need parts of nested_ept_init_mmu_context() to
>>> > properly handle VMX_EPT_AD_ENABLE_BIT.
>>> 
>>> I completely ignored AD and the #VE sections. I will add a TODO item
>>> in the comment section.
>>
>> AFAIK, we don't support #VE, but AD would be nice to handle from the
>> beginning.  (I think that caling nested_ept_init_mmu_context() as-is
>> isn't that bad.)
> 
> I went back to the spec to take a look at the AD handling. It doesn't look
> like anything needs to be done since nested_ept_init_mmu_context() is already
> being called with the correct eptp in prepare_vmcs02 ? Anything else that
> needs to be done for AD handling in vmfunc context ?

AD is decided by EPTP bit 6, so it can be toggled by EPTP switching and
we don't call prepare_vmcs02() after emulating VMFUNC vm exit.
We want to forward the new AD configuration to KVM's MMU.
Bandan Das July 19, 2017, 5:54 p.m. UTC | #30
Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-17 13:58-0400, Bandan Das:
>> Radim Krčmář <rkrcmar@redhat.com> writes:
>> ...
>>>> > and no other mentions of a VM exit, so I think that the VM exit happens
>>>> > only under these conditions:
>>>> >
>>>> >   — The EPT memory type (bits 2:0) must be a value supported by the
>>>> >     processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>>>> >     Appendix A.10).
>>>> >   — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>>>> >     an EPT page-walk length of 4; see Section 28.2.2.
>>>> >   — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>>>> >     bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>>>> >     as 0, indicating that the processor does not support accessed and
>>>> >     dirty flags for EPT.
>>>> >   — Reserved bits 11:7 and 63:N (where N is the processor’s
>>>> >     physical-address width) must all be 0.
>>>> >
>>>> > And it looks like we need parts of nested_ept_init_mmu_context() to
>>>> > properly handle VMX_EPT_AD_ENABLE_BIT.
>>>> 
>>>> I completely ignored AD and the #VE sections. I will add a TODO item
>>>> in the comment section.
>>>
>>> AFAIK, we don't support #VE, but AD would be nice to handle from the
>>> beginning.  (I think that caling nested_ept_init_mmu_context() as-is
>>> isn't that bad.)
>> 
>> I went back to the spec to take a look at the AD handling. It doesn't look
>> like anything needs to be done since nested_ept_init_mmu_context() is already
>> being called with the correct eptp in prepare_vmcs02 ? Anything else that
>> needs to be done for AD handling in vmfunc context ?
>
> AD is decided by EPTP bit 6, so it can be toggled by EPTP switching and
> we don't call prepare_vmcs02() after emulating VMFUNC vm exit.
> We want to forward the new AD configuration to KVM's MMU.

Thanks, I had incorrectly assumed that prepare_vmcs02 will be called after
an exit unconditionally. I will work something up soon.

Bandan
diff mbox

Patch

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index da5375e..5f63a2e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -115,6 +115,10 @@ 
 #define VMX_MISC_SAVE_EFER_LMA			0x00000020
 #define VMX_MISC_ACTIVITY_HLT			0x00000040
 
+/* VMFUNC functions */
+#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
+#define VMFUNC_EPTP_ENTRIES  512
+
 static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
 {
 	return vmx_basic & GENMASK_ULL(30, 0);
@@ -200,6 +204,8 @@  enum vmcs_field {
 	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
 	EOI_EXIT_BITMAP3                = 0x00002022,
 	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
+	EPTP_LIST_ADDRESS               = 0x00002024,
+	EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
 	VMREAD_BITMAP                   = 0x00002026,
 	VMWRITE_BITMAP                  = 0x00002028,
 	XSS_EXIT_BITMAP                 = 0x0000202C,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fe8f5fc..0a969fb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -246,6 +246,7 @@  struct __packed vmcs12 {
 	u64 eoi_exit_bitmap1;
 	u64 eoi_exit_bitmap2;
 	u64 eoi_exit_bitmap3;
+	u64 eptp_list_address;
 	u64 xss_exit_bitmap;
 	u64 guest_physical_address;
 	u64 vmcs_link_pointer;
@@ -771,6 +772,7 @@  static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
 	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
 	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
+	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
 	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
 	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
 	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -1402,6 +1404,13 @@  static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
 	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
 }
 
+static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has_vmfunc(vmcs12) &&
+		(vmcs12->vm_function_control &
+		 VMX_VMFUNC_EPTP_SWITCHING);
+}
+
 static inline bool is_nmi(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2791,7 +2800,12 @@  static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	if (cpu_has_vmx_vmfunc()) {
 		vmx->nested.nested_vmx_secondary_ctls_high |=
 			SECONDARY_EXEC_ENABLE_VMFUNC;
-		vmx->nested.nested_vmx_vmfunc_controls = 0;
+		/*
+		 * Advertise EPTP switching unconditionally
+		 * since we emulate it
+		 */
+		vmx->nested.nested_vmx_vmfunc_controls =
+			VMX_VMFUNC_EPTP_SWITCHING;
 	}
 
 	/*
@@ -7772,6 +7786,9 @@  static int handle_vmfunc(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12;
 	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+	struct page *page = NULL;
+	u64 *l1_eptp_list, address;
 
 	/*
 	 * VMFUNC is only supported for nested guests, but we always enable the
@@ -7784,11 +7801,46 @@  static int handle_vmfunc(struct kvm_vcpu *vcpu)
 	}
 
 	vmcs12 = get_vmcs12(vcpu);
-	if ((vmcs12->vm_function_control & (1 << function)) == 0)
+	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
+	    WARN_ON_ONCE(function))
+		goto fail;
+
+	if (!nested_cpu_has_ept(vmcs12) ||
+	    !nested_cpu_has_eptp_switching(vmcs12))
+		goto fail;
+
+	if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
+		goto fail;
+
+	page = nested_get_page(vcpu, vmcs12->eptp_list_address);
+	if (!page)
 		goto fail;
-	WARN_ONCE(1, "VMCS12 VM function control should have been zero");
+
+	l1_eptp_list = kmap(page);
+	address = l1_eptp_list[index];
+	if (!address)
+		goto fail;
+	/*
+	 * If the (L2) guest does a vmfunc to the currently
+	 * active ept pointer, we don't have to do anything else
+	 */
+	if (vmcs12->ept_pointer != address) {
+		if (address >> cpuid_maxphyaddr(vcpu) ||
+		    !IS_ALIGNED(address, 4096))
+			goto fail;
+		kvm_mmu_unload(vcpu);
+		vmcs12->ept_pointer = address;
+		kvm_mmu_reload(vcpu);
+		kunmap(page);
+		nested_release_page_clean(page);
+	}
+	return kvm_skip_emulated_instruction(vcpu);
 
 fail:
+	if (page) {
+		kunmap(page);
+		nested_release_page_clean(page);
+	}
 	nested_vmx_vmexit(vcpu, vmx->exit_reason,
 			  vmcs_read32(VM_EXIT_INTR_INFO),
 			  vmcs_readl(EXIT_QUALIFICATION));