diff mbox

Nested virtualization off VMware vSphere 6.0 with EL6 guests crashes on Xen 4.6

Message ID 20160115213958.GA16118@char.us.oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Konrad Rzeszutek Wilk Jan. 15, 2016, 9:39 p.m. UTC
On Tue, Jan 12, 2016 at 02:22:03AM -0700, Jan Beulich wrote:
> >>> On 12.01.16 at 04:38, <konrad.wilk@oracle.com> wrote:
> > (XEN) Assertion 'vapic_pg && !p2m_is_paging(p2mt)' failed at vvmx.c:698
> > (XEN) ----[ Xen-4.6.0  x86_64  debug=y  Tainted:    C ]----
> > (XEN) CPU:    39
> > (XEN) RIP:    e008:[<ffff82d0801ed053>] virtual_vmentry+0x487/0xac9
> > (XEN) RFLAGS: 0000000000010246   CONTEXT: hypervisor (d1v3)
> > (XEN) rax: 0000000000000000   rbx: ffff83007786c000   rcx: 0000000000000000
> > (XEN) rdx: 0000000000000e00   rsi: 000fffffffffffff   rdi: ffff83407f81e010
> > (XEN) rbp: ffff834008a47ea8   rsp: ffff834008a47e38   r8: 0000000000000000
> > (XEN) r9:  0000000000000000   r10: 0000000000000000   r11: 0000000000000000
> > (XEN) r12: 0000000000000000   r13: ffff82c000341000   r14: ffff834008a47f18
> > (XEN) r15: ffff83407f7c4000   cr0: 0000000080050033   cr4: 00000000001526e0
> > (XEN) cr3: 000000407fb22000   cr2: 0000000000000000
> > (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
> > (XEN) Xen stack trace from rsp=ffff834008a47e38:
> > (XEN)    ffff834008a47e68 ffff82d0801d2cde ffff834008a47e68 0000000000000d00
> > (XEN)    0000000000000000 0000000000000000 ffff834008a47e88 00000004801cc30e
> > (XEN)    ffff83007786c000 ffff83007786c000 ffff834008a40000 0000000000000000
> > (XEN)    ffff834008a47f18 0000000000000000 ffff834008a47f08 ffff82d0801edf94
> > (XEN)    ffff834008a47ef8 0000000000000000 ffff834008f62000 ffff834008a47f18
> > (XEN)    000000ae8c99eb8d ffff83007786c000 0000000000000000 0000000000000000
> > (XEN)    0000000000000000 0000000000000000 0000000000000000 ffff82d0801ee2ab
> > (XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > (XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > (XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > (XEN)    00000000078bfbff 0000000000000000 0000000000000000 0000beef0000beef
> > (XEN)    fffffffffc4b3440 000000bf0000beef 0000000000040046 fffffffffc607f00
> > (XEN)    000000000000beef 000000000000beef 000000000000beef 000000000000beef
> > (XEN)    000000000000beef 0000000000000027 ffff83007786c000 0000006f88716300
> > (XEN)    0000000000000000
> > (XEN) Xen call trace:
> > (XEN)    [<ffff82d0801ed053>] virtual_vmentry+0x487/0xac9
> > (XEN)    [<ffff82d0801edf94>] nvmx_switch_guest+0x8ff/0x915
> > (XEN)    [<ffff82d0801ee2ab>] vmx_asm_vmexit_handler+0x4b/0xc0
> > (XEN)
> > (XEN)
> > (XEN) ****************************************
> > (XEN) Panic on CPU 39:
> > (XEN) Assertion 'vapic_pg && !p2m_is_paging(p2mt)' failed at vvmx.c:698
> > (XEN) ****************************************
> > (XEN)
> > 
> > ..and then to my surprise the hypervisor stopped hitting this.
> 
> Since we can (I hope) pretty much exclude a paging type, the
> ASSERT() must have triggered because of vapic_pg being NULL.
> That might be verifiable without extra printk()s, just by checking
> the disassembly (assuming the value sits in a register). In which
> case vapic_gpfn would be of interest too.

The vapic_gpfn is 0xffffffffffff.

To be exact:

nvmx_update_virtual_apic_address:vCPU0 0xffffffffffffffff(vAPIC) 0x0(APIC), 0x0(TPR) ctrl=b5b9effe

Based on this:



Got me:
(XEN) stdvga.c:151:d1v0 leaving stdvga mode
(XEN) stdvga.c:147:d1v0 entering stdvga and caching modes
(XEN) stdvga.c:520:d1v0 leaving caching mode
(XEN) vvmx.c:2491:d1v0 Unknown nested vmexit reason 80000021.
(XEN) Failed vm entry (exit reason 0x80000021) caused by invalid guest state (0).
(XEN) ************* VMCS Area **************
(XEN) *** Guest State ***
(XEN) CR0: actual=0x0000000000000030, shadow=0x0000000000000000, gh_mask=ffffffffffffffff
(XEN) CR4: actual=0x0000000000002050, shadow=0x0000000000000000, gh_mask=ffffffffffffffff
(XEN) CR3 = 0x00000000800ed000
(XEN) RSP = 0x0000000000000000 (0x0000000000000000)  RIP = 0x0000000000000000 (0x0000000000000000)
(XEN) RFLAGS=0x00000002 (0x00000002)  DR7 = 0x0000000000000400
(XEN) Sysenter RSP=0000000000000000 CS:RIP=0000:0000000000000000
(XEN)        sel  attr  limit   base
(XEN)   CS: 0000 00000 00000000 0000000000000000
(XEN)   DS: 0000 00000 00000000 0000000000000000
(XEN)   SS: 0000 00000 00000000 0000000000000000
(XEN)   ES: 0000 00000 00000000 0000000000000000
(XEN)   FS: 0000 00000 00000000 0000000000000000
(XEN)   GS: 0000 00000 00000000 0000000000000000
(XEN) GDTR:            00000000 0000000000000000
(XEN) LDTR: 0000 00000 00000000 0000000000000000
(XEN) IDTR:            00000000 0000000000000000
(XEN)   TR: 0000 00000 00000000 0000000000000000
(XEN) EFER = 0x0000000000000800  PAT = 0x0000000000000000
(XEN) PreemptionTimer = 0x00000000  SM Base = 0x00000000
(XEN) DebugCtl = 0x0000000000000000  DebugExceptions = 0x0000000000000000
(XEN) Interruptibility = 00000000  ActivityState = 00000000
(XEN) *** Host State ***
(XEN) RIP = 0xffff82d0801ee3a0 (vmx_asm_vmexit_handler)  RSP = 0xffff8340077d7f90
(XEN) CS=e008 SS=0000 DS=0000 ES=0000 FS=0000 GS=0000 TR=e040
(XEN) FSBase=0000000000000000 GSBase=0000000000000000 TRBase=ffff8340077dfc00
(XEN) GDTBase=ffff8340077d0000 IDTBase=ffff8340077dc000
(XEN) CR0=0000000080050033 CR3=000000400076c000 CR4=00000000001526e0
(XEN) Sysenter RSP=ffff8340077d7fc0 CS:RIP=e008:ffff82d080238870
(XEN) EFER = 0x0000000000000000  PAT = 0x0000050100070406
(XEN) *** Control State ***
(XEN) PinBased=0000003f CPUBased=b5b9effe SecondaryExec=000054eb
(XEN) EntryControls=000011fb ExitControls=001fefff
(XEN) ExceptionBitmap=00062042 PFECmask=00000000 PFECmatch=ffffffff
(XEN) VMEntry: intr_info=00000000 errcode=00000000 ilen=00000000
(XEN) VMExit: intr_info=00000000 errcode=00000000 ilen=00000006
(XEN)         reason=80000021 qualification=0000000000000000
(XEN) IDTVectoring: info=00000000 errcode=00000000
(XEN) TSC Offset = 0xfffd34adb2c3a149
(XEN) TPR Threshold = 0x00  PostedIntrVec = 0x00
(XEN) EPT pointer = 0x000000400079a01e  EPTP index = 0x0000
(XEN) PLE Gap=00000080 Window=00001000
(XEN) Virtual processor ID = 0x004e VMfunc controls = 0000000000000000
(XEN) **************************************
(XEN) domain_crash called from vmx.c:2729
(XEN) Domain 1 (vcpu#0) crashed on cpu#21:
(XEN) ----[ Xen-4.6.0  x86_64  debug=y  Tainted:    C ]----
(XEN) CPU:    21
(XEN) RIP:    0000:[<0000000000000000>]
(XEN) RFLAGS: 0000000000000002   CONTEXT: hvm guest (d1v0)
(XEN) rax: 0000000000000000   rbx: 0000000000000000   rcx: 0000000000000000
(XEN) rdx: 00000000078bfbff   rsi: 0000000000000000   rdi: 0000000000000000
(XEN) rbp: 0000000000000000   rsp: 0000000000000000   r8:  0000000000000000
(XEN) r9:  0000000000000000   r10: 0000000000000000   r11: 0000000000000000
(XEN) r12: 0000000000000000   r13: 0000000000000000   r14: 0000000000000000
(XEN) r15: 0000000000000000   cr0: 0000000000000010   cr4: 0000000000000000
(XEN) cr3: 00000000800ed000   cr2: 0000000000000000
(XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: 0000

..

> 
> Anyway, the writing of the respective VMCS field to zero in the
> alternative worries me a little: Aren't we risking MFN zero to be
> wrongly accessed due to this?
> 
> Furthermore, nvmx_update_apic_access_address() having a
> similar ASSERT() seems entirely wrong: The APIC access
> page doesn't really need to match up with any actual page
> belonging to the guest - a guest could choose to point this
> into no-where (note that we've been at least considering this
> option recently for our own purposes, in the context of
> http://lists.xenproject.org/archives/html/xen-devel/2015-12/msg02191.html).
> 
> > Instead I started getting an even more bizzare crash:

Ignore this part please.
.. snip..
> this doesn't match the call stack. Something's pretty fishy here.

Yes. The hypervisor was modified alongside me and I hadn't connected
the dots...
> 
> Jan

Comments

Jan Beulich Jan. 18, 2016, 9:41 a.m. UTC | #1
>>> On 15.01.16 at 22:39, <konrad.wilk@oracle.com> wrote:
> On Tue, Jan 12, 2016 at 02:22:03AM -0700, Jan Beulich wrote:
>> Since we can (I hope) pretty much exclude a paging type, the
>> ASSERT() must have triggered because of vapic_pg being NULL.
>> That might be verifiable without extra printk()s, just by checking
>> the disassembly (assuming the value sits in a register). In which
>> case vapic_gpfn would be of interest too.
> 
> The vapic_gpfn is 0xffffffffffff.
> 
> To be exact:
> 
> nvmx_update_virtual_apic_address:vCPU0 0xffffffffffffffff(vAPIC) 0x0(APIC), 0x0(TPR) ctrl=b5b9effe
> 
> Based on this:
> 
> diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
> index cb6f9b8..8a0abfc 100644
> --- a/xen/arch/x86/hvm/vmx/vvmx.c
> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> @@ -695,7 +695,15 @@ static void nvmx_update_virtual_apic_address(struct vcpu *v)
>  
>          vapic_gpfn = __get_vvmcs(nvcpu->nv_vvmcx, VIRTUAL_APIC_PAGE_ADDR) >> PAGE_SHIFT;
>          vapic_pg = get_page_from_gfn(v->domain, vapic_gpfn, &p2mt, P2M_ALLOC);
> -        ASSERT(vapic_pg && !p2m_is_paging(p2mt));
> +       if ( !vapic_pg ) {
> +               printk("%s:vCPU%d 0x%lx(vAPIC) 0x%lx(APIC), 0x%lx(TPR) ctrl=%x\n", __func__,v->vcpu_id,
> +                       __get_vvmcs(nvcpu->nv_vvmcx, VIRTUAL_APIC_PAGE_ADDR),
> +                       __get_vvmcs(nvcpu->nv_vvmcx, APIC_ACCESS_ADDR),
> +                       __get_vvmcs(nvcpu->nv_vvmcx, TPR_THRESHOLD),
> +                       ctrl);
> +       }
> +        ASSERT(vapic_pg);
> +       ASSERT(vapic_pg && !p2m_is_paging(p2mt));
>          __vmwrite(VIRTUAL_APIC_PAGE_ADDR, page_to_maddr(vapic_pg));
>          put_page(vapic_pg);
>      }

Interesting: I can't see VIRTUAL_APIC_PAGE_ADDR to be written
with all ones anywhere, neither for the real VMCS nor for the virtual
one (page_to_maddr() can't, afaict, return such a value). Could you
check where the L1 guest itself is writing that value, or whether it
fails to initialize that field and it happens to start out as all ones?

>> What looks odd to me is the connection between
>> CPU_BASED_TPR_SHADOW being set and the use of a (valid)
>> virtual APIC page: Wouldn't this rather need to depend on
>> SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES, just like in
>> nvmx_update_apic_access_address()?
> 
> Could be. I added in an read for the secondary control:
> 
> nvmx_update_virtual_apic_address:vCPU2 0xffffffffffffffff(vAPIC) 0x0(APIC), 
> 0x0(TPR) ctrl=b5b9effe sec=0
> 
> So trying your recommendation:
> diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
> index cb6f9b8..d291c91 100644
> --- a/xen/arch/x86/hvm/vmx/vvmx.c
> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> @@ -686,8 +686,8 @@ static void nvmx_update_virtual_apic_address(struct vcpu *v)
>      struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
>      u32 ctrl;
>  
> -    ctrl = __n2_exec_control(v);
> -    if ( ctrl & CPU_BASED_TPR_SHADOW )
> +    ctrl = __n2_secondary_exec_control(v);
> +    if ( ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES )
>      {
>          p2m_type_t p2mt;
>          unsigned long vapic_gpfn;
> 
> 
> Got me:
> (XEN) stdvga.c:151:d1v0 leaving stdvga mode
> (XEN) stdvga.c:147:d1v0 entering stdvga and caching modes
> (XEN) stdvga.c:520:d1v0 leaving caching mode
> (XEN) vvmx.c:2491:d1v0 Unknown nested vmexit reason 80000021.
> (XEN) Failed vm entry (exit reason 0x80000021) caused by invalid guest state 

Interesting. I've just noticed that a similar odd looking (to me)
dependency exists in construct_vmcs(). Perhaps I've overlooked
something in the SDM. In any event I think some words from the
VMX maintainers would be quite nice here.

Sadly the VMCS dump doesn't include the two APIC related
addresses...

Jan
Konrad Rzeszutek Wilk Feb. 2, 2016, 10:05 p.m. UTC | #2
On Mon, Jan 18, 2016 at 02:41:52AM -0700, Jan Beulich wrote:
> >>> On 15.01.16 at 22:39, <konrad.wilk@oracle.com> wrote:
> > On Tue, Jan 12, 2016 at 02:22:03AM -0700, Jan Beulich wrote:
> >> Since we can (I hope) pretty much exclude a paging type, the
> >> ASSERT() must have triggered because of vapic_pg being NULL.
> >> That might be verifiable without extra printk()s, just by checking
> >> the disassembly (assuming the value sits in a register). In which
> >> case vapic_gpfn would be of interest too.
> > 
> > The vapic_gpfn is 0xffffffffffff.
> > 
> > To be exact:
> > 
> > nvmx_update_virtual_apic_address:vCPU0 0xffffffffffffffff(vAPIC) 0x0(APIC), 0x0(TPR) ctrl=b5b9effe
> > 
> > Based on this:
> > 
> > diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
> > index cb6f9b8..8a0abfc 100644
> > --- a/xen/arch/x86/hvm/vmx/vvmx.c
> > +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> > @@ -695,7 +695,15 @@ static void nvmx_update_virtual_apic_address(struct vcpu *v)
> >  
> >          vapic_gpfn = __get_vvmcs(nvcpu->nv_vvmcx, VIRTUAL_APIC_PAGE_ADDR) >> PAGE_SHIFT;
> >          vapic_pg = get_page_from_gfn(v->domain, vapic_gpfn, &p2mt, P2M_ALLOC);
> > -        ASSERT(vapic_pg && !p2m_is_paging(p2mt));
> > +       if ( !vapic_pg ) {
> > +               printk("%s:vCPU%d 0x%lx(vAPIC) 0x%lx(APIC), 0x%lx(TPR) ctrl=%x\n", __func__,v->vcpu_id,
> > +                       __get_vvmcs(nvcpu->nv_vvmcx, VIRTUAL_APIC_PAGE_ADDR),
> > +                       __get_vvmcs(nvcpu->nv_vvmcx, APIC_ACCESS_ADDR),
> > +                       __get_vvmcs(nvcpu->nv_vvmcx, TPR_THRESHOLD),
> > +                       ctrl);
> > +       }
> > +        ASSERT(vapic_pg);
> > +       ASSERT(vapic_pg && !p2m_is_paging(p2mt));
> >          __vmwrite(VIRTUAL_APIC_PAGE_ADDR, page_to_maddr(vapic_pg));
> >          put_page(vapic_pg);
> >      }
> 
> Interesting: I can't see VIRTUAL_APIC_PAGE_ADDR to be written
> with all ones anywhere, neither for the real VMCS nor for the virtual
> one (page_to_maddr() can't, afaict, return such a value). Could you
> check where the L1 guest itself is writing that value, or whether it
> fails to initialize that field and it happens to start out as all ones?

This is getting more and more bizarre.

I realized that this machine has VMCS shadowing so Xen does not trap on
any vmwrite or vmread. Unless I update the VMCS shadowing bitmap - which
I did for vmwrite and vmread to get a better view of this. It never
traps on VIRTUAL_APIC_PAGE_ADDR accesses. It does trap on: VIRTUAL_PROCESSOR_ID,
VM_EXIT_MSR_LOAD_ADDR and GUEST_[ES,DS,FS,GS,TR]_SELECTORS.

(It may also trap on IO_BITMAP_A,B but I didn't print that out).

To confirm that the VMCS that will be given to the L2 guest is correct
I added some printk()s of some state that ought to be pretty OK, such
as HOST_RIP or HOST_RSP - which are all 0!

If I let the nvmx_update_virtual_apic_address keep on going without
modifying the VIRTUAL_APIC_PAGE_ADDR it later on crashes the nested
guest:

EN) nvmx_handle_vmwrite: 0                                                    
(XEN) nvmx_handle_vmwrite: 0                                                    
(XEN) nvmx_handle_vmwrite: 2008                                                 
(XEN) nvmx_handle_vmwrite: 2008                                                 
(XEN) nvmx_handle_vmwrite: 0                                                    
(XEN) nvmx_handle_vmwrite: 2008                                                 
(XEN) nvmx_handle_vmwrite: 0                                                    
(XEN) nvmx_handle_vmwrite: 2008                                                 
(XEN) nvmx_handle_vmwrite: 2008                                                 
(XEN) nvmx_handle_vmwrite: 2008                                                 
(XEN) nvmx_handle_vmwrite: 2008                                                 
(XEN) nvmx_handle_vmwrite: 2008                                                 
(XEN) nvmx_handle_vmwrite: 800                                                  
(XEN) nvmx_handle_vmwrite: 804                                                  
(XEN) nvmx_handle_vmwrite: 806                                                  
(XEN) nvmx_handle_vmwrite: 80a                                                  
(XEN) nvmx_handle_vmwrite: 80e                                                  
(XEN) nvmx_update_virtual_apic_address: vCPU1 0xffffffffffffffff(vAPIC) 0x0(APIC), 0x0(TPR) ctrl=b5b9effe sec=0 
(XEN) nvmx_update_virtual_apic_address: TPR threshold = 0x0 updated 0.          
(XEN) nvmx_update_virtual_apic_address: Virtual APIC = 0x0 updated 0.           
(XEN) nvmx_update_virtual_apic_address: APIC address = 0x0 updated 0.           
(XEN) HOST_RIP=0x0 HOST_RSP=0x0                                                 
(XEN) <vm_launch_fail> error code 7                                             
(XEN) domain_crash_sync called from vmcs.c:1597                                 
(XEN) Domain 1 (vcpu#1) crashed on cpu#37:                                      
(XEN) ----[ Xen-4.6.0  x86_64  debug=n  Tainted:    C ]----                     
(XEN) CPU:    37                                                                
(XEN) RIP:    0000:[<0000000000000000>]                                         
(XEN) RFLAGS: 0000000000000000   CONTEXT: hvm guest (d1v1)                      
(XEN) rax: ffff82d08010648b   rbx: ffff8340007fb000   rcx: 0000000000000000     
(XEN) rdx: ffff82d0801ddf5f   rsi: 0000000000000000   rdi: ffff82d0801ebd6a     
(XEN) rbp: ffff82d08018cb09   rsp: 0000000000000000   r8:  0000000000000000     
(XEN) r9:  ffff834007980000   r10: 000000000000063d   r11: ffff82d080106465     
(XEN) r12: 0000000000000000   r13: 0000000000000000   r14: 0000000000000000     
(XEN) r15: ffff834007980000   cr0: 0000000000000010   cr4: 0000000000000000     
(XEN) cr3: 00000000efd06000   cr2: 0000000000000000                             
(XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: 0000          

which should be no surprise as the VMCS is corrupt.

I need to do some more double-checking to see how it is possible
for this VMCS to get so messed up.

And of course if I run Xen under Xen with HVM guests - it works fine.
Jan Beulich Feb. 3, 2016, 9:34 a.m. UTC | #3
>>> On 02.02.16 at 23:05, <konrad.wilk@oracle.com> wrote:
> This is getting more and more bizzare.
> 
> I realized that this machine has VMCS shadowing so Xen does not trap on
> any vmwrite or vmread. Unless I update the VMCS shadowing bitmap - which
> I did for vmwrite and vmread to get a better view of this. It never
> traps on VIRTUAL_APIC_PAGE_ADDR accesses. It does trap on: 
> VIRTUAL_PROCESSOR_ID,
> VM_EXIT_MSR_LOAD_ADDR and GUEST_[ES,DS,FS,GS,TR]_SELECTORS.
> 
> (It may also trap on IO_BITMAP_A,B but I didn't print that out).
> 
> To confirm that the VMCS that will be given to the L2 guest is correct
> I added some printking of some states that ought to be pretty OK such
> as HOST_RIP or HOST_RSP - which are all 0!

But did you also check what the field of interest starts out as?

> If I let the nvmx_update_virtual_apic_address keep on going without
> modifying the VIRTUAL_APIC_PAGE_ADDR it later on crashes the nested
> guest:
> 
> EN) nvmx_handle_vmwrite: 0                                                   

The missing characters at the beginning may just be a copy-and-
paste mistake, but they could also indicate a truncated log. Can
you clarify which of the two it is?

> (XEN) nvmx_handle_vmwrite: 0                                                 
> (XEN) nvmx_handle_vmwrite: 2008                                              
> (XEN) nvmx_handle_vmwrite: 2008                                              
> (XEN) nvmx_handle_vmwrite: 0                                                 
> (XEN) nvmx_handle_vmwrite: 2008                                              
> (XEN) nvmx_handle_vmwrite: 0                                                 
> (XEN) nvmx_handle_vmwrite: 2008                                              
> (XEN) nvmx_handle_vmwrite: 2008                                              
> (XEN) nvmx_handle_vmwrite: 2008                                              
> (XEN) nvmx_handle_vmwrite: 2008                                              
> (XEN) nvmx_handle_vmwrite: 2008                                              
> (XEN) nvmx_handle_vmwrite: 800                                               
> (XEN) nvmx_handle_vmwrite: 804                                               
> (XEN) nvmx_handle_vmwrite: 806                                               
> (XEN) nvmx_handle_vmwrite: 80a                                               
> (XEN) nvmx_handle_vmwrite: 80e                                               
> (XEN) nvmx_update_virtual_apic_address: vCPU1 0xffffffffffffffff(vAPIC) 0x0(APIC), 0x0(TPR) ctrl=b5b9effe sec=0 

Assuming the field starts out as other than all ones, could you check
its value on each of the intercepted VMWRITEs, to at least narrow
when it changes.

Kevin, Jun - are there any cases where the hardware would alter
this field's value? Like during some guest side LAPIC manipulations?
(The same monitoring as suggested during VMWRITEs could of
course also be added to LAPIC accesses visible to the hypervisor,
but I guess there won't be too many of those.)

Jan
Konrad Rzeszutek Wilk Feb. 3, 2016, 3:07 p.m. UTC | #4
On Wed, Feb 03, 2016 at 02:34:47AM -0700, Jan Beulich wrote:
> >>> On 02.02.16 at 23:05, <konrad.wilk@oracle.com> wrote:
> > This is getting more and more bizzare.
> > 
> > I realized that this machine has VMCS shadowing so Xen does not trap on
> > any vmwrite or vmread. Unless I update the VMCS shadowing bitmap - which
> > I did for vmwrite and vmread to get a better view of this. It never
> > traps on VIRTUAL_APIC_PAGE_ADDR accesses. It does trap on: 
> > VIRTUAL_PROCESSOR_ID,
> > VM_EXIT_MSR_LOAD_ADDR and GUEST_[ES,DS,FS,GS,TR]_SELECTORS.
> > 
> > (It may also trap on IO_BITMAP_A,B but I didn't print that out).
> > 
> > To confirm that the VMCS that will be given to the L2 guest is correct
> > I added some printking of some states that ought to be pretty OK such
> > as HOST_RIP or HOST_RSP - which are all 0!
> 
> But did you also check what the field of interest starts out as?

I will do that.
> 
> > If I let the nvmx_update_virtual_apic_address keep on going without
> > modifying the VIRTUAL_APIC_PAGE_ADDR it later on crashes the nested
> > guest:
> > 
> > EN) nvmx_handle_vmwrite: 0                                                   
> 
> The missing characters at the beginning may just be a copy-and-
> paste mistake, but they could also indicate a truncated log. Can
> you clarify which of the two it is?

Just a copy-n-paste error. Nothing of interest before that:
(d1)   NULL                                                                        
(d1) Booting from Hard Disk...                                                     
(d1) Booting from 0000:7c00                                                        
(XEN) nvmx_handle_vmwrite: 0                                                       
(XEN) nvmx_handle_vmwrite: 0                
..
> 
> > (XEN) nvmx_handle_vmwrite: 0                                                 
> > (XEN) nvmx_handle_vmwrite: 2008                                              
> > (XEN) nvmx_handle_vmwrite: 2008                                              
> > (XEN) nvmx_handle_vmwrite: 0                                                 
> > (XEN) nvmx_handle_vmwrite: 2008                                              
> > (XEN) nvmx_handle_vmwrite: 0                                                 
> > (XEN) nvmx_handle_vmwrite: 2008                                              
> > (XEN) nvmx_handle_vmwrite: 2008                                              
> > (XEN) nvmx_handle_vmwrite: 2008                                              
> > (XEN) nvmx_handle_vmwrite: 2008                                              
> > (XEN) nvmx_handle_vmwrite: 2008                                              
> > (XEN) nvmx_handle_vmwrite: 800                                               
> > (XEN) nvmx_handle_vmwrite: 804                                               
> > (XEN) nvmx_handle_vmwrite: 806                                               
> > (XEN) nvmx_handle_vmwrite: 80a                                               
> > (XEN) nvmx_handle_vmwrite: 80e                                               
> > (XEN) nvmx_update_virtual_apic_address: vCPU1 0xffffffffffffffff(vAPIC) 0x0(APIC), 0x0(TPR) ctrl=b5b9effe sec=0 
> 
> Assuming the field starts out as other than all ones, could you check
> its value on each of the intercepted VMWRITEs, to at least narrow
> when it changes.

Yes of course.
> 
> Kevin, Jun - are there any cases where the hardware would alter
> this field's value? Like during some guest side LAPIC manipulations?
> (The same monitoring as suggested during VMWRITEs could of
> course also be added to LAPIC accesses visible to the hypervisor,
> but I guess there won't be too many of those.)
> 
> Jan
>
Tian, Kevin Feb. 4, 2016, 5:52 a.m. UTC | #5
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: Wednesday, February 03, 2016 5:35 PM
> > (XEN) nvmx_handle_vmwrite: 0
> > (XEN) nvmx_handle_vmwrite: 2008
> > (XEN) nvmx_handle_vmwrite: 2008
> > (XEN) nvmx_handle_vmwrite: 0
> > (XEN) nvmx_handle_vmwrite: 2008
> > (XEN) nvmx_handle_vmwrite: 0
> > (XEN) nvmx_handle_vmwrite: 2008
> > (XEN) nvmx_handle_vmwrite: 2008
> > (XEN) nvmx_handle_vmwrite: 2008
> > (XEN) nvmx_handle_vmwrite: 2008
> > (XEN) nvmx_handle_vmwrite: 2008
> > (XEN) nvmx_handle_vmwrite: 800
> > (XEN) nvmx_handle_vmwrite: 804
> > (XEN) nvmx_handle_vmwrite: 806
> > (XEN) nvmx_handle_vmwrite: 80a
> > (XEN) nvmx_handle_vmwrite: 80e
> > (XEN) nvmx_update_virtual_apic_address: vCPU1 0xffffffffffffffff(vAPIC) 0x0(APIC),
> 0x0(TPR) ctrl=b5b9effe sec=0
> 
> Assuming the field starts out as other than all ones, could you check
> its value on each of the intercepted VMWRITEs, to at least narrow
> when it changes.
> 
> Kevin, Jun - are there any cases where the hardware would alter
> this field's value? Like during some guest side LAPIC manipulations?
> (The same monitoring as suggested during VMWRITEs could of
> course also be added to LAPIC accesses visible to the hypervisor,
> but I guess there won't be too many of those.)
> 

No such case to my knowledge. But let me confirm with the hardware team.

Thanks
Kevin
Tian, Kevin Feb. 17, 2016, 2:54 a.m. UTC | #6
> From: Tian, Kevin
> Sent: Thursday, February 04, 2016 1:52 PM
> 
> > From: Jan Beulich [mailto:JBeulich@suse.com]
> > Sent: Wednesday, February 03, 2016 5:35 PM
> > > (XEN) nvmx_handle_vmwrite: 0
> > > (XEN) nvmx_handle_vmwrite: 2008
> > > (XEN) nvmx_handle_vmwrite: 2008
> > > (XEN) nvmx_handle_vmwrite: 0
> > > (XEN) nvmx_handle_vmwrite: 2008
> > > (XEN) nvmx_handle_vmwrite: 0
> > > (XEN) nvmx_handle_vmwrite: 2008
> > > (XEN) nvmx_handle_vmwrite: 2008
> > > (XEN) nvmx_handle_vmwrite: 2008
> > > (XEN) nvmx_handle_vmwrite: 2008
> > > (XEN) nvmx_handle_vmwrite: 2008
> > > (XEN) nvmx_handle_vmwrite: 800
> > > (XEN) nvmx_handle_vmwrite: 804
> > > (XEN) nvmx_handle_vmwrite: 806
> > > (XEN) nvmx_handle_vmwrite: 80a
> > > (XEN) nvmx_handle_vmwrite: 80e
> > > (XEN) nvmx_update_virtual_apic_address: vCPU1 0xffffffffffffffff(vAPIC)
> 0x0(APIC),
> > 0x0(TPR) ctrl=b5b9effe sec=0
> >
> > Assuming the field starts out as other than all ones, could you check
> > its value on each of the intercepted VMWRITEs, to at least narrow
> > when it changes.
> >
> > Kevin, Jun - are there any cases where the hardware would alter
> > this field's value? Like during some guest side LAPIC manipulations?
> > (The same monitoring as suggested during VMWRITEs could of
> > course also be added to LAPIC accesses visible to the hypervisor,
> > but I guess there won't be too many of those.)
> >
> 
> No such case in my knowledge. But let me confirm with hardware team.
> 

Confirmed no such case.

Thanks
Kevin
diff mbox

Patch

diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index cb6f9b8..8a0abfc 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -695,7 +695,15 @@  static void nvmx_update_virtual_apic_address(struct vcpu *v)
 
         vapic_gpfn = __get_vvmcs(nvcpu->nv_vvmcx, VIRTUAL_APIC_PAGE_ADDR) >> PAGE_SHIFT;
         vapic_pg = get_page_from_gfn(v->domain, vapic_gpfn, &p2mt, P2M_ALLOC);
-        ASSERT(vapic_pg && !p2m_is_paging(p2mt));
+       if ( !vapic_pg ) {
+               printk("%s:vCPU%d 0x%lx(vAPIC) 0x%lx(APIC), 0x%lx(TPR) ctrl=%x\n", __func__,v->vcpu_id,
+                       __get_vvmcs(nvcpu->nv_vvmcx, VIRTUAL_APIC_PAGE_ADDR),
+                       __get_vvmcs(nvcpu->nv_vvmcx, APIC_ACCESS_ADDR),
+                       __get_vvmcs(nvcpu->nv_vvmcx, TPR_THRESHOLD),
+                       ctrl);
+       }
+        ASSERT(vapic_pg);
+       ASSERT(vapic_pg && !p2m_is_paging(p2mt));
         __vmwrite(VIRTUAL_APIC_PAGE_ADDR, page_to_maddr(vapic_pg));
         put_page(vapic_pg);
     }

> 
> What looks odd to me is the connection between
> CPU_BASED_TPR_SHADOW being set and the use of a (valid)
> virtual APIC page: Wouldn't this rather need to depend on
> SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES, just like in
> nvmx_update_apic_access_address()?

Could be. I added a read of the secondary control:

nvmx_update_virtual_apic_address:vCPU2 0xffffffffffffffff(vAPIC) 0x0(APIC), 0x0(TPR) ctrl=b5b9effe sec=0

So trying your recommendation:
diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index cb6f9b8..d291c91 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -686,8 +686,8 @@  static void nvmx_update_virtual_apic_address(struct vcpu *v)
     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
     u32 ctrl;
 
-    ctrl = __n2_exec_control(v);
-    if ( ctrl & CPU_BASED_TPR_SHADOW )
+    ctrl = __n2_secondary_exec_control(v);
+    if ( ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES )
     {
         p2m_type_t p2mt;
         unsigned long vapic_gpfn;