diff mbox

[RFC,01/12] x86/paging: introduce paging_set_allocation

Message ID 1469809747-11176-2-git-send-email-roger.pau@citrix.com (mailing list archive)
State New, archived
Headers show

Commit Message

Roger Pau Monné July 29, 2016, 4:28 p.m. UTC
... and remove hap_set_alloc_for_pvh_dom0.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: George Dunlap <george.dunlap@eu.citrix.com>
Cc: Tim Deegan <tim@xen.org>
---
 xen/arch/x86/domain_build.c     |  7 +++----
 xen/arch/x86/mm/hap/hap.c       | 14 +-------------
 xen/arch/x86/mm/paging.c        | 16 ++++++++++++++++
 xen/arch/x86/mm/shadow/common.c |  5 ++---
 xen/include/asm-x86/hap.h       |  3 ++-
 xen/include/asm-x86/paging.h    |  3 +++
 xen/include/asm-x86/shadow.h    |  6 ++++++
 7 files changed, 33 insertions(+), 21 deletions(-)

Comments

Andrew Cooper July 29, 2016, 4:47 p.m. UTC | #1
On 29/07/16 17:28, Roger Pau Monne wrote:
> diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
> index 107fc8c..1b270df 100644
> --- a/xen/arch/x86/mm/paging.c
> +++ b/xen/arch/x86/mm/paging.c
> @@ -953,6 +953,22 @@ void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
>          safe_write_pte(p, new);
>  }
>  
> +int paging_set_allocation(struct domain *d, unsigned long pages)
> +{
> +    int rc;
> +
> +    ASSERT(paging_mode_enabled(d));
> +
> +    paging_lock(d);
> +    if ( hap_enabled(d) )
> +        rc = hap_set_allocation(d, pages, NULL);
> +    else
> +        rc = sh_set_allocation(d, pages, NULL);

(without looking at the rest of the series) Your NMI is probably a
watchdog timeout from this call, as passing NULL means "non-preemptible".

As this is for the construction of dom0, it would be better to take a
preemptible pointer, loop in construct_dom0(), with a
process_pending_softirqs() call in between.

> +    paging_unlock(d);
> +
> +    return rc;
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
> index c22362f..452e22e 100644
> --- a/xen/arch/x86/mm/shadow/common.c
> +++ b/xen/arch/x86/mm/shadow/common.c
> @@ -1604,9 +1604,8 @@ shadow_free_p2m_page(struct domain *d, struct page_info *pg)
>   * Input will be rounded up to at least shadow_min_acceptable_pages(),
>   * plus space for the p2m table.
>   * Returns 0 for success, non-zero for failure. */
> -static unsigned int sh_set_allocation(struct domain *d,
> -                                      unsigned int pages,
> -                                      int *preempted)
> +unsigned int sh_set_allocation(struct domain *d, unsigned int pages,
> +                               int *preempted)
>  {
>      struct page_info *sp;
>      unsigned int lower_bound;
> diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
> index c613836..e3c9c98 100644
> --- a/xen/include/asm-x86/hap.h
> +++ b/xen/include/asm-x86/hap.h
> @@ -46,7 +46,8 @@ int   hap_track_dirty_vram(struct domain *d,
>                             XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
>  
>  extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
> -void hap_set_alloc_for_pvh_dom0(struct domain *d, unsigned long num_pages);
> +unsigned int hap_set_allocation(struct domain *d, unsigned int pages,
> +				int *preempted);

I also note from this change that there is an unsigned long => unsigned
int truncation in the internals of *_set_allocation().  This should
definitely be fixed, although possibly wants to be a separate change.

~Andrew
Tim Deegan Aug. 1, 2016, 8:57 a.m. UTC | #2
Hi,

At 18:28 +0200 on 29 Jul (1469816936), Roger Pau Monne wrote:
> --- a/xen/arch/x86/mm/shadow/common.c
> +++ b/xen/arch/x86/mm/shadow/common.c
> @@ -1604,9 +1604,8 @@ shadow_free_p2m_page(struct domain *d, struct page_info *pg)
>   * Input will be rounded up to at least shadow_min_acceptable_pages(),
>   * plus space for the p2m table.
>   * Returns 0 for success, non-zero for failure. */
> -static unsigned int sh_set_allocation(struct domain *d,
> -                                      unsigned int pages,
> -                                      int *preempted)
> +unsigned int sh_set_allocation(struct domain *d, unsigned int pages,
> +                               int *preempted)

Making this non-static is fine by me.  Since you'll be respinning
anyway for Andrew's comments about can you please also move the full
comment above it to shadow.h so callers can see the rounding of inputs
&c.  It might be good to repeat some of that in the comment for
paging_set_allocation() too.

With that, Acked-by: Tim Deegan <tim@xen.org>

Cheers,

Tim.
Roger Pau Monné Aug. 2, 2016, 9:47 a.m. UTC | #3
On Fri, Jul 29, 2016 at 05:47:24PM +0100, Andrew Cooper wrote:
> On 29/07/16 17:28, Roger Pau Monne wrote:
> > diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
> > index 107fc8c..1b270df 100644
> > --- a/xen/arch/x86/mm/paging.c
> > +++ b/xen/arch/x86/mm/paging.c
> > @@ -953,6 +953,22 @@ void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
> >          safe_write_pte(p, new);
> >  }
> >  
> > +int paging_set_allocation(struct domain *d, unsigned long pages)
> > +{
> > +    int rc;
> > +
> > +    ASSERT(paging_mode_enabled(d));
> > +
> > +    paging_lock(d);
> > +    if ( hap_enabled(d) )
> > +        rc = hap_set_allocation(d, pages, NULL);
> > +    else
> > +        rc = sh_set_allocation(d, pages, NULL);
> 
> (without looking at the rest of the series) Your NMI is probably a
> watchdog timeout from this call, as passing NULL means "non-preemptible".

I don't think so, the NMI I saw happened while the guest was booting.

> As this is for the construction of dom0, it would be better to take a
> preemptible pointer, loop in construct_dom0(), with a
> process_pending_softirqs() call in between.

Now fixed.

> > +    paging_unlock(d);
> > +
> > +    return rc;
> > +}
> > +
> >  /*
> >   * Local variables:
> >   * mode: C
> > diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
> > index c22362f..452e22e 100644
> > --- a/xen/arch/x86/mm/shadow/common.c
> > +++ b/xen/arch/x86/mm/shadow/common.c
> > @@ -1604,9 +1604,8 @@ shadow_free_p2m_page(struct domain *d, struct page_info *pg)
> >   * Input will be rounded up to at least shadow_min_acceptable_pages(),
> >   * plus space for the p2m table.
> >   * Returns 0 for success, non-zero for failure. */
> > -static unsigned int sh_set_allocation(struct domain *d,
> > -                                      unsigned int pages,
> > -                                      int *preempted)
> > +unsigned int sh_set_allocation(struct domain *d, unsigned int pages,
> > +                               int *preempted)
> >  {
> >      struct page_info *sp;
> >      unsigned int lower_bound;
> > diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
> > index c613836..e3c9c98 100644
> > --- a/xen/include/asm-x86/hap.h
> > +++ b/xen/include/asm-x86/hap.h
> > @@ -46,7 +46,8 @@ int   hap_track_dirty_vram(struct domain *d,
> >                             XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
> >  
> >  extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
> > -void hap_set_alloc_for_pvh_dom0(struct domain *d, unsigned long num_pages);
> > +unsigned int hap_set_allocation(struct domain *d, unsigned int pages,
> > +				int *preempted);
> 
> I also note from this change that there is an unsigned long => unsigned
> int truncation in the internals of *_set_allocation().  This should
> definitely be fixed, although possibly wants to be a separate change.

Right, let me take a stab at this.

Roger.
Roger Pau Monné Aug. 2, 2016, 3:49 p.m. UTC | #4
On Tue, Aug 02, 2016 at 11:47:22AM +0200, Roger Pau Monne wrote:
> On Fri, Jul 29, 2016 at 05:47:24PM +0100, Andrew Cooper wrote:
> > On 29/07/16 17:28, Roger Pau Monne wrote:
> > > diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
> > > index 107fc8c..1b270df 100644
> > > --- a/xen/arch/x86/mm/paging.c
> > > +++ b/xen/arch/x86/mm/paging.c
> > > @@ -953,6 +953,22 @@ void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
> > >          safe_write_pte(p, new);
> > >  }
> > >  
> > > +int paging_set_allocation(struct domain *d, unsigned long pages)
> > > +{
> > > +    int rc;
> > > +
> > > +    ASSERT(paging_mode_enabled(d));
> > > +
> > > +    paging_lock(d);
> > > +    if ( hap_enabled(d) )
> > > +        rc = hap_set_allocation(d, pages, NULL);
> > > +    else
> > > +        rc = sh_set_allocation(d, pages, NULL);
> > 
> > (without looking at the rest of the series) Your NMI is probably a
> > watchdog timeout from this call, as passing NULL means "non-preemptible".
> 
> I don't think so, the NMI I saw happened while the guest was booting.
> 
> > As this is for the construction of dom0, it would be better to take a
> > preemptible pointer, loop in construct_dom0(), with a
> > process_pending_softirqs() call in between.
> 
> Now fixed.

Hm, I have to stand corrected, using hypercall_preempt_check (as 
any of the *_set_allocation function use), is not safe at this point:

(XEN) ----[ Xen-4.8-unstable  x86_64  debug=y  Tainted:    C  ]----
(XEN) CPU:    0
(XEN) RIP:    e008:[<ffff82d08022fd47>] hap.c#local_events_need_delivery+0x27/0x40
(XEN) RFLAGS: 0000000000010246   CONTEXT: hypervisor
(XEN) rax: 0000000000000000   rbx: ffff83023f5a5000   rcx: ffff82d080312900
(XEN) rdx: 0000000000000001   rsi: ffff83023f5a56c8   rdi: ffff8300b213d000
(XEN) rbp: ffff82d080307cc8   rsp: ffff82d080307cc8   r8:  0180000000000000
(XEN) r9:  0000000000000000   r10: 0000000000247000   r11: ffff82d08029a5b0
(XEN) r12: 0000000000000011   r13: 00000000000023ac   r14: ffff82d080307d4c
(XEN) r15: ffff83023f5a56c8   cr0: 000000008005003b   cr4: 00000000001526e0
(XEN) cr3: 00000000b20fc000   cr2: 0000000000000000
(XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
(XEN) Xen code around <ffff82d08022fd47> (hap.c#local_events_need_delivery+0x27/0x40):
(XEN)  0d ad fa ff 48 8b 47 08 <80> 38 00 74 09 80 78 01 00 0f 94 c0 eb 02 31 c0
(XEN) Xen stack trace from rsp=ffff82d080307cc8:
(XEN)    ffff82d080307d08 ffff82d08022fc47 0000000000000000 ffff83023f5a5000
(XEN)    ffff83023f5a5648 0000000000000000 ffff82d080307d4c 0000000000002400
(XEN)    ffff82d080307d38 ffff82d08020008c 00000000000ffffd ffff8300b1efd000
(XEN)    ffff83023f5a5000 ffff82d080307d4c ffff82d080307d78 ffff82d0802cad30
(XEN)    0000000000203000 ffff83023f5a5000 ffff82d0802bf860 0000000000000000
(XEN)    0000000000000001 ffff83000008bef0 ffff82d080307de8 ffff82d0802c91e0
(XEN)    ffff82d080307de8 ffff82d080143900 ffff82d080307de8 0000000000000000
(XEN)    ffff83000008bf00 ffff82d0802eb480 ffff82d080307dc4 ffff82d08028b1cd
(XEN)    ffff83000008bf00 0000000000000000 0000000000000001 ffff83023f5a5000
(XEN)    ffff82d080307f08 ffff82d0802bf0c9 0000000000000000 0000000000000000
(XEN)    0000000000000000 ffff82d080307f18 ffff83000008bee0 0000000000000001
(XEN)    0000000000000001 0000000000000001 0000000000000000 0000000000100000
(XEN)    0000000000000001 0000000000247000 ffff83000008bef4 0000000000100000
(XEN)    ffff830100000000 0000000000247001 0000000000000014 0000000100000000
(XEN)    ffff8300ffffffec ffff83000008bef0 ffff82d0802e0640 ffff83000008bfb0
(XEN)    0000000000000000 0000000000000000 0000000000000111 0000000800000000
(XEN)    000000010000006e 0000000000000003 00000000000002f8 0000000000000000
(XEN)    00000000ad5c0bd0 0000000000000000 0000000000000001 0000000000000008
(XEN)    0000000000000000 ffff82d080100073 0000000000000000 0000000000000000
(XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
(XEN) Xen call trace:
(XEN)    [<ffff82d08022fd47>] hap.c#local_events_need_delivery+0x27/0x40
(XEN)    [<ffff82d08022fc47>] hap_set_allocation+0x107/0x130
(XEN)    [<ffff82d08020008c>] paging_set_allocation+0x4c/0x80
(XEN)    [<ffff82d0802cad30>] domain_build.c#hvm_setup_p2m+0x70/0x1a0
(XEN)    [<ffff82d0802c91e0>] domain_build.c#construct_dom0_hvm+0x60/0x120
(XEN)    [<ffff82d0802bf0c9>] __start_xen+0x1ea9/0x23a0
(XEN)    [<ffff82d080100073>] __high_start+0x53/0x60
(XEN)
(XEN) Pagetable walk from 0000000000000000:
(XEN)  L4[0x000] = 0000000245233063 ffffffffffffffff
(XEN)  L3[0x000] = 0000000245232063 ffffffffffffffff
(XEN)  L2[0x000] = 0000000245231063 ffffffffffffffff
(XEN)  L1[0x000] = 0000000000000000 ffffffffffffffff
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 0:
(XEN) FATAL PAGE FAULT
(XEN) [error_code=0000]
(XEN) Faulting linear address: 0000000000000000
(XEN) ****************************************

I've tried setting current to d->vcpu[0], but that just makes the call to 
hypercall_preempt_check crash in some scheduler assert. In any case, I've 
added the preempt parameter to the paging_set_allocation function, but I 
don't plan to use it in the domain builder for the time being. Does that 
sound right?

Roger.
Jan Beulich Aug. 2, 2016, 4:12 p.m. UTC | #5
>>> On 02.08.16 at 17:49, <roger.pau@citrix.com> wrote:
> On Tue, Aug 02, 2016 at 11:47:22AM +0200, Roger Pau Monne wrote:
>> On Fri, Jul 29, 2016 at 05:47:24PM +0100, Andrew Cooper wrote:
>> > As this is for the construction of dom0, it would be better to take a
>> > preemptible pointer, loop in construct_dom0(), with a
>> > process_pending_softirqs() call in between.
>> 
>> Now fixed.
> 
> Hm, I have to stand corrected, using hypercall_preempt_check (as 
> any of the *_set_allocation function use), is not safe at this point:
> 
> (XEN) ----[ Xen-4.8-unstable  x86_64  debug=y  Tainted:    C  ]----
> (XEN) CPU:    0
> (XEN) RIP:    e008:[<ffff82d08022fd47>] hap.c#local_events_need_delivery+0x27/0x40
> (XEN) RFLAGS: 0000000000010246   CONTEXT: hypervisor
> (XEN) rax: 0000000000000000   rbx: ffff83023f5a5000   rcx: ffff82d080312900
> (XEN) rdx: 0000000000000001   rsi: ffff83023f5a56c8   rdi: ffff8300b213d000
> (XEN) rbp: ffff82d080307cc8   rsp: ffff82d080307cc8   r8:  0180000000000000
> (XEN) r9:  0000000000000000   r10: 0000000000247000   r11: ffff82d08029a5b0
> (XEN) r12: 0000000000000011   r13: 00000000000023ac   r14: ffff82d080307d4c
> (XEN) r15: ffff83023f5a56c8   cr0: 000000008005003b   cr4: 00000000001526e0
> (XEN) cr3: 00000000b20fc000   cr2: 0000000000000000
> (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
> (XEN) Xen code around <ffff82d08022fd47> (hap.c#local_events_need_delivery+0x27/0x40):
> (XEN)  0d ad fa ff 48 8b 47 08 <80> 38 00 74 09 80 78 01 00 0f 94 c0 eb 02 31 c0
> (XEN) Xen stack trace from rsp=ffff82d080307cc8:
> (XEN)    ffff82d080307d08 ffff82d08022fc47 0000000000000000 ffff83023f5a5000
> (XEN)    ffff83023f5a5648 0000000000000000 ffff82d080307d4c 0000000000002400
> (XEN)    ffff82d080307d38 ffff82d08020008c 00000000000ffffd ffff8300b1efd000
> (XEN)    ffff83023f5a5000 ffff82d080307d4c ffff82d080307d78 ffff82d0802cad30
> (XEN)    0000000000203000 ffff83023f5a5000 ffff82d0802bf860 0000000000000000
> (XEN)    0000000000000001 ffff83000008bef0 ffff82d080307de8 ffff82d0802c91e0
> (XEN)    ffff82d080307de8 ffff82d080143900 ffff82d080307de8 0000000000000000
> (XEN)    ffff83000008bf00 ffff82d0802eb480 ffff82d080307dc4 ffff82d08028b1cd
> (XEN)    ffff83000008bf00 0000000000000000 0000000000000001 ffff83023f5a5000
> (XEN)    ffff82d080307f08 ffff82d0802bf0c9 0000000000000000 0000000000000000
> (XEN)    0000000000000000 ffff82d080307f18 ffff83000008bee0 0000000000000001
> (XEN)    0000000000000001 0000000000000001 0000000000000000 0000000000100000
> (XEN)    0000000000000001 0000000000247000 ffff83000008bef4 0000000000100000
> (XEN)    ffff830100000000 0000000000247001 0000000000000014 0000000100000000
> (XEN)    ffff8300ffffffec ffff83000008bef0 ffff82d0802e0640 ffff83000008bfb0
> (XEN)    0000000000000000 0000000000000000 0000000000000111 0000000800000000
> (XEN)    000000010000006e 0000000000000003 00000000000002f8 0000000000000000
> (XEN)    00000000ad5c0bd0 0000000000000000 0000000000000001 0000000000000008
> (XEN)    0000000000000000 ffff82d080100073 0000000000000000 0000000000000000
> (XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
> (XEN) Xen call trace:
> (XEN)    [<ffff82d08022fd47>] hap.c#local_events_need_delivery+0x27/0x40
> (XEN)    [<ffff82d08022fc47>] hap_set_allocation+0x107/0x130
> (XEN)    [<ffff82d08020008c>] paging_set_allocation+0x4c/0x80
> (XEN)    [<ffff82d0802cad30>] domain_build.c#hvm_setup_p2m+0x70/0x1a0
> (XEN)    [<ffff82d0802c91e0>] domain_build.c#construct_dom0_hvm+0x60/0x120
> (XEN)    [<ffff82d0802bf0c9>] __start_xen+0x1ea9/0x23a0
> (XEN)    [<ffff82d080100073>] __high_start+0x53/0x60
> (XEN)
> (XEN) Pagetable walk from 0000000000000000:

Sadly you don't make clear what pointer it is that is NULL at that point.

> I've tried setting current to d->vcpu[0], but that just makes the call to 
> hypercall_preempt_check crash in some scheduler assert. In any case, I've 
> added the preempt parameter to the paging_set_allocation function, but I 
> don't plan to use it in the domain builder for the time being. Does that 
> sound right?

Not really, new huge latency issues like this shouldn't be reintroduced;
we've been fighting hard to get rid of those (and we still occasionally
find some no-one had noticed before).

Jan
George Dunlap Aug. 3, 2016, 3:11 p.m. UTC | #6
On Tue, Aug 2, 2016 at 5:12 PM, Jan Beulich <JBeulich@suse.com> wrote:
>>>> On 02.08.16 at 17:49, <roger.pau@citrix.com> wrote:
>> On Tue, Aug 02, 2016 at 11:47:22AM +0200, Roger Pau Monne wrote:
>>> On Fri, Jul 29, 2016 at 05:47:24PM +0100, Andrew Cooper wrote:
>>> > As this is for the construction of dom0, it would be better to take a
>>> > preemptible pointer, loop in construct_dom0(), with a
>>> > process_pending_softirqs() call in between.
>>>
>>> Now fixed.
>>
>> Hm, I have to stand corrected, using hypercall_preempt_check (as
>> any of the *_set_allocation function use), is not safe at this point:
>>
>> (XEN) ----[ Xen-4.8-unstable  x86_64  debug=y  Tainted:    C  ]----
>> (XEN) CPU:    0
>> (XEN) RIP:    e008:[<ffff82d08022fd47>] hap.c#local_events_need_delivery+0x27/0x40
>> (XEN) RFLAGS: 0000000000010246   CONTEXT: hypervisor
>> (XEN) rax: 0000000000000000   rbx: ffff83023f5a5000   rcx: ffff82d080312900
>> (XEN) rdx: 0000000000000001   rsi: ffff83023f5a56c8   rdi: ffff8300b213d000
>> (XEN) rbp: ffff82d080307cc8   rsp: ffff82d080307cc8   r8:  0180000000000000
>> (XEN) r9:  0000000000000000   r10: 0000000000247000   r11: ffff82d08029a5b0
>> (XEN) r12: 0000000000000011   r13: 00000000000023ac   r14: ffff82d080307d4c
>> (XEN) r15: ffff83023f5a56c8   cr0: 000000008005003b   cr4: 00000000001526e0
>> (XEN) cr3: 00000000b20fc000   cr2: 0000000000000000
>> (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
>> (XEN) Xen code around <ffff82d08022fd47> (hap.c#local_events_need_delivery+0x27/0x40):
>> (XEN)  0d ad fa ff 48 8b 47 08 <80> 38 00 74 09 80 78 01 00 0f 94 c0 eb 02 31 c0
>> (XEN) Xen stack trace from rsp=ffff82d080307cc8:
>> (XEN)    ffff82d080307d08 ffff82d08022fc47 0000000000000000 ffff83023f5a5000
>> (XEN)    ffff83023f5a5648 0000000000000000 ffff82d080307d4c 0000000000002400
>> (XEN)    ffff82d080307d38 ffff82d08020008c 00000000000ffffd ffff8300b1efd000
>> (XEN)    ffff83023f5a5000 ffff82d080307d4c ffff82d080307d78 ffff82d0802cad30
>> (XEN)    0000000000203000 ffff83023f5a5000 ffff82d0802bf860 0000000000000000
>> (XEN)    0000000000000001 ffff83000008bef0 ffff82d080307de8 ffff82d0802c91e0
>> (XEN)    ffff82d080307de8 ffff82d080143900 ffff82d080307de8 0000000000000000
>> (XEN)    ffff83000008bf00 ffff82d0802eb480 ffff82d080307dc4 ffff82d08028b1cd
>> (XEN)    ffff83000008bf00 0000000000000000 0000000000000001 ffff83023f5a5000
>> (XEN)    ffff82d080307f08 ffff82d0802bf0c9 0000000000000000 0000000000000000
>> (XEN)    0000000000000000 ffff82d080307f18 ffff83000008bee0 0000000000000001
>> (XEN)    0000000000000001 0000000000000001 0000000000000000 0000000000100000
>> (XEN)    0000000000000001 0000000000247000 ffff83000008bef4 0000000000100000
>> (XEN)    ffff830100000000 0000000000247001 0000000000000014 0000000100000000
>> (XEN)    ffff8300ffffffec ffff83000008bef0 ffff82d0802e0640 ffff83000008bfb0
>> (XEN)    0000000000000000 0000000000000000 0000000000000111 0000000800000000
>> (XEN)    000000010000006e 0000000000000003 00000000000002f8 0000000000000000
>> (XEN)    00000000ad5c0bd0 0000000000000000 0000000000000001 0000000000000008
>> (XEN)    0000000000000000 ffff82d080100073 0000000000000000 0000000000000000
>> (XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
>> (XEN) Xen call trace:
>> (XEN)    [<ffff82d08022fd47>] hap.c#local_events_need_delivery+0x27/0x40
>> (XEN)    [<ffff82d08022fc47>] hap_set_allocation+0x107/0x130
>> (XEN)    [<ffff82d08020008c>] paging_set_allocation+0x4c/0x80
>> (XEN)    [<ffff82d0802cad30>] domain_build.c#hvm_setup_p2m+0x70/0x1a0
>> (XEN)    [<ffff82d0802c91e0>] domain_build.c#construct_dom0_hvm+0x60/0x120
>> (XEN)    [<ffff82d0802bf0c9>] __start_xen+0x1ea9/0x23a0
>> (XEN)    [<ffff82d080100073>] __high_start+0x53/0x60
>> (XEN)
>> (XEN) Pagetable walk from 0000000000000000:
>
> Sadly you don't make clear what pointer it is that is NULL at that point.

It sounds from what he says in the following paragraph like current is NULL.

>> I've tried setting current to d->vcpu[0], but that just makes the call to
>> hypercall_preempt_check crash in some scheduler assert. In any case, I've
>> added the preempt parameter to the paging_set_allocation function, but I
>> don't plan to use it in the domain builder for the time being. Does that
>> sound right?
>
> Not really, new huge latency issues like this shouldn't be reintroduced;
> we've been fighting hard to get rid of those (and we still occasionally
> find some no-one had noticed before).

You mean latency in processing softirqs?

Maybe what we need to do is to make local_events_need_delivery() safe
to call at this point by having it return 0 if current is NULL rather
than crashing?

 -George
Jan Beulich Aug. 3, 2016, 3:25 p.m. UTC | #7
>>> On 03.08.16 at 17:11, <George.Dunlap@eu.citrix.com> wrote:
> On Tue, Aug 2, 2016 at 5:12 PM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>> On 02.08.16 at 17:49, <roger.pau@citrix.com> wrote:
>>> On Tue, Aug 02, 2016 at 11:47:22AM +0200, Roger Pau Monne wrote:
>>>> On Fri, Jul 29, 2016 at 05:47:24PM +0100, Andrew Cooper wrote:
>>>> > As this is for the construction of dom0, it would be better to take a
>>>> > preemptible pointer, loop in construct_dom0(), with a
>>>> > process_pending_softirqs() call in between.
>>>>
>>>> Now fixed.
>>>
>>> Hm, I have to stand corrected, using hypercall_preempt_check (as
>>> any of the *_set_allocation function use), is not safe at this point:
>>>
>>> (XEN) ----[ Xen-4.8-unstable  x86_64  debug=y  Tainted:    C  ]----
>>> (XEN) CPU:    0
>>> (XEN) RIP:    e008:[<ffff82d08022fd47>] 
> hap.c#local_events_need_delivery+0x27/0x40
>>> (XEN) RFLAGS: 0000000000010246   CONTEXT: hypervisor
>>> (XEN) rax: 0000000000000000   rbx: ffff83023f5a5000   rcx: ffff82d080312900
>>> (XEN) rdx: 0000000000000001   rsi: ffff83023f5a56c8   rdi: ffff8300b213d000
>>> (XEN) rbp: ffff82d080307cc8   rsp: ffff82d080307cc8   r8:  0180000000000000
>>> (XEN) r9:  0000000000000000   r10: 0000000000247000   r11: ffff82d08029a5b0
>>> (XEN) r12: 0000000000000011   r13: 00000000000023ac   r14: ffff82d080307d4c
>>> (XEN) r15: ffff83023f5a56c8   cr0: 000000008005003b   cr4: 00000000001526e0
>>> (XEN) cr3: 00000000b20fc000   cr2: 0000000000000000
>>> (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
>>> (XEN) Xen code around <ffff82d08022fd47> 
> (hap.c#local_events_need_delivery+0x27/0x40):
>>> (XEN)  0d ad fa ff 48 8b 47 08 <80> 38 00 74 09 80 78 01 00 0f 94 c0 eb 02 31 
> c0
>>> (XEN) Xen stack trace from rsp=ffff82d080307cc8:
>>> (XEN)    ffff82d080307d08 ffff82d08022fc47 0000000000000000 ffff83023f5a5000
>>> (XEN)    ffff83023f5a5648 0000000000000000 ffff82d080307d4c 0000000000002400
>>> (XEN)    ffff82d080307d38 ffff82d08020008c 00000000000ffffd ffff8300b1efd000
>>> (XEN)    ffff83023f5a5000 ffff82d080307d4c ffff82d080307d78 ffff82d0802cad30
>>> (XEN)    0000000000203000 ffff83023f5a5000 ffff82d0802bf860 0000000000000000
>>> (XEN)    0000000000000001 ffff83000008bef0 ffff82d080307de8 ffff82d0802c91e0
>>> (XEN)    ffff82d080307de8 ffff82d080143900 ffff82d080307de8 0000000000000000
>>> (XEN)    ffff83000008bf00 ffff82d0802eb480 ffff82d080307dc4 ffff82d08028b1cd
>>> (XEN)    ffff83000008bf00 0000000000000000 0000000000000001 ffff83023f5a5000
>>> (XEN)    ffff82d080307f08 ffff82d0802bf0c9 0000000000000000 0000000000000000
>>> (XEN)    0000000000000000 ffff82d080307f18 ffff83000008bee0 0000000000000001
>>> (XEN)    0000000000000001 0000000000000001 0000000000000000 0000000000100000
>>> (XEN)    0000000000000001 0000000000247000 ffff83000008bef4 0000000000100000
>>> (XEN)    ffff830100000000 0000000000247001 0000000000000014 0000000100000000
>>> (XEN)    ffff8300ffffffec ffff83000008bef0 ffff82d0802e0640 ffff83000008bfb0
>>> (XEN)    0000000000000000 0000000000000000 0000000000000111 0000000800000000
>>> (XEN)    000000010000006e 0000000000000003 00000000000002f8 0000000000000000
>>> (XEN)    00000000ad5c0bd0 0000000000000000 0000000000000001 0000000000000008
>>> (XEN)    0000000000000000 ffff82d080100073 0000000000000000 0000000000000000
>>> (XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
>>> (XEN) Xen call trace:
>>> (XEN)    [<ffff82d08022fd47>] hap.c#local_events_need_delivery+0x27/0x40
>>> (XEN)    [<ffff82d08022fc47>] hap_set_allocation+0x107/0x130
>>> (XEN)    [<ffff82d08020008c>] paging_set_allocation+0x4c/0x80
>>> (XEN)    [<ffff82d0802cad30>] domain_build.c#hvm_setup_p2m+0x70/0x1a0
>>> (XEN)    [<ffff82d0802c91e0>] domain_build.c#construct_dom0_hvm+0x60/0x120
>>> (XEN)    [<ffff82d0802bf0c9>] __start_xen+0x1ea9/0x23a0
>>> (XEN)    [<ffff82d080100073>] __high_start+0x53/0x60
>>> (XEN)
>>> (XEN) Pagetable walk from 0000000000000000:
>>
>> Sadly you don't make clear what pointer it is that is NULL at that point.
> 
> It sounds from what he says in the following paragraph like current is NULL.

I don't recall us re-setting current to this late in the boot process.
Even during early boot we set it to a bogus non-NULL value rather
than NULL.

>>> I've tried setting current to d->vcpu[0], but that just makes the call to
>>> hypercall_preempt_check crash in some scheduler assert. In any case, I've
>>> added the preempt parameter to the paging_set_allocation function, but I
>>> don't plan to use it in the domain builder for the time being. Does that
>>> sound right?
>>
>> Not really, new huge latency issues like this shouldn't be reintroduced;
>> we've been fighting hard to get rid of those (and we still occasionally
>> find some no-one had noticed before).
> 
> You mean latency in processing softirqs?
> 
> Maybe what we need to do is to make local_events_need_delivery() safe
> to call at this point by having it return 0 if current is NULL rather
> than crashing?

That would have the same effect - no softirq processing, and hence
possible time issues on huge systems.

Jan
George Dunlap Aug. 3, 2016, 3:28 p.m. UTC | #8
On 03/08/16 16:25, Jan Beulich wrote:
>>>> On 03.08.16 at 17:11, <George.Dunlap@eu.citrix.com> wrote:
>> On Tue, Aug 2, 2016 at 5:12 PM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>> On 02.08.16 at 17:49, <roger.pau@citrix.com> wrote:
>>>> On Tue, Aug 02, 2016 at 11:47:22AM +0200, Roger Pau Monne wrote:
>>>>> On Fri, Jul 29, 2016 at 05:47:24PM +0100, Andrew Cooper wrote:
>>>>>> As this is for the construction of dom0, it would be better to take a
>>>>>> preemptible pointer, loop in construct_dom0(), with a
>>>>>> process_pending_softirqs() call in between.
>>>>>
>>>>> Now fixed.
>>>>
>>>> Hm, I have to stand corrected, using hypercall_preempt_check (as
>>>> any of the *_set_allocation function use), is not safe at this point:
>>>>
>>>> (XEN) ----[ Xen-4.8-unstable  x86_64  debug=y  Tainted:    C  ]----
>>>> (XEN) CPU:    0
>>>> (XEN) RIP:    e008:[<ffff82d08022fd47>] 
>> hap.c#local_events_need_delivery+0x27/0x40
>>>> (XEN) RFLAGS: 0000000000010246   CONTEXT: hypervisor
>>>> (XEN) rax: 0000000000000000   rbx: ffff83023f5a5000   rcx: ffff82d080312900
>>>> (XEN) rdx: 0000000000000001   rsi: ffff83023f5a56c8   rdi: ffff8300b213d000
>>>> (XEN) rbp: ffff82d080307cc8   rsp: ffff82d080307cc8   r8:  0180000000000000
>>>> (XEN) r9:  0000000000000000   r10: 0000000000247000   r11: ffff82d08029a5b0
>>>> (XEN) r12: 0000000000000011   r13: 00000000000023ac   r14: ffff82d080307d4c
>>>> (XEN) r15: ffff83023f5a56c8   cr0: 000000008005003b   cr4: 00000000001526e0
>>>> (XEN) cr3: 00000000b20fc000   cr2: 0000000000000000
>>>> (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
>>>> (XEN) Xen code around <ffff82d08022fd47> 
>> (hap.c#local_events_need_delivery+0x27/0x40):
>>>> (XEN)  0d ad fa ff 48 8b 47 08 <80> 38 00 74 09 80 78 01 00 0f 94 c0 eb 02 31 
>> c0
>>>> (XEN) Xen stack trace from rsp=ffff82d080307cc8:
>>>> (XEN)    ffff82d080307d08 ffff82d08022fc47 0000000000000000 ffff83023f5a5000
>>>> (XEN)    ffff83023f5a5648 0000000000000000 ffff82d080307d4c 0000000000002400
>>>> (XEN)    ffff82d080307d38 ffff82d08020008c 00000000000ffffd ffff8300b1efd000
>>>> (XEN)    ffff83023f5a5000 ffff82d080307d4c ffff82d080307d78 ffff82d0802cad30
>>>> (XEN)    0000000000203000 ffff83023f5a5000 ffff82d0802bf860 0000000000000000
>>>> (XEN)    0000000000000001 ffff83000008bef0 ffff82d080307de8 ffff82d0802c91e0
>>>> (XEN)    ffff82d080307de8 ffff82d080143900 ffff82d080307de8 0000000000000000
>>>> (XEN)    ffff83000008bf00 ffff82d0802eb480 ffff82d080307dc4 ffff82d08028b1cd
>>>> (XEN)    ffff83000008bf00 0000000000000000 0000000000000001 ffff83023f5a5000
>>>> (XEN)    ffff82d080307f08 ffff82d0802bf0c9 0000000000000000 0000000000000000
>>>> (XEN)    0000000000000000 ffff82d080307f18 ffff83000008bee0 0000000000000001
>>>> (XEN)    0000000000000001 0000000000000001 0000000000000000 0000000000100000
>>>> (XEN)    0000000000000001 0000000000247000 ffff83000008bef4 0000000000100000
>>>> (XEN)    ffff830100000000 0000000000247001 0000000000000014 0000000100000000
>>>> (XEN)    ffff8300ffffffec ffff83000008bef0 ffff82d0802e0640 ffff83000008bfb0
>>>> (XEN)    0000000000000000 0000000000000000 0000000000000111 0000000800000000
>>>> (XEN)    000000010000006e 0000000000000003 00000000000002f8 0000000000000000
>>>> (XEN)    00000000ad5c0bd0 0000000000000000 0000000000000001 0000000000000008
>>>> (XEN)    0000000000000000 ffff82d080100073 0000000000000000 0000000000000000
>>>> (XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
>>>> (XEN) Xen call trace:
>>>> (XEN)    [<ffff82d08022fd47>] hap.c#local_events_need_delivery+0x27/0x40
>>>> (XEN)    [<ffff82d08022fc47>] hap_set_allocation+0x107/0x130
>>>> (XEN)    [<ffff82d08020008c>] paging_set_allocation+0x4c/0x80
>>>> (XEN)    [<ffff82d0802cad30>] domain_build.c#hvm_setup_p2m+0x70/0x1a0
>>>> (XEN)    [<ffff82d0802c91e0>] domain_build.c#construct_dom0_hvm+0x60/0x120
>>>> (XEN)    [<ffff82d0802bf0c9>] __start_xen+0x1ea9/0x23a0
>>>> (XEN)    [<ffff82d080100073>] __high_start+0x53/0x60
>>>> (XEN)
>>>> (XEN) Pagetable walk from 0000000000000000:
>>>
>>> Sadly you don't make clear what pointer it is that is NULL at that point.
>>
>> It sounds from what he says in the following paragraph like current is NULL.
> 
> I don't recall us re-setting current to this late in the boot process.
> Even during early boot we set it to a bogus non-NULL value rather
> than NULL.
> 
>>>> I've tried setting current to d->vcpu[0], but that just makes the call to
>>>> hypercall_preempt_check crash in some scheduler assert. In any case, I've
>>>> added the preempt parameter to the paging_set_allocation function, but I
>>>> don't plan to use it in the domain builder for the time being. Does that
>>>> sound right?
>>>
>>> Not really, new huge latency issues like this shouldn't be reintroduced;
>>> we've been fighting hard to get rid of those (and we still occasionally
>>> find some no-one had noticed before).
>>
>> You mean latency in processing softirqs?
>>
>> Maybe what we need to do is to make local_events_need_delivery() safe
>> to call at this point by having it return 0 if current is NULL rather
>> than crashing?
> 
> That would have the same effect - no softirq processing, and hence
> possible time issues on huge systems.

No, local_events_delivery() only checks to see if the current vcpu has
outstanding virtual interrupts.  The other half of
hypercall_preempt_check() checks for softirqs, which doesn't appear to
rely on having current available.

 -George

> 
> Jan
>
Jan Beulich Aug. 3, 2016, 3:37 p.m. UTC | #9
>>> On 03.08.16 at 17:28, <george.dunlap@citrix.com> wrote:
> On 03/08/16 16:25, Jan Beulich wrote:
>>>>> On 03.08.16 at 17:11, <George.Dunlap@eu.citrix.com> wrote:
>>> On Tue, Aug 2, 2016 at 5:12 PM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>>> On 02.08.16 at 17:49, <roger.pau@citrix.com> wrote:
>>>>> On Tue, Aug 02, 2016 at 11:47:22AM +0200, Roger Pau Monne wrote:
>>>>>> On Fri, Jul 29, 2016 at 05:47:24PM +0100, Andrew Cooper wrote:
>>>>>>> As this is for the construction of dom0, it would be better to take a
>>>>>>> preemptible pointer, loop in construct_dom0(), with a
>>>>>>> process_pending_softirqs() call in between.
>>>>>>
>>>>>> Now fixed.
>>>>>
>>>>> Hm, I have to stand corrected, using hypercall_preempt_check (as
>>>>> any of the *_set_allocation function use), is not safe at this point:
>>>>>
>>>>> (XEN) ----[ Xen-4.8-unstable  x86_64  debug=y  Tainted:    C  ]----
>>>>> (XEN) CPU:    0
>>>>> (XEN) RIP:    e008:[<ffff82d08022fd47>] 
>>> hap.c#local_events_need_delivery+0x27/0x40
>>>>> (XEN) RFLAGS: 0000000000010246   CONTEXT: hypervisor
>>>>> (XEN) rax: 0000000000000000   rbx: ffff83023f5a5000   rcx: ffff82d080312900
>>>>> (XEN) rdx: 0000000000000001   rsi: ffff83023f5a56c8   rdi: ffff8300b213d000
>>>>> (XEN) rbp: ffff82d080307cc8   rsp: ffff82d080307cc8   r8:  0180000000000000
>>>>> (XEN) r9:  0000000000000000   r10: 0000000000247000   r11: ffff82d08029a5b0
>>>>> (XEN) r12: 0000000000000011   r13: 00000000000023ac   r14: ffff82d080307d4c
>>>>> (XEN) r15: ffff83023f5a56c8   cr0: 000000008005003b   cr4: 00000000001526e0
>>>>> (XEN) cr3: 00000000b20fc000   cr2: 0000000000000000
>>>>> (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
>>>>> (XEN) Xen code around <ffff82d08022fd47> 
>>> (hap.c#local_events_need_delivery+0x27/0x40):
>>>>> (XEN)  0d ad fa ff 48 8b 47 08 <80> 38 00 74 09 80 78 01 00 0f 94 c0 eb 02 31 
>>> c0
>>>>> (XEN) Xen stack trace from rsp=ffff82d080307cc8:
>>>>> (XEN)    ffff82d080307d08 ffff82d08022fc47 0000000000000000 ffff83023f5a5000
>>>>> (XEN)    ffff83023f5a5648 0000000000000000 ffff82d080307d4c 0000000000002400
>>>>> (XEN)    ffff82d080307d38 ffff82d08020008c 00000000000ffffd ffff8300b1efd000
>>>>> (XEN)    ffff83023f5a5000 ffff82d080307d4c ffff82d080307d78 ffff82d0802cad30
>>>>> (XEN)    0000000000203000 ffff83023f5a5000 ffff82d0802bf860 0000000000000000
>>>>> (XEN)    0000000000000001 ffff83000008bef0 ffff82d080307de8 ffff82d0802c91e0
>>>>> (XEN)    ffff82d080307de8 ffff82d080143900 ffff82d080307de8 0000000000000000
>>>>> (XEN)    ffff83000008bf00 ffff82d0802eb480 ffff82d080307dc4 ffff82d08028b1cd
>>>>> (XEN)    ffff83000008bf00 0000000000000000 0000000000000001 ffff83023f5a5000
>>>>> (XEN)    ffff82d080307f08 ffff82d0802bf0c9 0000000000000000 0000000000000000
>>>>> (XEN)    0000000000000000 ffff82d080307f18 ffff83000008bee0 0000000000000001
>>>>> (XEN)    0000000000000001 0000000000000001 0000000000000000 0000000000100000
>>>>> (XEN)    0000000000000001 0000000000247000 ffff83000008bef4 0000000000100000
>>>>> (XEN)    ffff830100000000 0000000000247001 0000000000000014 0000000100000000
>>>>> (XEN)    ffff8300ffffffec ffff83000008bef0 ffff82d0802e0640 ffff83000008bfb0
>>>>> (XEN)    0000000000000000 0000000000000000 0000000000000111 0000000800000000
>>>>> (XEN)    000000010000006e 0000000000000003 00000000000002f8 0000000000000000
>>>>> (XEN)    00000000ad5c0bd0 0000000000000000 0000000000000001 0000000000000008
>>>>> (XEN)    0000000000000000 ffff82d080100073 0000000000000000 0000000000000000
>>>>> (XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
>>>>> (XEN) Xen call trace:
>>>>> (XEN)    [<ffff82d08022fd47>] hap.c#local_events_need_delivery+0x27/0x40
>>>>> (XEN)    [<ffff82d08022fc47>] hap_set_allocation+0x107/0x130
>>>>> (XEN)    [<ffff82d08020008c>] paging_set_allocation+0x4c/0x80
>>>>> (XEN)    [<ffff82d0802cad30>] domain_build.c#hvm_setup_p2m+0x70/0x1a0
>>>>> (XEN)    [<ffff82d0802c91e0>] domain_build.c#construct_dom0_hvm+0x60/0x120
>>>>> (XEN)    [<ffff82d0802bf0c9>] __start_xen+0x1ea9/0x23a0
>>>>> (XEN)    [<ffff82d080100073>] __high_start+0x53/0x60
>>>>> (XEN)
>>>>> (XEN) Pagetable walk from 0000000000000000:
>>>>
>>>> Sadly you don't make clear what pointer it is that is NULL at that point.
>>>
>>> It sounds from what he says in the following paragraph like current is NULL.
>> 
>> I don't recall us re-setting current to this late in the boot process.
>> Even during early boot we set it to a bogus non-NULL value rather
>> than NULL.
>> 
>>>>> I've tried setting current to d->vcpu[0], but that just makes the call to
>>>>> hypercall_preempt_check crash in some scheduler assert. In any case, I've
>>>>> added the preempt parameter to the paging_set_allocation function, but I
>>>>> don't plan to use it in the domain builder for the time being. Does that
>>>>> sound right?
>>>>
>>>> Not really, new huge latency issues like this shouldn't be reintroduced;
>>>> we've been fighting hard to get rid of those (and we still occasionally
>>>> find some no-one had noticed before).
>>>
>>> You mean latency in processing softirqs?
>>>
>>> Maybe what we need to do is to make local_events_need_delivery() safe
>>> to call at this point by having it return 0 if current is NULL rather
>>> than crashing?
>> 
>> That would have the same effect - no softirq processing, and hence
>> possible time issues on huge systems.
> 
> No, local_events_delivery() only checks to see if the current vcpu has
> outstanding virtual interrupts.  The other half of
> hypercall_preempt_check() checks for softirqs, which doesn't appear to
> rely on having current available.

Good point, but
- current should nevertheless not be NULL (afaict at least),
- hypercall_preempt_check() is probably the wrong construct,
  as we're no in a hypercall.
The latter of course could be addressed by, as you did suggest,
some refinement to one of the pieces it's being made up from,
but I'm not sure that would be better than perhaps making its
invocation conditional (with some better alternative in the "else"
case) in hap_set_allocation(). Not the least because any
adjustment to hypercall_preempt_check() itself would affect all
other of its users.

Jan
George Dunlap Aug. 3, 2016, 3:59 p.m. UTC | #10
On Wed, Aug 3, 2016 at 4:37 PM, Jan Beulich <JBeulich@suse.com> wrote:
>>>> On 03.08.16 at 17:28, <george.dunlap@citrix.com> wrote:
>> On 03/08/16 16:25, Jan Beulich wrote:
>>>>>> On 03.08.16 at 17:11, <George.Dunlap@eu.citrix.com> wrote:
>>>> On Tue, Aug 2, 2016 at 5:12 PM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>>>> On 02.08.16 at 17:49, <roger.pau@citrix.com> wrote:
>>>>>> On Tue, Aug 02, 2016 at 11:47:22AM +0200, Roger Pau Monne wrote:
>>>>>>> On Fri, Jul 29, 2016 at 05:47:24PM +0100, Andrew Cooper wrote:
>>>>>>>> As this is for the construction of dom0, it would be better to take a
>>>>>>>> preemptible pointer, loop in construct_dom0(), with a
>>>>>>>> process_pending_softirqs() call in between.
>>>>>>>
>>>>>>> Now fixed.
>>>>>>
>>>>>> Hm, I have to stand corrected, using hypercall_preempt_check (as
>>>>>> any of the *_set_allocation function use), is not safe at this point:
>>>>>>
>>>>>> (XEN) ----[ Xen-4.8-unstable  x86_64  debug=y  Tainted:    C  ]----
>>>>>> (XEN) CPU:    0
>>>>>> (XEN) RIP:    e008:[<ffff82d08022fd47>]
>>>> hap.c#local_events_need_delivery+0x27/0x40
>>>>>> (XEN) RFLAGS: 0000000000010246   CONTEXT: hypervisor
>>>>>> (XEN) rax: 0000000000000000   rbx: ffff83023f5a5000   rcx: ffff82d080312900
>>>>>> (XEN) rdx: 0000000000000001   rsi: ffff83023f5a56c8   rdi: ffff8300b213d000
>>>>>> (XEN) rbp: ffff82d080307cc8   rsp: ffff82d080307cc8   r8:  0180000000000000
>>>>>> (XEN) r9:  0000000000000000   r10: 0000000000247000   r11: ffff82d08029a5b0
>>>>>> (XEN) r12: 0000000000000011   r13: 00000000000023ac   r14: ffff82d080307d4c
>>>>>> (XEN) r15: ffff83023f5a56c8   cr0: 000000008005003b   cr4: 00000000001526e0
>>>>>> (XEN) cr3: 00000000b20fc000   cr2: 0000000000000000
>>>>>> (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
>>>>>> (XEN) Xen code around <ffff82d08022fd47>
>>>> (hap.c#local_events_need_delivery+0x27/0x40):
>>>>>> (XEN)  0d ad fa ff 48 8b 47 08 <80> 38 00 74 09 80 78 01 00 0f 94 c0 eb 02 31
>>>> c0
>>>>>> (XEN) Xen stack trace from rsp=ffff82d080307cc8:
>>>>>> (XEN)    ffff82d080307d08 ffff82d08022fc47 0000000000000000 ffff83023f5a5000
>>>>>> (XEN)    ffff83023f5a5648 0000000000000000 ffff82d080307d4c 0000000000002400
>>>>>> (XEN)    ffff82d080307d38 ffff82d08020008c 00000000000ffffd ffff8300b1efd000
>>>>>> (XEN)    ffff83023f5a5000 ffff82d080307d4c ffff82d080307d78 ffff82d0802cad30
>>>>>> (XEN)    0000000000203000 ffff83023f5a5000 ffff82d0802bf860 0000000000000000
>>>>>> (XEN)    0000000000000001 ffff83000008bef0 ffff82d080307de8 ffff82d0802c91e0
>>>>>> (XEN)    ffff82d080307de8 ffff82d080143900 ffff82d080307de8 0000000000000000
>>>>>> (XEN)    ffff83000008bf00 ffff82d0802eb480 ffff82d080307dc4 ffff82d08028b1cd
>>>>>> (XEN)    ffff83000008bf00 0000000000000000 0000000000000001 ffff83023f5a5000
>>>>>> (XEN)    ffff82d080307f08 ffff82d0802bf0c9 0000000000000000 0000000000000000
>>>>>> (XEN)    0000000000000000 ffff82d080307f18 ffff83000008bee0 0000000000000001
>>>>>> (XEN)    0000000000000001 0000000000000001 0000000000000000 0000000000100000
>>>>>> (XEN)    0000000000000001 0000000000247000 ffff83000008bef4 0000000000100000
>>>>>> (XEN)    ffff830100000000 0000000000247001 0000000000000014 0000000100000000
>>>>>> (XEN)    ffff8300ffffffec ffff83000008bef0 ffff82d0802e0640 ffff83000008bfb0
>>>>>> (XEN)    0000000000000000 0000000000000000 0000000000000111 0000000800000000
>>>>>> (XEN)    000000010000006e 0000000000000003 00000000000002f8 0000000000000000
>>>>>> (XEN)    00000000ad5c0bd0 0000000000000000 0000000000000001 0000000000000008
>>>>>> (XEN)    0000000000000000 ffff82d080100073 0000000000000000 0000000000000000
>>>>>> (XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
>>>>>> (XEN) Xen call trace:
>>>>>> (XEN)    [<ffff82d08022fd47>] hap.c#local_events_need_delivery+0x27/0x40
>>>>>> (XEN)    [<ffff82d08022fc47>] hap_set_allocation+0x107/0x130
>>>>>> (XEN)    [<ffff82d08020008c>] paging_set_allocation+0x4c/0x80
>>>>>> (XEN)    [<ffff82d0802cad30>] domain_build.c#hvm_setup_p2m+0x70/0x1a0
>>>>>> (XEN)    [<ffff82d0802c91e0>] domain_build.c#construct_dom0_hvm+0x60/0x120
>>>>>> (XEN)    [<ffff82d0802bf0c9>] __start_xen+0x1ea9/0x23a0
>>>>>> (XEN)    [<ffff82d080100073>] __high_start+0x53/0x60
>>>>>> (XEN)
>>>>>> (XEN) Pagetable walk from 0000000000000000:
>>>>>
>>>>> Sadly you don't make clear what pointer it is that is NULL at that point.
>>>>
>>>> It sounds from what he says in the following paragraph like current is NULL.
>>>
>>> I don't recall us re-setting current to this late in the boot process.
>>> Even during early boot we set it to a bogus non-NULL value rather
>>> than NULL.
>>>
>>>>>> I've tried setting current to d->vcpu[0], but that just makes the call to
>>>>>> hypercall_preempt_check crash in some scheduler assert. In any case, I've
>>>>>> added the preempt parameter to the paging_set_allocation function, but I
>>>>>> don't plan to use it in the domain builder for the time being. Does that
>>>>>> sound right?
>>>>>
>>>>> Not really, new huge latency issues like this shouldn't be reintroduced;
>>>>> we've been fighting hard to get rid of those (and we still occasionally
>>>>> find some no-one had noticed before).
>>>>
>>>> You mean latency in processing softirqs?
>>>>
>>>> Maybe what we need to do is to make local_events_need_delivery() safe
>>>> to call at this point by having it return 0 if current is NULL rather
>>>> than crashing?
>>>
>>> That would have the same effect - no softirq processing, and hence
>>> possible time issues on huge systems.
>>
>> No, local_events_delivery() only checks to see if the current vcpu has
>> outstanding virtual interrupts.  The other half of
>> hypercall_preempt_check() checks for softirqs, which doesn't appear to
>> rely on having current available.
>
> Good point, but
> - current should nevertheless not be NULL (afaict at least),

Well then it's likely to be one of the vcpu fields that the idle
domain doesn't have.  I'd be willing to bet that v->vcpu_info is NULL
for the idle domain.

> - hypercall_preempt_check() is probably the wrong construct,
>   as we're no in a hypercall.
> The latter of course could be addressed by, as you did suggest,
> some refinement to one of the pieces it's being made up from,
> but I'm not sure that would be better than perhaps making its
> invocation conditional (with some better alternative in the "else"
> case) in hap_set_allocation(). Not the least because any
> adjustment to hypercall_preempt_check() itself would affect all
> other of its users.

Well at the moment any user that calls it with current == NULL (or
perhaps is_idle_vcpu(current)) will crash.  So the only cost will be
an extra check for all the other callers.

The only other option would be to make {hap,sh}_set_allocation() check
to see if is_idle_vcpu(current), and if so just call
softirq_pending(smp_processor_id()) instead.  Or make *another* macro,
boot_softirq_check_or_hypercall_preempt_check(), which will do the
check.  But both seem a bit excessive compared to just modifying
hypercall_preempt_check() to handle being called during boot.

 -George
diff mbox

Patch

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 0a02d65..d7d4afc 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -35,7 +35,6 @@ 
 #include <asm/setup.h>
 #include <asm/bzimage.h> /* for bzimage_parse */
 #include <asm/io_apic.h>
-#include <asm/hap.h>
 #include <asm/hpet.h>
 
 #include <public/version.h>
@@ -1383,15 +1382,15 @@  int __init construct_dom0(
                          nr_pages);
     }
 
-    if ( is_pvh_domain(d) )
-        hap_set_alloc_for_pvh_dom0(d, dom0_paging_pages(d, nr_pages));
-
     /*
      * We enable paging mode again so guest_physmap_add_page will do the
      * right thing for us.
      */
     d->arch.paging.mode = save_pvh_pg_mode;
 
+    if ( is_pvh_domain(d) )
+        paging_set_allocation(d, dom0_paging_pages(d, nr_pages));
+
     /* Write the phys->machine and machine->phys table entries. */
     for ( pfn = 0; pfn < count; pfn++ )
     {
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 3218fa2..b49b38f 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -334,7 +334,7 @@  hap_get_allocation(struct domain *d)
 
 /* Set the pool of pages to the required number of pages.
  * Returns 0 for success, non-zero for failure. */
-static unsigned int
+unsigned int
 hap_set_allocation(struct domain *d, unsigned int pages, int *preempted)
 {
     struct page_info *pg;
@@ -638,18 +638,6 @@  int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
     }
 }
 
-void __init hap_set_alloc_for_pvh_dom0(struct domain *d,
-                                       unsigned long hap_pages)
-{
-    int rc;
-
-    paging_lock(d);
-    rc = hap_set_allocation(d, hap_pages, NULL);
-    paging_unlock(d);
-
-    BUG_ON(rc);
-}
-
 static const struct paging_mode hap_paging_real_mode;
 static const struct paging_mode hap_paging_protected_mode;
 static const struct paging_mode hap_paging_pae_mode;
diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
index 107fc8c..1b270df 100644
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -953,6 +953,22 @@  void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
         safe_write_pte(p, new);
 }
 
+int paging_set_allocation(struct domain *d, unsigned long pages)
+{
+    int rc;
+
+    ASSERT(paging_mode_enabled(d));
+
+    paging_lock(d);
+    if ( hap_enabled(d) )
+        rc = hap_set_allocation(d, pages, NULL);
+    else
+        rc = sh_set_allocation(d, pages, NULL);
+    paging_unlock(d);
+
+    return rc;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index c22362f..452e22e 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -1604,9 +1604,8 @@  shadow_free_p2m_page(struct domain *d, struct page_info *pg)
  * Input will be rounded up to at least shadow_min_acceptable_pages(),
  * plus space for the p2m table.
  * Returns 0 for success, non-zero for failure. */
-static unsigned int sh_set_allocation(struct domain *d,
-                                      unsigned int pages,
-                                      int *preempted)
+unsigned int sh_set_allocation(struct domain *d, unsigned int pages,
+                               int *preempted)
 {
     struct page_info *sp;
     unsigned int lower_bound;
diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
index c613836..e3c9c98 100644
--- a/xen/include/asm-x86/hap.h
+++ b/xen/include/asm-x86/hap.h
@@ -46,7 +46,8 @@  int   hap_track_dirty_vram(struct domain *d,
                            XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
 
 extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
-void hap_set_alloc_for_pvh_dom0(struct domain *d, unsigned long num_pages);
+unsigned int hap_set_allocation(struct domain *d, unsigned int pages,
+				int *preempted);
 
 #endif /* XEN_HAP_H */
 
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index a1401ab..6598007 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -347,6 +347,9 @@  void pagetable_dying(struct domain *d, paddr_t gpa);
 void paging_dump_domain_info(struct domain *d);
 void paging_dump_vcpu_info(struct vcpu *v);
 
+/* Set pool of pages for paging. */
+int paging_set_allocation(struct domain *d, unsigned long pages);
+
 #endif /* XEN_PAGING_H */
 
 /*
diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
index 6d0aefb..bc068ec 100644
--- a/xen/include/asm-x86/shadow.h
+++ b/xen/include/asm-x86/shadow.h
@@ -83,6 +83,10 @@  void sh_remove_shadows(struct domain *d, mfn_t gmfn, int fast, int all);
 /* Discard _all_ mappings from the domain's shadows. */
 void shadow_blow_tables_per_domain(struct domain *d);
 
+/* Set the pool of shadow pages to the required number of pages. */
+unsigned int sh_set_allocation(struct domain *d, unsigned int pages,
+                               int *preempted);
+
 #else /* !CONFIG_SHADOW_PAGING */
 
 #define shadow_teardown(d, p) ASSERT(is_pv_domain(d))
@@ -91,6 +95,8 @@  void shadow_blow_tables_per_domain(struct domain *d);
     ({ ASSERT(is_pv_domain(d)); -EOPNOTSUPP; })
 #define shadow_track_dirty_vram(d, begin_pfn, nr, bitmap) \
     ({ ASSERT_UNREACHABLE(); -EOPNOTSUPP; })
+#define sh_set_allocation(d, pages, preempted) \
+    ({ ASSERT_UNREACHABLE(); -EOPNOTSUPP; })
 
 static inline void sh_remove_shadows(struct domain *d, mfn_t gmfn,
                                      bool_t fast, bool_t all) {}