[RFC,06/12] xen/x86: populate PVHv2 Dom0 physical memory map

Message ID 1469809747-11176-7-git-send-email-roger.pau@citrix.com (mailing list archive)
State New, archived

Commit Message

Roger Pau Monné July 29, 2016, 4:29 p.m. UTC
Craft the Dom0 e820 memory map and populate it.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
---
 xen/arch/x86/domain_build.c | 199 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 193 insertions(+), 6 deletions(-)

Comments

Andrew Cooper July 29, 2016, 7:04 p.m. UTC | #1
On 29/07/16 17:29, Roger Pau Monne wrote:
> Craft the Dom0 e820 memory map and populate it.
>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
> Cc: Jan Beulich <jbeulich@suse.com>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
>  xen/arch/x86/domain_build.c | 199 ++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 193 insertions(+), 6 deletions(-)
>
> diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
> index c0ef40f..cb8ecbd 100644
> --- a/xen/arch/x86/domain_build.c
> +++ b/xen/arch/x86/domain_build.c
> @@ -43,6 +43,11 @@ static long __initdata dom0_nrpages;
>  static long __initdata dom0_min_nrpages;
>  static long __initdata dom0_max_nrpages = LONG_MAX;
>  
> +/* GFN of the identity map for EPT. */
> +#define HVM_IDENT_PT_GFN  0xfeffeu

The IDENT_PT is only needed on Gen1 VT-x hardware which doesn't support
unrestricted-guest mode.  OTOH, it needs to be matched with VM86_TSS,
which is also needed if hardware doesn't support unrestricted-guest. 
Also, the IDENT_PT can live anywhere in GFN space.  0xfeffe is just
convention for HVM guests as nothing else interesting lives there, but a
PVH dom0 will want to ensure it doesn't alias anything interesting it
might wish to map.

Now that I look at it, there is a substantial WTF.  The domain builder makes
IDENT_PT, but HVMLoader makes VM86_TSS.  I presume this is a historical
side effect of IDENT_PT being a prerequisite to even executing
hvmloader, while VM86_TSS was only a prerequisite to executing the
rombios payload.  Either way, eww :(

I think the VM86_TSS setup needs to move to unconditionally being beside
IDENT_PT setup in the domain builder, and mirrored here in dom0_hvm()
creation.  I don't think it is reasonable to expect an HVMLite payload
to set up its own VM86_TSS if it didn't set up IDENT_PT.  (OTOH, the
chances of an HVMLite guest ever needing to return to real mode is slim,
but in the name of flexibility, it would be nice not to rule it out).
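
For reference, the two knobs which need to end up paired (sketch only; the
gfn/address names here are whatever the builder chooses to reserve):

    d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] = ident_pt_gfn << PAGE_SHIFT;
    d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] = vm86_tss_addr;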

Longterm, it would be nice to disentangle the unrestricted-guest support
from the main vmx code, and make it able to be compiled out.  There is
lots of it, and it is all but dead on modern hardware.
> @@ -591,6 +610,8 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
>          {
>              cur_pages += pages;
>          }
> +        ASSERT((entry_guest->addr & ~PAGE_MASK) == 0 &&
> +               (entry_guest->size & ~PAGE_MASK) == 0);

This would be clearer as ASSERT(IS_ALIGNED(entry_guest->addr, PAGE_SIZE));

(although I suspect the IS_ALIGNED() predicate is newer than your first
iteration of this code.)
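
i.e. (untested), covering both fields:

    ASSERT(IS_ALIGNED(entry_guest->addr, PAGE_SIZE));
    ASSERT(IS_ALIGNED(entry_guest->size, PAGE_SIZE));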

>   next:
>          d->arch.nr_e820++;
>          entry_guest++;
> @@ -1631,7 +1652,7 @@ static int __init construct_dom0_pv(
>          dom0_update_physmap(d, pfn, mfn, 0);
>  
>          pvh_map_all_iomem(d, nr_pages);
> -        pvh_setup_e820(d, nr_pages);
> +        hvm_setup_e820(d, nr_pages);
>      }
>  
>      if ( d->domain_id == hardware_domid )
> @@ -1647,15 +1668,181 @@ out:
>      return rc;
>  }
>  
> +/* Helper to convert from bytes into human-readable form. */
> +static void __init pretty_print_bytes(uint64_t size)
> +{
> +    const char* units[] = {"B", "KB", "MB", "GB", "TB"};

static const char *units[] __initconst

However, this particular array would be far smaller as

static const char units[][3] __initconst = { "B", ... };

as it avoids embedding five 8-byte pointers to get at 14 useful bytes of
information.
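
Untested sketch of the compact form (note the loop bound then wants
ARRAY_SIZE() rather than sizeof(), which counts bytes, not elements):

    static const char units[][3] __initconst = { "B", "KB", "MB", "GB", "TB" };
    unsigned int i = 0;

    while ( ++i < ARRAY_SIZE(units) && size >= 1024 )
        size >>= 10;

    printk("%4" PRIu64 "%2s", size, units[i - 1]);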

> +    int i = 0;
> +
> +    while ( ++i < sizeof(units) && size >= 1024 )
> +        size >>= 10; /* size /= 1024 */
> +
> +    printk("%4" PRIu64 "%2s", size, units[i-1]);

Wouldn't it be better to introduce a new %p format identifier to do
this?  (Linux doesn't appear to have an existing format identifier which
we can 'borrow')

It looks potentially useful elsewhere, and avoids the awkward

printk("Order %2u: ", MAX_ORDER - i);
pretty_print_bytes(((uint64_t)1 << (MAX_ORDER - i + PAGE_SHIFT)) *
                   hvm_mem_stats[MAX_ORDER - i]);
printk("\n");

constructs you have below.
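
e.g. (hypothetical identifier, nothing is wired up for it today):

    printk("Order %2u: %pZ\n", MAX_ORDER - i, &bytes);

with bytes being a uint64_t byte count formatted in human-readable units.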


> +}
> +
> +/* Calculate the biggest usable order given a size in bytes. */
> +static inline unsigned int get_order(uint64_t size)
> +{
> +    unsigned int order;
> +    uint64_t pg;
> +
> +    ASSERT((size & ~PAGE_MASK) == 0);
> +    pg = PFN_DOWN(size);
> +    for ( order = 0; pg >= (1 << (order + 1)); order++ );
> +
> +    return order;
> +}

We already have get_order_from_bytes() and get_order_from_pages(), the
latter of which looks like it will suit your usecase.
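
For reference, the existing byte-based helper rounds up (quoting from
memory, so double-check against your tree):

    static inline unsigned int get_order_from_bytes(paddr_t size)
    {
        unsigned int order;

        size = (size - 1) >> PAGE_SHIFT;
        for ( order = 0; size; order++ )
            size >>= 1;

        return order;
    }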

As a TODO item, I really would like to borrow some of the Linux
infrastructure to calculate orders of constants at compile time, because
a large number of callers of the existing get_order_*() functions are
performing unnecessary runtime calculation.  For those which need
runtime calculation, some careful use of ffs() would be preferable to a
loop.
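
e.g. (untested; flsl() given the 64-bit input):

    /* Largest order wholly contained in size, i.e. rounding down. */
    static inline unsigned int get_order_floor(uint64_t size)
    {
        ASSERT(size >= PAGE_SIZE);
        return flsl(size >> PAGE_SHIFT) - 1;
    }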

> +
> +/* Populate an HVM memory range using the biggest possible order. */
> +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start,
> +                                             uint64_t size)
> +{
> +    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
> +    unsigned int order;
> +    struct page_info *page;
> +    int rc;
> +
> +    ASSERT((size & ~PAGE_MASK) == 0 && (start & ~PAGE_MASK) == 0);
> +
> +    order = MAX_ORDER;
> +    while ( size != 0 )
> +    {
> +        order = min(get_order(size), order);
> +        page = alloc_domheap_pages(d, order, memflags);
> +        if ( page == NULL )
> +        {
> +            if ( order == 0 && memflags )
> +            {
> +                /* Try again without any memflags. */
> +                memflags = 0;
> +                order = MAX_ORDER;
> +                continue;
> +            }
> +            if ( order == 0 )
> +                panic("Unable to allocate memory with order 0!\n");
> +            order--;
> +            continue;
> +        }

It would be far more efficient to try and allocate only 1G and 2M
blocks.  Most of memory is free at this point, and it would allow the
use of HAP superpage mappings, which will be a massive performance boost
if they aren't shattered.
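
Something like this (untested sketch, helper name invented; PAGE_ORDER_* as
used by the p2m code) would keep the allocations superpage-shaped:

    /* Largest order the p2m can still map as a superpage for this chunk. */
    static unsigned int largest_superpage_order(uint64_t size)
    {
        if ( size >= (1UL << (PAGE_ORDER_1G + PAGE_SHIFT)) )
            return PAGE_ORDER_1G;
        if ( size >= (1UL << (PAGE_ORDER_2M + PAGE_SHIFT)) )
            return PAGE_ORDER_2M;
        return PAGE_ORDER_4K;
    }

(start alignment needs the same treatment, of course.)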

> +
> +        hvm_mem_stats[order]++;
> +        rc = guest_physmap_add_page(d, _gfn(PFN_DOWN(start)),
> +                                    _mfn(page_to_mfn(page)), order);
> +        if ( rc != 0 )
> +            panic("Failed to populate memory: [%" PRIx64 " - %" PRIx64 "] %d\n",
> +                  start, start + (((uint64_t)1) << (order + PAGE_SHIFT)), rc);
> +        start += ((uint64_t)1) << (order + PAGE_SHIFT);
> +        size -= ((uint64_t)1) << (order + PAGE_SHIFT);
> +        if ( (size & 0xffffffff) == 0 )
> +            process_pending_softirqs();
> +    }
> +
> +}
> +
> +static int __init hvm_setup_p2m(struct domain *d)
> +{
> +    struct vcpu *v = d->vcpu[0];
> +    unsigned long nr_pages;
> +    int i;
> +
> +    printk("** Preparing memory map **\n");
> +
> +    /*
> +     * Subtract one page for the EPT identity page table and two pages
> +     * for the MADT replacement.
> +     */
> +    nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3;
> +
> +    hvm_setup_e820(d, nr_pages);
> +    paging_set_allocation(d, dom0_paging_pages(d, nr_pages));
> +
> +    printk("Dom0 memory map:\n");
> +    print_e820_memory_map(d->arch.e820, d->arch.nr_e820);
> +
> +    printk("** Populating memory map **\n");
> +    /* Populate memory map. */
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        if ( d->arch.e820[i].type != E820_RAM )
> +            continue;
> +
> +        hvm_populate_memory_range(d, d->arch.e820[i].addr,
> +                                  d->arch.e820[i].size);
> +    }
> +
> +    printk("Memory allocation stats:\n");
> +    for ( i = 0; i <= MAX_ORDER; i++ )
> +    {
> +        if ( hvm_mem_stats[MAX_ORDER - i] != 0 )
> +        {
> +            printk("Order %2u: ", MAX_ORDER - i);
> +            pretty_print_bytes(((uint64_t)1 << (MAX_ORDER - i + PAGE_SHIFT)) *
> +                               hvm_mem_stats[MAX_ORDER - i]);
> +            printk("\n");
> +        }
> +    }
> +
> +    if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
> +    {
> +        struct vcpu *saved_current;
> +        struct page_info *page;
> +        uint32_t *ident_pt;
> +
> +        /*
> +         * Identity-map page table is required for running with CR0.PG=0
> +         * when using Intel EPT. Create a 32-bit non-PAE page directory of
> +         * superpages.
> +         */
> +        page = alloc_domheap_pages(d, 0, 0);
> +        if ( unlikely(!page) )
> +        {
> +            printk("Unable to allocate page for identity map\n");
> +            return -ENOMEM;
> +        }
> +
> +        saved_current = current;
> +        set_current(v);

What is this swizzle of current for?  I presume you hit an assertion
somewhere?  Given how late we are in the boot process, it might be worth
formally context switching to d0v0 earlier.

> +
> +        ident_pt = __map_domain_page(page);
> +        for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
> +            ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
> +                           _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
> +        unmap_domain_page(ident_pt);
> +
> +        guest_physmap_add_page(d, _gfn(HVM_IDENT_PT_GFN),
> +                               _mfn(page_to_mfn(page)), 0);

Please pick an existing gfn and map_domain_gfn() instead.  This will
avoid an unnecessary shattering of hap superpages.

~Andrew
Roger Pau Monné Aug. 2, 2016, 9:19 a.m. UTC | #2
On Fri, Jul 29, 2016 at 08:04:12PM +0100, Andrew Cooper wrote:
> On 29/07/16 17:29, Roger Pau Monne wrote:
> > Craft the Dom0 e820 memory map and populate it.
> >
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > ---
> > Cc: Jan Beulich <jbeulich@suse.com>
> > Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> > ---
> >  xen/arch/x86/domain_build.c | 199 ++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 193 insertions(+), 6 deletions(-)
> >
> > diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
> > index c0ef40f..cb8ecbd 100644
> > --- a/xen/arch/x86/domain_build.c
> > +++ b/xen/arch/x86/domain_build.c
> > @@ -43,6 +43,11 @@ static long __initdata dom0_nrpages;
> >  static long __initdata dom0_min_nrpages;
> >  static long __initdata dom0_max_nrpages = LONG_MAX;
> >  
> > +/* GFN of the identity map for EPT. */
> > +#define HVM_IDENT_PT_GFN  0xfeffeu
> 
> The IDENT_PT is only needed on Gen1 VT-x hardware which doesn't support
> unrestricted-guest mode.  OTOH, it needs to be matched with VM86_TSS,
> which is also needed if hardware doesn't support unrestricted-guest. 
> Also, the IDENT_PT can live anywhere in GFN space.  0xfeffe is just
> convention for HVM guests as nothing else interesting lives there, but a
> PVH dom0 will want to ensure it doesn't alias anything interesting it
> might wish to map.
> 
> Now that I look at it, there is a substantial WTF.  The domain builder makes
> IDENT_PT, but HVMLoader makes VM86_TSS.  I presume this is a historical
> side effect of IDENT_PT being a prerequisite to even executing
> hvmloader, while VM86_TSS was only a prerequisite to executing the
> rombios payload.  Either way, eww :(
> 
> I think the VM86_TSS setup needs to move to unconditionally being beside
> IDENT_PT setup in the domain builder, and mirrored here in dom0_hvm()
> creation.  I don't think it is reasonable to expect an HVMLite payload
> to set up its own VM86_TSS if it didn't set up IDENT_PT.  (OTOH, the
> chances of an HVMLite guest ever needing to return to real mode is slim,
> but in the name of flexibility, it would be nice not to rule it out).
> 
> Longterm, it would be nice to disentangle the unrestricted-guest support
> from the main vmx code, and make it able to be compiled out.  There is
> lots of it, and it is all but dead on modern hardware.

Thanks! I didn't know anything about the VM86 TSS stuff; the fact that the 
identity page tables and the VM86 TSS are set at completely different points 
makes it quite hard to follow :/.

I've now moved the setup of the VM86 TSS inside the domain builder for both 
DomU and Dom0.

> > @@ -591,6 +610,8 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
> >          {
> >              cur_pages += pages;
> >          }
> > +        ASSERT((entry_guest->addr & ~PAGE_MASK) == 0 &&
> > +               (entry_guest->size & ~PAGE_MASK) == 0);
> 
> This would be clearer as ASSERT(IS_ALIGNED(entry_guest->addr, PAGE_SIZE));
> 
> (although I suspect the IS_ALIGNED() predicate is newer than your first
> iteration of this code.)
> 
> >   next:
> >          d->arch.nr_e820++;
> >          entry_guest++;
> > @@ -1631,7 +1652,7 @@ static int __init construct_dom0_pv(
> >          dom0_update_physmap(d, pfn, mfn, 0);
> >  
> >          pvh_map_all_iomem(d, nr_pages);
> > -        pvh_setup_e820(d, nr_pages);
> > +        hvm_setup_e820(d, nr_pages);
> >      }
> >  
> >      if ( d->domain_id == hardware_domid )
> > @@ -1647,15 +1668,181 @@ out:
> >      return rc;
> >  }
> >  
> > +/* Helper to convert from bytes into human-readable form. */
> > +static void __init pretty_print_bytes(uint64_t size)
> > +{
> > +    const char* units[] = {"B", "KB", "MB", "GB", "TB"};
> 
> static const char *units[] __initconst
> 
> However, this particular array would be far smaller as
> 
> static const char units[][3] __initconst = { "B", ... };
> 
> as it avoids embedding five 8-byte pointers to get at 14 useful bytes of
> information.
> 
> > +    int i = 0;
> > +
> > +    while ( ++i < sizeof(units) && size >= 1024 )
> > +        size >>= 10; /* size /= 1024 */
> > +
> > +    printk("%4" PRIu64 "%2s", size, units[i-1]);
> 
> Wouldn't it be better to introduce a new %p format identifier to do
> this?  (Linux doesn't appear to have an existing format identifier which
> we can 'borrow')
> 
> It looks potentially useful elsewhere, and avoids the awkward
> 
> printk("Order %2u: ", MAX_ORDER - i);
> pretty_print_bytes(((uint64_t)1 << (MAX_ORDER - i + PAGE_SHIFT)) *
>                    hvm_mem_stats[MAX_ORDER - i]);
> printk("\n");
> 
> constructs you have below.

Done all the above.
 
> 
> > +}
> > +
> > +/* Calculate the biggest usable order given a size in bytes. */
> > +static inline unsigned int get_order(uint64_t size)
> > +{
> > +    unsigned int order;
> > +    uint64_t pg;
> > +
> > +    ASSERT((size & ~PAGE_MASK) == 0);
> > +    pg = PFN_DOWN(size);
> > +    for ( order = 0; pg >= (1 << (order + 1)); order++ );
> > +
> > +    return order;
> > +}
> 
> We already have get_order_from_bytes() and get_order_from_pages(), the
> latter of which looks like it will suit your usecase.

Not really, or at least they don't do the same as get_order. This function 
calculates the maximum order you can use so that there are no pages left 
over (i.e. if you have a size of 3145728 bytes (3MiB), this function will 
return order 9 (2MiB), while the other ones will return order 10 (4MiB)). I 
don't really understand why other places in the code request bigger orders and 
then free the leftovers; isn't this also causing memory shattering?

> As a TODO item, I really would like to borrow some of the Linux
> infrastructure to calculate orders of constants at compile time, because
> a large number of callers of the existing get_order_*() functions are
> performing unnecessary runtime calculation.  For those which need
> runtime calculation, some careful use of ffs() would be preferable to a
> loop.
> 
> > +
> > +/* Populate an HVM memory range using the biggest possible order. */
> > +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start,
> > +                                             uint64_t size)
> > +{
> > +    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
> > +    unsigned int order;
> > +    struct page_info *page;
> > +    int rc;
> > +
> > +    ASSERT((size & ~PAGE_MASK) == 0 && (start & ~PAGE_MASK) == 0);
> > +
> > +    order = MAX_ORDER;
> > +    while ( size != 0 )
> > +    {
> > +        order = min(get_order(size), order);
> > +        page = alloc_domheap_pages(d, order, memflags);
> > +        if ( page == NULL )
> > +        {
> > +            if ( order == 0 && memflags )
> > +            {
> > +                /* Try again without any memflags. */
> > +                memflags = 0;
> > +                order = MAX_ORDER;
> > +                continue;
> > +            }
> > +            if ( order == 0 )
> > +                panic("Unable to allocate memory with order 0!\n");
> > +            order--;
> > +            continue;
> > +        }
> 
> It would be far more efficient to try and allocate only 1G and 2M
> blocks.  Most of memory is free at this point, and it would allow the
> use of HAP superpage mappings, which will be a massive performance boost
> if they aren't shattered.

That's what I'm trying to do, but we might have to use pages of lower order 
when filling the smaller gaps. As an example, these are the stats when 
building a domain with 6048M of RAM:

(XEN) Memory allocation stats:
(XEN) Order 18: 5GB
(XEN) Order 17: 512MB
(XEN) Order 15: 256MB
(XEN) Order 14: 128MB
(XEN) Order 12: 16MB
(XEN) Order 10: 8MB
(XEN) Order  9: 4MB
(XEN) Order  8: 2MB
(XEN) Order  7: 1MB
(XEN) Order  6: 512KB
(XEN) Order  5: 256KB
(XEN) Order  4: 128KB
(XEN) Order  3: 64KB
(XEN) Order  2: 32KB
(XEN) Order  1: 16KB
(XEN) Order  0: 4KB

IMHO, they are quite good.

> > +
> > +        hvm_mem_stats[order]++;
> > +        rc = guest_physmap_add_page(d, _gfn(PFN_DOWN(start)),
> > +                                    _mfn(page_to_mfn(page)), order);
> > +        if ( rc != 0 )
> > +            panic("Failed to populate memory: [%" PRIx64 " - %" PRIx64 "] %d\n",
> > +                  start, start + (((uint64_t)1) << (order + PAGE_SHIFT)), rc);
> > +        start += ((uint64_t)1) << (order + PAGE_SHIFT);
> > +        size -= ((uint64_t)1) << (order + PAGE_SHIFT);
> > +        if ( (size & 0xffffffff) == 0 )
> > +            process_pending_softirqs();
> > +    }
> > +
> > +}
> > +
> > +static int __init hvm_setup_p2m(struct domain *d)
> > +{
> > +    struct vcpu *v = d->vcpu[0];
> > +    unsigned long nr_pages;
> > +    int i;
> > +
> > +    printk("** Preparing memory map **\n");
> > +
> > +    /*
> > +     * Subtract one page for the EPT identity page table and two pages
> > +     * for the MADT replacement.
> > +     */
> > +    nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3;
> > +
> > +    hvm_setup_e820(d, nr_pages);
> > +    paging_set_allocation(d, dom0_paging_pages(d, nr_pages));
> > +
> > +    printk("Dom0 memory map:\n");
> > +    print_e820_memory_map(d->arch.e820, d->arch.nr_e820);
> > +
> > +    printk("** Populating memory map **\n");
> > +    /* Populate memory map. */
> > +    for ( i = 0; i < d->arch.nr_e820; i++ )
> > +    {
> > +        if ( d->arch.e820[i].type != E820_RAM )
> > +            continue;
> > +
> > +        hvm_populate_memory_range(d, d->arch.e820[i].addr,
> > +                                  d->arch.e820[i].size);
> > +    }
> > +
> > +    printk("Memory allocation stats:\n");
> > +    for ( i = 0; i <= MAX_ORDER; i++ )
> > +    {
> > +        if ( hvm_mem_stats[MAX_ORDER - i] != 0 )
> > +        {
> > +            printk("Order %2u: ", MAX_ORDER - i);
> > +            pretty_print_bytes(((uint64_t)1 << (MAX_ORDER - i + PAGE_SHIFT)) *
> > +                               hvm_mem_stats[MAX_ORDER - i]);
> > +            printk("\n");
> > +        }
> > +    }
> > +
> > +    if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
> > +    {
> > +        struct vcpu *saved_current;
> > +        struct page_info *page;
> > +        uint32_t *ident_pt;
> > +
> > +        /*
> > +         * Identity-map page table is required for running with CR0.PG=0
> > +         * when using Intel EPT. Create a 32-bit non-PAE page directory of
> > +         * superpages.
> > +         */
> > +        page = alloc_domheap_pages(d, 0, 0);
> > +        if ( unlikely(!page) )
> > +        {
> > +            printk("Unable to allocate page for identity map\n");
> > +            return -ENOMEM;
> > +        }
> > +
> > +        saved_current = current;
> > +        set_current(v);
> 
> What is this swizzle of current for?  I presume you hit an assertion
> somewhere?  Given how late we are in the boot process, it might be worth
> formally context switching to d0v0 earlier.

Hm, probably some leftover from a previous iteration, now removed.

> > +
> > +        ident_pt = __map_domain_page(page);
> > +        for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
> > +            ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
> > +                           _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
> > +        unmap_domain_page(ident_pt);
> > +
> > +        guest_physmap_add_page(d, _gfn(HVM_IDENT_PT_GFN),
> > +                               _mfn(page_to_mfn(page)), 0);
> 
> Please pick an existing gfn and map_domain_gfn() instead.  This will
> avoid an unnecessary shattering of hap superpages.

Right, I've picked some of the previously mapped RAM and shrunk the e820.

Roger.
Andrew Cooper Aug. 4, 2016, 6:43 p.m. UTC | #3
On 02/08/16 10:19, Roger Pau Monne wrote:
> On Fri, Jul 29, 2016 at 08:04:12PM +0100, Andrew Cooper wrote:
>> On 29/07/16 17:29, Roger Pau Monne wrote:
>>> Craft the Dom0 e820 memory map and populate it.
>>>
>>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
>>> ---
>>> Cc: Jan Beulich <jbeulich@suse.com>
>>> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
>>> ---
>>>  xen/arch/x86/domain_build.c | 199 ++++++++++++++++++++++++++++++++++++++++++--
>>>  1 file changed, 193 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
>>> index c0ef40f..cb8ecbd 100644
>>> --- a/xen/arch/x86/domain_build.c
>>> +++ b/xen/arch/x86/domain_build.c
>>> @@ -43,6 +43,11 @@ static long __initdata dom0_nrpages;
>>>  static long __initdata dom0_min_nrpages;
>>>  static long __initdata dom0_max_nrpages = LONG_MAX;
>>>  
>>> +/* GFN of the identity map for EPT. */
>>> +#define HVM_IDENT_PT_GFN  0xfeffeu
>> The IDENT_PT is only needed on Gen1 VT-x hardware which doesn't support
>> unrestricted-guest mode.  OTOH, it needs to be matched with VM86_TSS,
>> which is also needed if hardware doesn't support unrestricted-guest. 
>> Also, the IDENT_PT can live anywhere in GFN space.  0xfeffe is just
>> convention for HVM guests as nothing else interesting lives there, but a
>> PVH dom0 will want to ensure it doesn't alias anything interesting it
>> might wish to map.
>>
>> Now that I look at it, there is a substantial WTF.  The domain builder makes
>> IDENT_PT, but HVMLoader makes VM86_TSS.  I presume this is a historical
>> side effect of IDENT_PT being a prerequisite to even executing
>> hvmloader, while VM86_TSS was only a prerequisite to executing the
>> rombios payload.  Either way, eww :(
>>
>> I think the VM86_TSS setup needs to move to unconditionally being beside
>> IDENT_PT setup in the domain builder, and mirrored here in dom0_hvm()
>> creation.  I don't think it is reasonable to expect an HVMLite payload
>> to set up its own VM86_TSS if it didn't set up IDENT_PT.  (OTOH, the
>> chances of an HVMLite guest ever needing to return to real mode is slim,
>> but in the name of flexibility, it would be nice not to rule it out).
>>
>> Longterm, it would be nice to disentangle the unrestricted-guest support
>> from the main vmx code, and make it able to be compiled out.  There is
>> lots of it, and it is all but dead on modern hardware.
> Thanks! I didn't know anything about the VM86 TSS stuff; the fact that the 
> identity page tables and the VM86 TSS are set at completely different points 
> makes it quite hard to follow :/.
>
> I've now moved the setup of the VM86 TSS inside the domain builder for both 
> DomU and Dom0.

Thanks.  (Guess who has recently had to delve back into this code^W swamp.)

>
>>> +}
>>> +
>>> +/* Calculate the biggest usable order given a size in bytes. */
>>> +static inline unsigned int get_order(uint64_t size)
>>> +{
>>> +    unsigned int order;
>>> +    uint64_t pg;
>>> +
>>> +    ASSERT((size & ~PAGE_MASK) == 0);
>>> +    pg = PFN_DOWN(size);
>>> +    for ( order = 0; pg >= (1 << (order + 1)); order++ );
>>> +
>>> +    return order;
>>> +}
>> We already have get_order_from_bytes() and get_order_from_pages(), the
>> latter of which looks like it will suit your usecase.
> Not really, or at least they don't do the same as get_order. This function 
> calculates the maximum order you can use so that there are no pages left 
> over (i.e. if you have a size of 3145728 bytes (3MiB), this function will 
> return order 9 (2MiB), while the other ones will return order 10 (4MiB)). I 
> don't really understand why other places in the code request bigger orders and 
> then free the leftovers; isn't this also causing memory shattering?

Sounds like we want something like get_order_{floor,ceil}() which makes
it obvious which way non-power-of-two sizes get rounded.
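
i.e. for the 3MiB example (sketch, names as proposed above):

    get_order_floor(3U << 20);  /* == 9: 2MiB, no pages left over */
    get_order_ceil(3U << 20);   /* == 10: 4MiB, caller frees the tail */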

>
>> As a TODO item, I really would like to borrow some of the Linux
>> infrastructure to calculate orders of constants at compile time, because
>> a large number of callers of the existing get_order_*() functions are
>> performing unnecessary runtime calculation.  For those which need
>> runtime calculation, some careful use of ffs() would be preferable to a
>> loop.
>>
>>> +
>>> +/* Populate an HVM memory range using the biggest possible order. */
>>> +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start,
>>> +                                             uint64_t size)
>>> +{
>>> +    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
>>> +    unsigned int order;
>>> +    struct page_info *page;
>>> +    int rc;
>>> +
>>> +    ASSERT((size & ~PAGE_MASK) == 0 && (start & ~PAGE_MASK) == 0);
>>> +
>>> +    order = MAX_ORDER;
>>> +    while ( size != 0 )
>>> +    {
>>> +        order = min(get_order(size), order);
>>> +        page = alloc_domheap_pages(d, order, memflags);
>>> +        if ( page == NULL )
>>> +        {
>>> +            if ( order == 0 && memflags )
>>> +            {
>>> +                /* Try again without any memflags. */
>>> +                memflags = 0;
>>> +                order = MAX_ORDER;
>>> +                continue;
>>> +            }
>>> +            if ( order == 0 )
>>> +                panic("Unable to allocate memory with order 0!\n");
>>> +            order--;
>>> +            continue;
>>> +        }
>> It would be far more efficient to try and allocate only 1G and 2M
>> blocks.  Most of memory is free at this point, and it would allow the
>> use of HAP superpage mappings, which will be a massive performance boost
>> if they aren't shattered.
> That's what I'm trying to do, but we might have to use pages of lower order 
> when filling the smaller gaps.

As a general principle, we should try not to have any gaps.  This also
extends to guests using more intelligence when deciding how to mutate
its physmap.

>  As an example, these are the stats when 
> building a domain with 6048M of RAM:
>
> (XEN) Memory allocation stats:
> (XEN) Order 18: 5GB
> (XEN) Order 17: 512MB
> (XEN) Order 15: 256MB
> (XEN) Order 14: 128MB
> (XEN) Order 12: 16MB
> (XEN) Order 10: 8MB
> (XEN) Order  9: 4MB
> (XEN) Order  8: 2MB
> (XEN) Order  7: 1MB
> (XEN) Order  6: 512KB
> (XEN) Order  5: 256KB
> (XEN) Order  4: 128KB
> (XEN) Order  3: 64KB
> (XEN) Order  2: 32KB
> (XEN) Order  1: 16KB
> (XEN) Order  0: 4KB
>
> IMHO, they are quite good.

What are the RAM characteristics of the host?  Do you have any idea what
the hap superpage characteristics are like after the guest has booted?

In a case like this, I think it would be entirely reasonable to round up
to the nearest 2MB, and avoid all of those small page mappings.
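
i.e. something like (untested):

    size = ROUNDUP(size, MB(2));

when sizing each RAM region to be populated.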

~Andrew
Roger Pau Monné Aug. 5, 2016, 9:40 a.m. UTC | #4
On Thu, Aug 04, 2016 at 07:43:39PM +0100, Andrew Cooper wrote:
> On 02/08/16 10:19, Roger Pau Monne wrote:
> > On Fri, Jul 29, 2016 at 08:04:12PM +0100, Andrew Cooper wrote:
> >> On 29/07/16 17:29, Roger Pau Monne wrote:
> >>> +/* Calculate the biggest usable order given a size in bytes. */
> >>> +static inline unsigned int get_order(uint64_t size)
> >>> +{
> >>> +    unsigned int order;
> >>> +    uint64_t pg;
> >>> +
> >>> +    ASSERT((size & ~PAGE_MASK) == 0);
> >>> +    pg = PFN_DOWN(size);
> >>> +    for ( order = 0; pg >= (1 << (order + 1)); order++ );
> >>> +
> >>> +    return order;
> >>> +}
> >> We already have get_order_from_bytes() and get_order_from_pages(), the
> >> latter of which looks like it will suit your usecase.
> > Not really, or at least they don't do the same as get_order. This function 
> > calculates the maximum order you can use so that there are no pages left 
> > over (i.e. if you have a size of 3145728 bytes (3MiB), this function will 
> > return order 9 (2MiB), while the other ones will return order 10 (4MiB)). I 
> > don't really understand why other places in the code request bigger orders and 
> > then free the leftovers; isn't this also causing memory shattering?
> 
> Sounds like we want something like get_order_{floor,ceil}() which makes
> it obvious which way non-power-of-two sizes get rounded.

Right, that makes sense, will rename the current one to ceil, and add the 
floor variant.

> >>> +            if ( order == 0 && memflags )
> >>> +            {
> >>> +                /* Try again without any memflags. */
> >>> +                memflags = 0;
> >>> +                order = MAX_ORDER;
> >>> +                continue;
> >>> +            }
> >>> +            if ( order == 0 )
> >>> +                panic("Unable to allocate memory with order 0!\n");
> >>> +            order--;
> >>> +            continue;
> >>> +        }
> >> It would be far more efficient to try and allocate only 1G and 2M
> >> blocks.  Most of memory is free at this point, and it would allow the
> >> use of HAP superpage mappings, which will be a massive performance boost
> >> if they aren't shattered.
> > That's what I'm trying to do, but we might have to use pages of lower order 
> > when filling the smaller gaps.
> 
> As a general principle, we should try not to have any gaps.  This also
> extends to guests using more intelligence when deciding how to mutate
> its physmap.

Yes, but in this case we are limited by the original e820 from the host.
A DomU (without passthrough) will have all its memory contiguous.
 
> >  As an example, these are the stats when 
> > building a domain with 6048M of RAM:
> >
> > (XEN) Memory allocation stats:
> > (XEN) Order 18: 5GB
> > (XEN) Order 17: 512MB
> > (XEN) Order 15: 256MB
> > (XEN) Order 14: 128MB
> > (XEN) Order 12: 16MB
> > (XEN) Order 10: 8MB
> > (XEN) Order  9: 4MB
> > (XEN) Order  8: 2MB
> > (XEN) Order  7: 1MB
> > (XEN) Order  6: 512KB
> > (XEN) Order  5: 256KB
> > (XEN) Order  4: 128KB
> > (XEN) Order  3: 64KB
> > (XEN) Order  2: 32KB
> > (XEN) Order  1: 16KB
> > (XEN) Order  0: 4KB
> >
> > IMHO, they are quite good.
> 
> What are the RAM characteristics of the host?  Do you have any idea what
> the hap superpage characteristics are like after the guest has booted?

This is the host RAM map:

(XEN)  0000000000000000 - 000000000009c800 (usable)
(XEN)  000000000009c800 - 00000000000a0000 (reserved)
(XEN)  00000000000e0000 - 0000000000100000 (reserved)
(XEN)  0000000000100000 - 00000000ad662000 (usable)
(XEN)  00000000ad662000 - 00000000adb1f000 (reserved)
(XEN)  00000000adb1f000 - 00000000b228b000 (usable)
(XEN)  00000000b228b000 - 00000000b2345000 (reserved)
(XEN)  00000000b2345000 - 00000000b236a000 (ACPI data)
(XEN)  00000000b236a000 - 00000000b2c9a000 (ACPI NVS)
(XEN)  00000000b2c9a000 - 00000000b2fff000 (reserved)
(XEN)  00000000b2fff000 - 00000000b3000000 (usable)
(XEN)  00000000b3800000 - 00000000b8000000 (reserved)
(XEN)  00000000f8000000 - 00000000fc000000 (reserved)
(XEN)  00000000fec00000 - 00000000fec01000 (reserved)
(XEN)  00000000fed00000 - 00000000fed04000 (reserved)
(XEN)  00000000fed1c000 - 00000000fed20000 (reserved)
(XEN)  00000000fee00000 - 00000000fee01000 (reserved)
(XEN)  00000000ff000000 - 0000000100000000 (reserved)
(XEN)  0000000100000000 - 0000000247000000 (usable)

No idea about the HAP superpage characteristics; how can I fetch this 
information? (I know I can dump the guest EPT tables, but that just 
saturates the console).

> In a case like this, I think it would be entirely reasonable to round up
> to the nearest 2MB, and avoid all of those small page mappings.

Hm, but then we would be expanding the RAM region and we should either 
modify the guest e820 to reflect that (which I think is a bad idea, given 
that we might be shadowing MMIO regions), or simply use a 2MB page to cover 
a 4KB hole, in which case we are throwing away memory and the computation of 
the required memory will be off.

Roger.
Andrew Cooper Aug. 11, 2016, 6:28 p.m. UTC | #5
On 05/08/16 10:40, Roger Pau Monne wrote:
> On Thu, Aug 04, 2016 at 07:43:39PM +0100, Andrew Cooper wrote:
>> On 02/08/16 10:19, Roger Pau Monne wrote:
>>> On Fri, Jul 29, 2016 at 08:04:12PM +0100, Andrew Cooper wrote:
>>>> On 29/07/16 17:29, Roger Pau Monne wrote:
>>>>> +/* Calculate the biggest usable order given a size in bytes. */
>>>>> +static inline unsigned int get_order(uint64_t size)
>>>>> +{
>>>>> +    unsigned int order;
>>>>> +    uint64_t pg;
>>>>> +
>>>>> +    ASSERT((size & ~PAGE_MASK) == 0);
>>>>> +    pg = PFN_DOWN(size);
>>>>> +    for ( order = 0; pg >= (1 << (order + 1)); order++ );
>>>>> +
>>>>> +    return order;
>>>>> +}
>>>> We already have get_order_from_bytes() and get_order_from_pages(), the
>>>> latter of which looks like it will suit your usecase.
>>> Not really, or at least they don't do the same as get_order. This function 
>>> calculates the maximum order you can use so that there are no pages left 
>>> over (i.e. if you have a size of 3145728 bytes (3MiB), this function will 
>>> return order 9 (2MiB), while the other ones will return order 10 (4MiB)). I 
>>> don't really understand why other places in the code request bigger orders and 
>>> then free the leftovers; isn't this also causing memory shattering?
>> Sounds like we want something like get_order_{floor,ceil}() which makes
>> it obvious which way non-power-of-two sizes get rounded.
> Right, that makes sense, will rename the current one to ceil, and add the 
> floor variant.
>
>>>>> +            if ( order == 0 && memflags )
>>>>> +            {
>>>>> +                /* Try again without any memflags. */
>>>>> +                memflags = 0;
>>>>> +                order = MAX_ORDER;
>>>>> +                continue;
>>>>> +            }
>>>>> +            if ( order == 0 )
>>>>> +                panic("Unable to allocate memory with order 0!\n");
>>>>> +            order--;
>>>>> +            continue;
>>>>> +        }
>>>> It would be far more efficient to try and allocate only 1G and 2M
>>>> blocks.  Most of memory is free at this point, and it would allow the
>>>> use of HAP superpage mappings, which will be a massive performance boost
>>>> if they aren't shattered.
>>> That's what I'm trying to do, but we might have to use pages of lower order 
>>> when filling the smaller gaps.
>> As a general principle, we should try not to have any gaps.  This also
>> extends to guests using more intelligence when deciding how to mutate
>> its physmap.
> Yes, but in this case we are limited by the original e820 from the host.
> A DomU (without passthrough) will have all its memory contiguous.

Ah yes - that is a legitimate restriction.

>  
>>>  As an example, these are the stats when 
>>> building a domain with 6048M of RAM:
>>>
>>> (XEN) Memory allocation stats:
>>> (XEN) Order 18: 5GB
>>> (XEN) Order 17: 512MB
>>> (XEN) Order 15: 256MB
>>> (XEN) Order 14: 128MB
>>> (XEN) Order 12: 16MB
>>> (XEN) Order 10: 8MB
>>> (XEN) Order  9: 4MB
>>> (XEN) Order  8: 2MB
>>> (XEN) Order  7: 1MB
>>> (XEN) Order  6: 512KB
>>> (XEN) Order  5: 256KB
>>> (XEN) Order  4: 128KB
>>> (XEN) Order  3: 64KB
>>> (XEN) Order  2: 32KB
>>> (XEN) Order  1: 16KB
>>> (XEN) Order  0: 4KB
>>>
>>> IMHO, they are quite good.
>> What are the RAM characteristics of the host?  Do you have any idea what
>> the hap superpage characteristics are like after the guest has booted?
> This is the host RAM map:
>
> (XEN)  0000000000000000 - 000000000009c800 (usable)
> (XEN)  000000000009c800 - 00000000000a0000 (reserved)
> (XEN)  00000000000e0000 - 0000000000100000 (reserved)
> (XEN)  0000000000100000 - 00000000ad662000 (usable)
> (XEN)  00000000ad662000 - 00000000adb1f000 (reserved)
> (XEN)  00000000adb1f000 - 00000000b228b000 (usable)
> (XEN)  00000000b228b000 - 00000000b2345000 (reserved)
> (XEN)  00000000b2345000 - 00000000b236a000 (ACPI data)
> (XEN)  00000000b236a000 - 00000000b2c9a000 (ACPI NVS)
> (XEN)  00000000b2c9a000 - 00000000b2fff000 (reserved)
> (XEN)  00000000b2fff000 - 00000000b3000000 (usable)
> (XEN)  00000000b3800000 - 00000000b8000000 (reserved)
> (XEN)  00000000f8000000 - 00000000fc000000 (reserved)
> (XEN)  00000000fec00000 - 00000000fec01000 (reserved)
> (XEN)  00000000fed00000 - 00000000fed04000 (reserved)
> (XEN)  00000000fed1c000 - 00000000fed20000 (reserved)
> (XEN)  00000000fee00000 - 00000000fee01000 (reserved)
> (XEN)  00000000ff000000 - 0000000100000000 (reserved)
> (XEN)  0000000100000000 - 0000000247000000 (usable)
>
> No idea about the HAP superpage characteristics; how can I fetch this 
> information? (I know I can dump the guest EPT tables, but that just 
> saturates the console).

Not easily, and also not the first time I have run into this problem. 
We really should have at least a debug facility for identifying this.

For now, you can count how many times ept_split_super_page() gets called
at each level.
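
e.g. a throwaway hack along these lines (names approximate, not for commit):

    /* At the top of ept_split_super_page(): */
    static unsigned long split_count[4];

    split_count[level]++;

and dump the array from a keyhandler (or just printk it on guest shutdown).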

~Andrew

Patch

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index c0ef40f..cb8ecbd 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -43,6 +43,11 @@  static long __initdata dom0_nrpages;
 static long __initdata dom0_min_nrpages;
 static long __initdata dom0_max_nrpages = LONG_MAX;
 
+/* GFN of the identity map for EPT. */
+#define HVM_IDENT_PT_GFN  0xfeffeu
+
+static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1];
+
 /*
  * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
  * 
@@ -304,7 +309,8 @@  static unsigned long __init compute_dom0_nr_pages(
             avail -= max_pdx >> s;
     }
 
-    need_paging = opt_dom0_shadow || (is_pvh_domain(d) && !iommu_hap_pt_share);
+    need_paging = opt_dom0_shadow || (has_hvm_container_domain(d) &&
+                  (!iommu_hap_pt_share || !paging_mode_hap(d)));
     for ( ; ; need_paging = 0 )
     {
         nr_pages = dom0_nrpages;
@@ -336,7 +342,8 @@  static unsigned long __init compute_dom0_nr_pages(
         avail -= dom0_paging_pages(d, nr_pages);
     }
 
-    if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
+    if ( is_pv_domain(d) &&
+         (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
          ((dom0_min_nrpages <= 0) || (nr_pages > min_pages)) )
     {
         /*
@@ -547,11 +554,12 @@  static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages)
     ASSERT(nr_holes == 0);
 }
 
-static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
+static __init void hvm_setup_e820(struct domain *d, unsigned long nr_pages)
 {
     struct e820entry *entry, *entry_guest;
     unsigned int i;
     unsigned long pages, cur_pages = 0;
+    uint64_t start, end;
 
     /*
      * Craft the e820 memory map for Dom0 based on the hardware e820 map.
@@ -579,8 +587,19 @@  static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
             continue;
         }
 
-        *entry_guest = *entry;
-        pages = PFN_UP(entry_guest->size);
+        /*
+         * Make sure the start and length are aligned to PAGE_SIZE, because
+         * that's the minimum granularity of the 2nd stage translation.
+         */
+        start = ROUNDUP(entry->addr, PAGE_SIZE);
+        end = (entry->addr + entry->size) & PAGE_MASK;
+        if ( start >= end )
+            continue;
+
+        entry_guest->type = E820_RAM;
+        entry_guest->addr = start;
+        entry_guest->size = end - start;
+        pages = PFN_DOWN(entry_guest->size);
         if ( (cur_pages + pages) > nr_pages )
         {
             /* Truncate region */
@@ -591,6 +610,8 @@  static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
         {
             cur_pages += pages;
         }
+        ASSERT((entry_guest->addr & ~PAGE_MASK) == 0 &&
+               (entry_guest->size & ~PAGE_MASK) == 0);
  next:
         d->arch.nr_e820++;
         entry_guest++;
@@ -1631,7 +1652,7 @@  static int __init construct_dom0_pv(
         dom0_update_physmap(d, pfn, mfn, 0);
 
         pvh_map_all_iomem(d, nr_pages);
-        pvh_setup_e820(d, nr_pages);
+        hvm_setup_e820(d, nr_pages);
     }
 
     if ( d->domain_id == hardware_domid )
@@ -1647,15 +1668,181 @@  out:
     return rc;
 }
 
+/* Helper to convert from bytes into human-readable form. */
+static void __init pretty_print_bytes(uint64_t size)
+{
+    const char* units[] = {"B", "KB", "MB", "GB", "TB"};
+    int i = 0;
+
+    while ( ++i < sizeof(units) && size >= 1024 )
+        size >>= 10; /* size /= 1024 */
+
+    printk("%4" PRIu64 "%2s", size, units[i-1]);
+}
+
+/* Calculate the biggest usable order given a size in bytes. */
+static inline unsigned int get_order(uint64_t size)
+{
+    unsigned int order;
+    uint64_t pg;
+
+    ASSERT((size & ~PAGE_MASK) == 0);
+    pg = PFN_DOWN(size);
+    for ( order = 0; pg >= (1 << (order + 1)); order++ );
+
+    return order;
+}
+
+/* Populate an HVM memory range using the biggest possible order. */
+static void __init hvm_populate_memory_range(struct domain *d, uint64_t start,
+                                             uint64_t size)
+{
+    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
+    unsigned int order;
+    struct page_info *page;
+    int rc;
+
+    ASSERT((size & ~PAGE_MASK) == 0 && (start & ~PAGE_MASK) == 0);
+
+    order = MAX_ORDER;
+    while ( size != 0 )
+    {
+        order = min(get_order(size), order);
+        page = alloc_domheap_pages(d, order, memflags);
+        if ( page == NULL )
+        {
+            if ( order == 0 && memflags )
+            {
+                /* Try again without any memflags. */
+                memflags = 0;
+                order = MAX_ORDER;
+                continue;
+            }
+            if ( order == 0 )
+                panic("Unable to allocate memory with order 0!\n");
+            order--;
+            continue;
+        }
+
+        hvm_mem_stats[order]++;
+        rc = guest_physmap_add_page(d, _gfn(PFN_DOWN(start)),
+                                    _mfn(page_to_mfn(page)), order);
+        if ( rc != 0 )
+            panic("Failed to populate memory: [%" PRIx64 " - %" PRIx64 "] %d\n",
+                  start, start + (((uint64_t)1) << (order + PAGE_SHIFT)), rc);
+        start += ((uint64_t)1) << (order + PAGE_SHIFT);
+        size -= ((uint64_t)1) << (order + PAGE_SHIFT);
+        if ( (size & 0xffffffff) == 0 )
+            process_pending_softirqs();
+    }
+
+}
+
+static int __init hvm_setup_p2m(struct domain *d)
+{
+    struct vcpu *v = d->vcpu[0];
+    unsigned long nr_pages;
+    int i;
+
+    printk("** Preparing memory map **\n");
+
+    /*
+     * Subtract one page for the EPT identity page table and two pages
+     * for the MADT replacement.
+     */
+    nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3;
+
+    hvm_setup_e820(d, nr_pages);
+    paging_set_allocation(d, dom0_paging_pages(d, nr_pages));
+
+    printk("Dom0 memory map:\n");
+    print_e820_memory_map(d->arch.e820, d->arch.nr_e820);
+
+    printk("** Populating memory map **\n");
+    /* Populate memory map. */
+    for ( i = 0; i < d->arch.nr_e820; i++ )
+    {
+        if ( d->arch.e820[i].type != E820_RAM )
+            continue;
+
+        hvm_populate_memory_range(d, d->arch.e820[i].addr,
+                                  d->arch.e820[i].size);
+    }
+
+    printk("Memory allocation stats:\n");
+    for ( i = 0; i <= MAX_ORDER; i++ )
+    {
+        if ( hvm_mem_stats[MAX_ORDER - i] != 0 )
+        {
+            printk("Order %2u: ", MAX_ORDER - i);
+            pretty_print_bytes(((uint64_t)1 << (MAX_ORDER - i + PAGE_SHIFT)) *
+                               hvm_mem_stats[MAX_ORDER - i]);
+            printk("\n");
+        }
+    }
+
+    if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
+    {
+        struct vcpu *saved_current;
+        struct page_info *page;
+        uint32_t *ident_pt;
+
+        /*
+         * Identity-map page table is required for running with CR0.PG=0
+         * when using Intel EPT. Create a 32-bit non-PAE page directory of
+         * superpages.
+         */
+        page = alloc_domheap_pages(d, 0, 0);
+        if ( unlikely(!page) )
+        {
+            printk("Unable to allocate page for identity map\n");
+            return -ENOMEM;
+        }
+
+        saved_current = current;
+        set_current(v);
+
+        ident_pt = __map_domain_page(page);
+        for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
+            ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+                           _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+        unmap_domain_page(ident_pt);
+
+        guest_physmap_add_page(d, _gfn(HVM_IDENT_PT_GFN),
+                               _mfn(page_to_mfn(page)), 0);
+        d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] =
+                                 HVM_IDENT_PT_GFN << PAGE_SHIFT;
+        set_current(saved_current);
+    }
+
+    return 0;
+}
+
 static int __init construct_dom0_hvm(struct domain *d, const module_t *image,
                                      unsigned long image_headroom,
                                      module_t *initrd,
                                      void *(*bootstrap_map)(const module_t *),
                                      char *cmdline)
 {
+    int rc;
 
     printk("** Building a PVH Dom0 **\n");
 
+    /* Sanity! */
+    BUG_ON(d->domain_id != 0);
+    BUG_ON(d->vcpu[0] == NULL);
+
+    process_pending_softirqs();
+
+    iommu_hwdom_init(d);
+
+    rc = hvm_setup_p2m(d);
+    if ( rc )
+    {
+        printk("Failed to setup Dom0 physical memory map\n");
+        return rc;
+    }
+
     return 0;
 }