
[v2,15/30] xen/x86: populate PVHv2 Dom0 physical memory map

Message ID 1474991845-27962-16-git-send-email-roger.pau@citrix.com

Commit Message

Roger Pau Monné Sept. 27, 2016, 3:57 p.m. UTC
Craft the Dom0 e820 memory map and populate it.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
---
Changes since RFC:
 - Use IS_ALIGNED instead of checking with PAGE_MASK.
 - Use the new %pB specifier in order to print sizes in human readable form.
 - Create a VM86 TSS for hardware that doesn't support unrestricted mode.
 - Subtract guest RAM for the identity page table and the VM86 TSS.
 - Split the creation of the unrestricted mode helper structures to a
   separate function.
 - Use preemption with paging_set_allocation.
 - Use get_order_from_bytes_floor.
---
 xen/arch/x86/domain_build.c | 257 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 251 insertions(+), 6 deletions(-)

Comments

Jan Beulich Sept. 30, 2016, 3:52 p.m. UTC | #1
>>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote:
> @@ -43,6 +44,11 @@ static long __initdata dom0_nrpages;
>  static long __initdata dom0_min_nrpages;
>  static long __initdata dom0_max_nrpages = LONG_MAX;
>  
> +/* Size of the VM86 TSS for virtual 8086 mode to use. */
> +#define HVM_VM86_TSS_SIZE   128
> +
> +static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1];

This is for your debugging only I suppose?

> @@ -336,7 +343,8 @@ static unsigned long __init compute_dom0_nr_pages(
>          avail -= dom0_paging_pages(d, nr_pages);
>      }
>  
> -    if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
> +    if ( is_pv_domain(d) &&
> +         (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&

Perhaps better to simply force parms->p2m_base to UNSET_ADDR
earlier on?

> @@ -579,8 +588,19 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
>              continue;
>          }
>  
> -        *entry_guest = *entry;
> -        pages = PFN_UP(entry_guest->size);
> +        /*
> +         * Make sure the start and length are aligned to PAGE_SIZE, because
> +         * that's the minimum granularity of the 2nd stage translation.
> +         */
> +        start = ROUNDUP(entry->addr, PAGE_SIZE);
> +        end = (entry->addr + entry->size) & PAGE_MASK;
> +        if ( start >= end )
> +            continue;
> +
> +        entry_guest->type = E820_RAM;
> +        entry_guest->addr = start;
> +        entry_guest->size = end - start;
> +        pages = PFN_DOWN(entry_guest->size);
>          if ( (cur_pages + pages) > nr_pages )
>          {
>              /* Truncate region */
> @@ -591,6 +611,8 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
>          {
>              cur_pages += pages;
>          }
> +        ASSERT(IS_ALIGNED(entry_guest->addr, PAGE_SIZE) &&
> +               IS_ALIGNED(entry_guest->size, PAGE_SIZE));

What does this guard against? Your addition arranges for things to
be page aligned, and the only adjustment done until we get here is
one that obviously also doesn't violate that requirement. I'm all for
assertions when they check state which is not obviously right, but
here I don't see the need.

> @@ -1657,15 +1679,238 @@ out:
>      return rc;
>  }
>  
> +/* Populate an HVM memory range using the biggest possible order. */
> +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start,
> +                                             uint64_t size)
> +{
> +    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
> +    unsigned int order;
> +    struct page_info *page;
> +    int rc;
> +
> +    ASSERT(IS_ALIGNED(size, PAGE_SIZE) && IS_ALIGNED(start, PAGE_SIZE));
> +
> +    order = MAX_ORDER;
> +    while ( size != 0 )
> +    {
> +        order = min(get_order_from_bytes_floor(size), order);
> +        page = alloc_domheap_pages(d, order, memflags);
> +        if ( page == NULL )
> +        {
> +            if ( order == 0 && memflags )
> +            {
> +                /* Try again without any memflags. */
> +                memflags = 0;
> +                order = MAX_ORDER;
> +                continue;
> +            }
> +            if ( order == 0 )
> +                panic("Unable to allocate memory with order 0!\n");
> +            order--;
> +            continue;
> +        }

Is it not possible to utilize alloc_chunk() here?

> +        hvm_mem_stats[order]++;
> +        rc = guest_physmap_add_page(d, _gfn(PFN_DOWN(start)),
> +                                    _mfn(page_to_mfn(page)), order);
> +        if ( rc != 0 )
> +            panic("Failed to populate memory: [%" PRIx64 " - %" PRIx64 "] %d\n",

[<start>,<end>) please.

> +                  start, start + (((uint64_t)1) << (order + PAGE_SHIFT)), rc);
> +        start += ((uint64_t)1) << (order + PAGE_SHIFT);
> +        size -= ((uint64_t)1) << (order + PAGE_SHIFT);

Please prefer 1ULL over (uint64_t)1.

> +        if ( (size & 0xffffffff) == 0 )
> +            process_pending_softirqs();

That's 4Gb at a time - isn't that a little too much?

> +    }
> +
> +}

Stray blank line.

> +static int __init hvm_setup_vmx_unrestricted_guest(struct domain *d)
> +{
> +    struct e820entry *entry;
> +    p2m_type_t p2mt;
> +    uint32_t rc, *ident_pt;
> +    uint8_t *tss;
> +    mfn_t mfn;
> +    paddr_t gaddr = 0;
> +    int i;

unsigned int

> +    /*
> +     * Stole some space from the last found RAM region. One page will be

Steal

> +     * used for the identify page tables, and the remaining space for the

identity

> +     * VM86 TSS. Note that after this not all e820 regions will be aligned
> +     * to PAGE_SIZE.
> +     */
> +    for ( i = 1; i <= d->arch.nr_e820; i++ )
> +    {
> +        entry = &d->arch.e820[d->arch.nr_e820 - i];
> +        if ( entry->type != E820_RAM ||
> +             entry->size < PAGE_SIZE + HVM_VM86_TSS_SIZE )
> +            continue;
> +
> +        entry->size -= PAGE_SIZE + HVM_VM86_TSS_SIZE;
> +        gaddr = entry->addr + entry->size;
> +        break;
> +    }
> +
> +    if ( gaddr == 0 || gaddr < MB(1) )
> +    {
> +        printk("Unable to find memory to stash the identity map and TSS\n");
> +        return -ENOMEM;

One function up you panic() on error - please be consistent. Also for
one of the other patches I think we figured that the TSS isn't really
required, so please only warn in that case.

> +    }
> +
> +    /*
> +     * Identity-map page table is required for running with CR0.PG=0
> +     * when using Intel EPT. Create a 32-bit non-PAE page directory of
> +     * superpages.
> +     */
> +    tss = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)),
> +                         &mfn, &p2mt, 0, &rc);

Comment and operation don't really fit together.

> +static int __init hvm_setup_p2m(struct domain *d)
> +{
> +    struct vcpu *saved_current, *v = d->vcpu[0];
> +    unsigned long nr_pages;
> +    int i, rc, preempted;
> +
> +    printk("** Preparing memory map **\n");

Debugging leftover again?

> +    /*
> +     * Subtract one page for the EPT identity page table and two pages
> +     * for the MADT replacement.
> +     */
> +    nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3;

How do you know the MADT replacement requires two pages? Isn't
that CPU-count dependent? And doesn't the partial page used for
the TSS also need accounting for here?

> +    hvm_setup_e820(d, nr_pages);
> +    do {
> +        preempted = 0;
> +        paging_set_allocation(d, dom0_paging_pages(d, nr_pages),
> +                              &preempted);
> +        process_pending_softirqs();
> +    } while ( preempted );
> +
> +    /*
> +     * Special treatment for memory < 1MB:
> +     *  - Copy the data in e820 regions marked as RAM (BDA, EBDA...).
> +     *  - Map everything else as 1:1.
> +     * NB: all this only makes sense if booted from legacy BIOSes.
> +     */
> +    rc = modify_mmio_11(d, 0, PFN_DOWN(MB(1)), true);
> +    if ( rc )
> +    {
> +        printk("Failed to map low 1MB 1:1: %d\n", rc);
> +        return rc;
> +    }
> +
> +    printk("** Populating memory map **\n");
> +    /* Populate memory map. */
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        if ( d->arch.e820[i].type != E820_RAM )
> +            continue;
> +
> +        hvm_populate_memory_range(d, d->arch.e820[i].addr,
> +                                  d->arch.e820[i].size);
> +        if ( d->arch.e820[i].addr < MB(1) )
> +        {
> +            unsigned long end = min_t(unsigned long,
> +                            d->arch.e820[i].addr + d->arch.e820[i].size, MB(1));
> +
> +            saved_current = current;
> +            set_current(v);
> +            rc = hvm_copy_to_guest_phys(d->arch.e820[i].addr,
> +                                        maddr_to_virt(d->arch.e820[i].addr),
> +                                        end - d->arch.e820[i].addr);
> +            set_current(saved_current);
> +            if ( rc != HVMCOPY_okay )
> +            {
> +                printk("Unable to copy RAM region %#lx - %#lx\n",
> +                       d->arch.e820[i].addr, end);
> +                return -EFAULT;
> +            }
> +        }
> +    }
> +
> +    printk("Memory allocation stats:\n");
> +    for ( i = 0; i <= MAX_ORDER; i++ )
> +    {
> +        if ( hvm_mem_stats[MAX_ORDER - i] != 0 )
> +            printk("Order %2u: %pZ\n", MAX_ORDER - i,
> +                   _p(((uint64_t)1 << (MAX_ORDER - i + PAGE_SHIFT)) *
> +                      hvm_mem_stats[MAX_ORDER - i]));
> +    }
> +
> +    if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
> +    {
> +        /*
> +         * Since Dom0 cannot be migrated, we will only setup the
> +         * unrestricted guest helpers if they are needed by the current
> +         * hardware we are running on.
> +         */
> +        rc = hvm_setup_vmx_unrestricted_guest(d);

Calling a function of this name inside an if() checking for
!vmx_unrestricted_guest() is, well, odd.

>  static int __init construct_dom0_hvm(struct domain *d, const module_t *image,
>                                       unsigned long image_headroom,
>                                       module_t *initrd,
>                                       void *(*bootstrap_map)(const module_t *),
>                                       char *cmdline)
>  {
> +    int rc;
>  
>      printk("** Building a PVH Dom0 **\n");
>  
> +    /* Sanity! */
> +    BUG_ON(d->domain_id != 0);
> +    BUG_ON(d->vcpu[0] == NULL);

May I suggest

    BUG_ON(d->domain_id);
    BUG_ON(!d->vcpu[0]);

in cases like this?

> +    process_pending_softirqs();

Why, outside of any loop?

Jan
Roger Pau Monné Oct. 4, 2016, 9:12 a.m. UTC | #2
On Fri, Sep 30, 2016 at 09:52:56AM -0600, Jan Beulich wrote:
> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote:
> > @@ -43,6 +44,11 @@ static long __initdata dom0_nrpages;
> >  static long __initdata dom0_min_nrpages;
> >  static long __initdata dom0_max_nrpages = LONG_MAX;
> >  
> > +/* Size of the VM86 TSS for virtual 8086 mode to use. */
> > +#define HVM_VM86_TSS_SIZE   128
> > +
> > +static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1];
> 
> This is for your debugging only I suppose?

Probably, I wasn't sure if it was relevant so I left it here. Would it make 
sense to only print this for debug builds of the hypervisor? Or better to 
just remove it?

> > @@ -336,7 +343,8 @@ static unsigned long __init compute_dom0_nr_pages(
> >          avail -= dom0_paging_pages(d, nr_pages);
> >      }
> >  
> > -    if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
> > +    if ( is_pv_domain(d) &&
> > +         (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
> 
> Perhaps better to simply force parms->p2m_base to UNSET_ADDR
> earlier on?

p2m_base is already unconditionally set to UNSET_ADDR for PVHv2 domains, 
hence the added is_pv_domain check in order to make sure that PVHv2 guests 
don't get into that branch, which AFAICT is only relevant to PV guests.

> > @@ -579,8 +588,19 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
> >              continue;
> >          }
> >  
> > -        *entry_guest = *entry;
> > -        pages = PFN_UP(entry_guest->size);
> > +        /*
> > +         * Make sure the start and length are aligned to PAGE_SIZE, because
> > +         * that's the minimum granularity of the 2nd stage translation.
> > +         */
> > +        start = ROUNDUP(entry->addr, PAGE_SIZE);
> > +        end = (entry->addr + entry->size) & PAGE_MASK;
> > +        if ( start >= end )
> > +            continue;
> > +
> > +        entry_guest->type = E820_RAM;
> > +        entry_guest->addr = start;
> > +        entry_guest->size = end - start;
> > +        pages = PFN_DOWN(entry_guest->size);
> >          if ( (cur_pages + pages) > nr_pages )
> >          {
> >              /* Truncate region */
> > @@ -591,6 +611,8 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
> >          {
> >              cur_pages += pages;
> >          }
> > +        ASSERT(IS_ALIGNED(entry_guest->addr, PAGE_SIZE) &&
> > +               IS_ALIGNED(entry_guest->size, PAGE_SIZE));
> 
> What does this guard against? Your addition arranges for things to
> be page aligned, and the only adjustment done until we get here is
> one that obviously also doesn't violate that requirement. I'm all for
> assertions when they check state which is not obviously right, but
> here I don't see the need.

Right, I'm going to remove it. I've added more seat belts than needed when 
testing this and forgot to remove them.

> > @@ -1657,15 +1679,238 @@ out:
> >      return rc;
> >  }
> >  
> > +/* Populate an HVM memory range using the biggest possible order. */
> > +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start,
> > +                                             uint64_t size)
> > +{
> > +    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
> > +    unsigned int order;
> > +    struct page_info *page;
> > +    int rc;
> > +
> > +    ASSERT(IS_ALIGNED(size, PAGE_SIZE) && IS_ALIGNED(start, PAGE_SIZE));
> > +
> > +    order = MAX_ORDER;
> > +    while ( size != 0 )
> > +    {
> > +        order = min(get_order_from_bytes_floor(size), order);
> > +        page = alloc_domheap_pages(d, order, memflags);
> > +        if ( page == NULL )
> > +        {
> > +            if ( order == 0 && memflags )
> > +            {
> > +                /* Try again without any memflags. */
> > +                memflags = 0;
> > +                order = MAX_ORDER;
> > +                continue;
> > +            }
> > +            if ( order == 0 )
> > +                panic("Unable to allocate memory with order 0!\n");
> > +            order--;
> > +            continue;
> > +        }
> 
> Is it not possible to utilize alloc_chunk() here?

Not really, alloc_chunk will do a ceil calculation of the number of needed 
pages, which means we end up allocating bigger chunks than needed and then 
free them. I prefer this approach, since we always request the exact memory 
that's required, so there's no need to free leftover pages.
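
Just to spell out the floor semantics being relied on here (sketch only, not
the actual helper introduced earlier in the series): it returns the largest
order whose chunk still fits in the remaining size, so the loop never
allocates past the end of the region:

    /*
     * Illustration only: the largest order with (PAGE_SIZE << order) <= size,
     * capped at MAX_ORDER as the caller effectively does via min().
     */
    static unsigned int get_order_floor_sketch(uint64_t size)
    {
        unsigned int order = 0;

        while ( order < MAX_ORDER && (PAGE_SIZE << (order + 1)) <= size )
            order++;

        return order;
    }

E.g. for a 6MB remaining chunk this yields order 10 (4MB) and then order 9
(2MB), with no leftover pages to free.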

> > +        hvm_mem_stats[order]++;
> > +        rc = guest_physmap_add_page(d, _gfn(PFN_DOWN(start)),
> > +                                    _mfn(page_to_mfn(page)), order);
> > +        if ( rc != 0 )
> > +            panic("Failed to populate memory: [%" PRIx64 " - %" PRIx64 "] %d\n",
> 
> [<start>,<end>) please.
> 
> > +                  start, start + (((uint64_t)1) << (order + PAGE_SHIFT)), rc);
> > +        start += ((uint64_t)1) << (order + PAGE_SHIFT);
> > +        size -= ((uint64_t)1) << (order + PAGE_SHIFT);
> 
> Please prefer 1ULL over (uint64_t)1.
> 
> > +        if ( (size & 0xffffffff) == 0 )
> > +            process_pending_softirqs();
> 
> That's 4Gb at a time - isn't that a little too much?

Hm, it's the same that's used in pvh_add_mem_mapping AFAICT. I could reduce 
it to 0xfffffff, but I'm also wondering if it makes sense to just call it on 
each iteration, since we are possibly mapping regions with big orders here.

> > +    }
> > +
> > +}
> 
> Stray blank line.
> 
> > +static int __init hvm_setup_vmx_unrestricted_guest(struct domain *d)
> > +{
> > +    struct e820entry *entry;
> > +    p2m_type_t p2mt;
> > +    uint32_t rc, *ident_pt;
> > +    uint8_t *tss;
> > +    mfn_t mfn;
> > +    paddr_t gaddr = 0;
> > +    int i;
> 
> unsigned int
> 
> > +    /*
> > +     * Stole some space from the last found RAM region. One page will be
> 
> Steal
> 
> > +     * used for the identify page tables, and the remaining space for the
> 
> identity
> 
> > +     * VM86 TSS. Note that after this not all e820 regions will be aligned
> > +     * to PAGE_SIZE.
> > +     */
> > +    for ( i = 1; i <= d->arch.nr_e820; i++ )
> > +    {
> > +        entry = &d->arch.e820[d->arch.nr_e820 - i];
> > +        if ( entry->type != E820_RAM ||
> > +             entry->size < PAGE_SIZE + HVM_VM86_TSS_SIZE )
> > +            continue;
> > +
> > +        entry->size -= PAGE_SIZE + HVM_VM86_TSS_SIZE;
> > +        gaddr = entry->addr + entry->size;
> > +        break;
> > +    }
> > +
> > +    if ( gaddr == 0 || gaddr < MB(1) )
> > +    {
> > +        printk("Unable to find memory to stash the identity map and TSS\n");
> > +        return -ENOMEM;
> 
> One function up you panic() on error - please be consistent. Also for
> one of the other patches I think we figured that the TSS isn't really
> required, so please only warn in that case.
> 
> > +    }
> > +
> > +    /*
> > +     * Identity-map page table is required for running with CR0.PG=0
> > +     * when using Intel EPT. Create a 32-bit non-PAE page directory of
> > +     * superpages.
> > +     */
> > +    tss = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)),
> > +                         &mfn, &p2mt, 0, &rc);
> 
> Comment and operation don't really fit together.
> 
> > +static int __init hvm_setup_p2m(struct domain *d)
> > +{
> > +    struct vcpu *saved_current, *v = d->vcpu[0];
> > +    unsigned long nr_pages;
> > +    int i, rc, preempted;
> > +
> > +    printk("** Preparing memory map **\n");
> 
> Debugging leftover again?

Should this be merged with the next label, so that it reads as "Preparing 
and populating the memory map"?

> > +    /*
> > +     * Subtract one page for the EPT identity page table and two pages
> > +     * for the MADT replacement.
> > +     */
> > +    nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3;
> 
> How do you know the MADT replacement requires two pages? Isn't
> that CPU-count dependent? And doesn't the partial page used for
> the TSS also need accounting for here?

Yes, it's CPU-count dependent. This is just an approximation, since we only 
support up to 256 CPUs on HVM guests, and each Processor Local APIC entry is 
8 bytes, this means that the CPU-related data is going to use up to 2048 
bytes of data, which still leaves plenty of space for the IO APIC and the 
Interrupt Source Override entries. We requests two pages in case the 
original MADT crosses a page boundary. FWIW, I could also fetch the original 
MADT size earlier and use that as the upper bound here for memory 
reservation.

In the RFC series we also spoke about placing the MADT in a different 
position than the native one, which would mean that we would end up stealing 
some space from a RAM region in order to place it, so that we wouldn't have 
to do this accounting.

In fact this is slightly wrong: there's no need to account for the 
identity page table or the VM86 TSS, since we end up stealing that space 
from already populated RAM regions. At the moment we only need to account for the 2 
pages that could be used by the MADT.
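
For reference, the 2 page figure is based on a bound along these lines (the
helper below is made up for illustration; the structure names are the ACPICA
ones used in Xen):

    /* Illustration only, not part of the patch. */
    static unsigned long __init madt_pages_needed(unsigned int max_vcpus)
    {
        /* 256 vCPUs * 8 bytes per Processor Local APIC entry = 2048 bytes. */
        unsigned long bytes = sizeof(struct acpi_table_madt) +
                              max_vcpus * sizeof(struct acpi_madt_local_apic);

        /*
         * The IO APIC and Interrupt Source Override entries easily fit in
         * the rest of the first page; the extra page covers a copy that
         * crosses a page boundary.
         */
        return PFN_UP(bytes) + 1;
    }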

> > +    hvm_setup_e820(d, nr_pages);
> > +    do {
> > +        preempted = 0;
> > +        paging_set_allocation(d, dom0_paging_pages(d, nr_pages),
> > +                              &preempted);
> > +        process_pending_softirqs();
> > +    } while ( preempted );
> > +
> > +    /*
> > +     * Special treatment for memory < 1MB:
> > +     *  - Copy the data in e820 regions marked as RAM (BDA, EBDA...).
> > +     *  - Map everything else as 1:1.
> > +     * NB: all this only makes sense if booted from legacy BIOSes.
> > +     */
> > +    rc = modify_mmio_11(d, 0, PFN_DOWN(MB(1)), true);
> > +    if ( rc )
> > +    {
> > +        printk("Failed to map low 1MB 1:1: %d\n", rc);
> > +        return rc;
> > +    }
> > +
> > +    printk("** Populating memory map **\n");
> > +    /* Populate memory map. */
> > +    for ( i = 0; i < d->arch.nr_e820; i++ )
> > +    {
> > +        if ( d->arch.e820[i].type != E820_RAM )
> > +            continue;
> > +
> > +        hvm_populate_memory_range(d, d->arch.e820[i].addr,
> > +                                  d->arch.e820[i].size);
> > +        if ( d->arch.e820[i].addr < MB(1) )
> > +        {
> > +            unsigned long end = min_t(unsigned long,
> > +                            d->arch.e820[i].addr + d->arch.e820[i].size, MB(1));
> > +
> > +            saved_current = current;
> > +            set_current(v);
> > +            rc = hvm_copy_to_guest_phys(d->arch.e820[i].addr,
> > +                                        maddr_to_virt(d->arch.e820[i].addr),
> > +                                        end - d->arch.e820[i].addr);
> > +            set_current(saved_current);
> > +            if ( rc != HVMCOPY_okay )
> > +            {
> > +                printk("Unable to copy RAM region %#lx - %#lx\n",
> > +                       d->arch.e820[i].addr, end);
> > +                return -EFAULT;
> > +            }
> > +        }
> > +    }
> > +
> > +    printk("Memory allocation stats:\n");
> > +    for ( i = 0; i <= MAX_ORDER; i++ )
> > +    {
> > +        if ( hvm_mem_stats[MAX_ORDER - i] != 0 )
> > +            printk("Order %2u: %pZ\n", MAX_ORDER - i,
> > +                   _p(((uint64_t)1 << (MAX_ORDER - i + PAGE_SHIFT)) *
> > +                      hvm_mem_stats[MAX_ORDER - i]));
> > +    }
> > +
> > +    if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
> > +    {
> > +        /*
> > +         * Since Dom0 cannot be migrated, we will only setup the
> > +         * unrestricted guest helpers if they are needed by the current
> > +         * hardware we are running on.
> > +         */
> > +        rc = hvm_setup_vmx_unrestricted_guest(d);
> 
> Calling a function of this name inside an if() checking for
> !vmx_unrestricted_guest() is, well, odd.

Yes, that's right. What about calling it hvm_setup_vmx_realmode_helpers?

> >  static int __init construct_dom0_hvm(struct domain *d, const module_t *image,
> >                                       unsigned long image_headroom,
> >                                       module_t *initrd,
> >                                       void *(*bootstrap_map)(const module_t *),
> >                                       char *cmdline)
> >  {
> > +    int rc;
> >  
> >      printk("** Building a PVH Dom0 **\n");
> >  
> > +    /* Sanity! */
> > +    BUG_ON(d->domain_id != 0);
> > +    BUG_ON(d->vcpu[0] == NULL);
> 
> May I suggest
> 
>     BUG_ON(d->domain_id);
>     BUG_ON(!d->vcpu[0]);
> 
> in cases like this?

Yes, I have the tendency to not use '!' or perform direct checks unless it's 
a boolean type.

> > +    process_pending_softirqs();
> 
> Why, outside of any loop?

It's the same that's done in construct_dom0_pv, so I thought that it was a 
good idea to drain any pending softirqs before starting domain build.

Roger.
Jan Beulich Oct. 4, 2016, 11:16 a.m. UTC | #3
>>> On 04.10.16 at 11:12, <roger.pau@citrix.com> wrote:
> On Fri, Sep 30, 2016 at 09:52:56AM -0600, Jan Beulich wrote:
>> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote:
>> > @@ -43,6 +44,11 @@ static long __initdata dom0_nrpages;
>> >  static long __initdata dom0_min_nrpages;
>> >  static long __initdata dom0_max_nrpages = LONG_MAX;
>> >  
>> > +/* Size of the VM86 TSS for virtual 8086 mode to use. */
>> > +#define HVM_VM86_TSS_SIZE   128
>> > +
>> > +static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1];
>> 
>> This is for your debugging only I suppose?
> 
> Probably, I wasn't sure if it was relevant so I left it here. Would it make 
> sense to only print this for debug builds of the hypervisor? Or better to 
> just remove it?

Statistics in debug builds often aren't really meaningful, so my order
of preference would be to remove it, then to keep it for all cases but
default it to be silent in non-debug builds (with a command line option
to enable).
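
I.e. something along these lines (the option name here is just made up):

    /* Illustration only - the option name is invented. */
    static bool_t __initdata opt_dom0_mem_stats;
    boolean_param("dom0-mem-stats", opt_dom0_mem_stats);

    /* ... and in hvm_setup_p2m(): */
    if ( opt_dom0_mem_stats )
    {
        printk("Memory allocation stats:\n");
        for ( i = 0; i <= MAX_ORDER; i++ )
            if ( hvm_mem_stats[MAX_ORDER - i] )
                printk("Order %2u: %u allocations\n", MAX_ORDER - i,
                       hvm_mem_stats[MAX_ORDER - i]);
    }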

>> > @@ -336,7 +343,8 @@ static unsigned long __init compute_dom0_nr_pages(
>> >          avail -= dom0_paging_pages(d, nr_pages);
>> >      }
>> >  
>> > -    if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
>> > +    if ( is_pv_domain(d) &&
>> > +         (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
>> 
>> Perhaps better to simply force parms->p2m_base to UNSET_ADDR
>> earlier on?
> 
> p2m_base is already unconditionally set to UNSET_ADDR for PVHv2 domains, 
> hence the added is_pv_domain check in order to make sure that PVHv2 guests 
> don't get into that branch, which AFAICT is only relevant to PV guests.

This reads as contradictory: If it's set to UNSET_ADDR, why the extra
check?

>> > @@ -1657,15 +1679,238 @@ out:
>> >      return rc;
>> >  }
>> >  
>> > +/* Populate an HVM memory range using the biggest possible order. */
>> > +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start,
>> > +                                             uint64_t size)
>> > +{
>> > +    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
>> > +    unsigned int order;
>> > +    struct page_info *page;
>> > +    int rc;
>> > +
>> > +    ASSERT(IS_ALIGNED(size, PAGE_SIZE) && IS_ALIGNED(start, PAGE_SIZE));
>> > +
>> > +    order = MAX_ORDER;
>> > +    while ( size != 0 )
>> > +    {
>> > +        order = min(get_order_from_bytes_floor(size), order);
>> > +        page = alloc_domheap_pages(d, order, memflags);
>> > +        if ( page == NULL )
>> > +        {
>> > +            if ( order == 0 && memflags )
>> > +            {
>> > +                /* Try again without any memflags. */
>> > +                memflags = 0;
>> > +                order = MAX_ORDER;
>> > +                continue;
>> > +            }
>> > +            if ( order == 0 )
>> > +                panic("Unable to allocate memory with order 0!\n");
>> > +            order--;
>> > +            continue;
>> > +        }
>> 
>> Is it not possible to utilize alloc_chunk() here?
> 
> Not really, alloc_chunk will do a ceil calculation of the number of needed 
> pages, which means we end up allocating bigger chunks than needed and then 
> free them. I prefer this approach, since we always request the exact memory 
> that's required, so there's no need to free leftover pages.

Hmm, in that case at least some of the shared logic would be nice to
get abstracted out.

>> > +        if ( (size & 0xffffffff) == 0 )
>> > +            process_pending_softirqs();
>> 
>> That's 4Gb at a time - isn't that a little too much?
> 
> Hm, it's the same that's used in pvh_add_mem_mapping AFAICT. I could reduce 
> it to 0xfffffff, but I'm also wondering if it makes sense to just call it on 
> each iteration, since we are possibly mapping regions with big orders here.

Iteration count is all that matters here really; the size of the mapping
wouldn't normally matter (as long as it's one of the hardware supported sizes).
Doing the check on every iteration may be a little much (you may want
to check whether there's noticeable extra overhead), but doing the
check on like every 64 iterations may limit overhead enough to be
acceptable without making this more sophisticated.

>> > +static int __init hvm_setup_p2m(struct domain *d)
>> > +{
>> > +    struct vcpu *saved_current, *v = d->vcpu[0];
>> > +    unsigned long nr_pages;
>> > +    int i, rc, preempted;
>> > +
>> > +    printk("** Preparing memory map **\n");
>> 
>> Debugging leftover again?
> 
> Should this be merged with the next label, so that it reads as "Preparing 
> and populating the memory map"?

No, I'd rather see the other(s) gone too.

>> > +    /*
>> > +     * Subtract one page for the EPT identity page table and two pages
>> > +     * for the MADT replacement.
>> > +     */
>> > +    nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3;
>> 
>> How do you know the MADT replacement requires two pages? Isn't
>> that CPU-count dependent? And doesn't the partial page used for
>> the TSS also need accounting for here?
> 
> Yes, it's CPU-count dependent. This is just an approximation, since we only 
> support up to 256 CPUs on HVM guests, and each Processor Local APIC entry is 
> 8 bytes, so the CPU-related data is going to use up to 2048 
> bytes of data, which still leaves plenty of space for the IO APIC and the 
> Interrupt Source Override entries. We request two pages in case the 
> original MADT crosses a page boundary. FWIW, I could also fetch the original 
> MADT size earlier and use that as the upper bound here for memory 
> reservation.

That wouldn't help in case someone wants more vCPU-s than there
are pCPU-s. And baking in another assumption of there being <=
128 vCPU-s when there's already work being done to eliminate that
limit is likely not too good an idea.

> In the RFC series we also spoke about placing the MADT in a different 
> position than the native one, which would mean that we would end up stealing 
> some space from a RAM region in order to place it, so that we wouldn't have 
> to do this accounting.

Putting the new MADT at the same address as the old one won't work
anyway, again because possibly vCPU-s > pCPU-s.

>> > +    if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
>> > +    {
>> > +        /*
>> > +         * Since Dom0 cannot be migrated, we will only setup the
>> > +         * unrestricted guest helpers if they are needed by the current
>> > +         * hardware we are running on.
>> > +         */
>> > +        rc = hvm_setup_vmx_unrestricted_guest(d);
>> 
>> Calling a function of this name inside an if() checking for
>> !vmx_unrestricted_guest() is, well, odd.
> 
> Yes, that's right. What about calling it hvm_setup_vmx_realmode_helpers?

Sounds quite a bit more reasonable.

>> > +    process_pending_softirqs();
>> 
>> Why, outside of any loop?
> 
> It's the same that's done in construct_dom0_pv, so I thought that it was a 
> good idea to drain any pending softirqs before starting domain build.

Perhaps in that case it should be pulled out of there into the
wrapper?

Jan
Roger Pau Monné Oct. 11, 2016, 2:01 p.m. UTC | #4
On Tue, Oct 04, 2016 at 05:16:09AM -0600, Jan Beulich wrote:
> >>> On 04.10.16 at 11:12, <roger.pau@citrix.com> wrote:
> > On Fri, Sep 30, 2016 at 09:52:56AM -0600, Jan Beulich wrote:
> >> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote:
> >> > @@ -336,7 +343,8 @@ static unsigned long __init compute_dom0_nr_pages(
> >> >          avail -= dom0_paging_pages(d, nr_pages);
> >> >      }
> >> >  
> >> > -    if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
> >> > +    if ( is_pv_domain(d) &&
> >> > +         (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
> >> 
> >> Perhaps better to simply force parms->p2m_base to UNSET_ADDR
> >> earlier on?
> > 
> > p2m_base is already unconditionally set to UNSET_ADDR for PVHv2 domains, 
> > hence the added is_pv_domain check in order to make sure that PVHv2 guests 
> > don't get into that branch, which AFAICT is only relevant to PV guests.
> 
> This reads as contradictory: If it's set to UNSET_ADDR, why the extra
> check?

On PVHv2 p2m_base == UNSET_ADDR already, so the extra check is to make sure 
PVHv2 guests don't execute the code inside of the if condition, which is 
for classic PV guests (note that the check is not against != UNSET_ADDR). Or 
maybe I'm missing something?

> >> > @@ -1657,15 +1679,238 @@ out:
> >> >      return rc;
> >> >  }
> >> >  
> >> > +/* Populate an HVM memory range using the biggest possible order. */
> >> > +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start,
> >> > +                                             uint64_t size)
> >> > +{
> >> > +    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
> >> > +    unsigned int order;
> >> > +    struct page_info *page;
> >> > +    int rc;
> >> > +
> >> > +    ASSERT(IS_ALIGNED(size, PAGE_SIZE) && IS_ALIGNED(start, PAGE_SIZE));
> >> > +
> >> > +    order = MAX_ORDER;
> >> > +    while ( size != 0 )
> >> > +    {
> >> > +        order = min(get_order_from_bytes_floor(size), order);
> >> > +        page = alloc_domheap_pages(d, order, memflags);
> >> > +        if ( page == NULL )
> >> > +        {
> >> > +            if ( order == 0 && memflags )
> >> > +            {
> >> > +                /* Try again without any memflags. */
> >> > +                memflags = 0;
> >> > +                order = MAX_ORDER;
> >> > +                continue;
> >> > +            }
> >> > +            if ( order == 0 )
> >> > +                panic("Unable to allocate memory with order 0!\n");
> >> > +            order--;
> >> > +            continue;
> >> > +        }
> >> 
> >> Is it not possible to utilize alloc_chunk() here?
> > 
> > Not really, alloc_chunk will do a ceil calculation of the number of needed 
> > pages, which means we end up allocating bigger chunks than needed and then 
> > free them. I prefer this approach, since we always request the exact memory 
> > that's required, so there's no need to free leftover pages.
> 
> Hmm, in that case at least some of the shared logic would be nice to
> get abstracted out.

TBH, I don't see much benefit in that, alloc_chunk works fine for PV guests 
because Xen ends up walking the list of returned pages and mapping them one 
to one. This is IMHO not what should be done for PVH guests, and instead the 
caller needs to know the actual order of the allocated chunk, so it can pass 
it to guest_physmap_add_page. ATM it's a very simple function that's clear; 
if I start mixing up bits of alloc_chunk it's going to get more complex, and 
I would like to avoid that, so that PVH Dom0 build doesn't end up like 
current PV Dom0 build.

> >> > +        if ( (size & 0xffffffff) == 0 )
> >> > +            process_pending_softirqs();
> >> 
> >> That's 4Gb at a time - isn't that a little too much?
> > 
> > Hm, it's the same that's used in pvh_add_mem_mapping AFAICT. I could reduce 
> > it to 0xfffffff, but I'm also wondering if it makes sense to just call it on 
> > each iteration, since we are possibly mapping regions with big orders here.
> 
> Iteration count is all that matters here really; the size of the mapping
> wouldn't normally matter (as long as it's one of the hardware supported sizes).
> Doing the check on every iteration may be a little much (you may want
> to check whether there's noticeable extra overhead), but doing the
> check on like every 64 iterations may limit overhead enough to be
> acceptable without making this more sophisticated.

Ack, I've done it using a define, so we can always change that 64 to 
something else if the need arises.
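
I.e. roughly (the name of the define isn't final, and the elided body is the
allocation/mapping done before):

    #define MAX_POPULATE_ITER_SOFTIRQ 64

        unsigned int iter = 0;

        while ( size != 0 )
        {
            /* ... allocate a chunk and add it to the p2m, as before ... */
            if ( !(++iter % MAX_POPULATE_ITER_SOFTIRQ) )
                process_pending_softirqs();
        }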

> >> > +static int __init hvm_setup_p2m(struct domain *d)
> >> > +{
> >> > +    struct vcpu *saved_current, *v = d->vcpu[0];
> >> > +    unsigned long nr_pages;
> >> > +    int i, rc, preempted;
> >> > +
> >> > +    printk("** Preparing memory map **\n");
> >> 
> >> Debugging leftover again?
> > 
> > Should this be merged with the next label, so that it reads as "Preparing 
> > and populating the memory map"?
> 
> No, I'd rather see the other(s) gone too.

Done, I've just left the initial "Building a PVH Dom0" message.
 
> >> > +    /*
> >> > +     * Subtract one page for the EPT identity page table and two pages
> >> > +     * for the MADT replacement.
> >> > +     */
> >> > +    nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3;
> >> 
> >> How do you know the MADT replacement requires two pages? Isn't
> >> that CPU-count dependent? And doesn't the partial page used for
> >> the TSS also need accounting for here?
> > 
> > Yes, it's CPU-count dependent. This is just an approximation, since we only 
> > support up to 256 CPUs on HVM guests, and each Processor Local APIC entry is 
> > 8 bytes, so the CPU-related data is going to use up to 2048 
> > bytes of data, which still leaves plenty of space for the IO APIC and the 
> > Interrupt Source Override entries. We request two pages in case the 
> > original MADT crosses a page boundary. FWIW, I could also fetch the original 
> > MADT size earlier and use that as the upper bound here for memory 
> > reservation.
> 
> That wouldn't help in case someone wants more vCPU-s than there
> are pCPU-s. And baking in another assumption of there being <=
> 128 vCPU-s when there's already work being done to eliminate that
> limit is likely not too good an idea.

Right, I've now removed all this, since for the MADT we are going to steal 
RAM pages.

> > In the RFC series we also spoke about placing the MADT in a different 
> > position than the native one, which would mean that we would end up stealing 
> > some space from a RAM region in order to place it, so that we wouldn't have 
> > to do this accounting.
> 
> Putting the new MADT at the same address as the old one won't work
> anyway, again because possibly vCPU-s > pCPU-s.

Yes, although from my tests doing CPU over-allocation on HVM guests works 
very badly.

> >> > +    process_pending_softirqs();
> >> 
> >> Why, outside of any loop?
> > 
> > It's the same that's done in construct_dom0_pv, so I thought that it was a 
> > good idea to drain any pending softirqs before starting domain build.
> 
> Perhaps in that case it should be pulled out of there into the
> wrapper?

Yes, good point.

Thanks, Roger.
Roger Pau Monné Oct. 11, 2016, 2:06 p.m. UTC | #5
It seems like I've forgotten to answer one of your comments in a previous 
email, sorry.

On Fri, Sep 30, 2016 at 09:52:56AM -0600, Jan Beulich wrote:
> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote:
> > +     * VM86 TSS. Note that after this not all e820 regions will be aligned
> > +     * to PAGE_SIZE.
> > +     */
> > +    for ( i = 1; i <= d->arch.nr_e820; i++ )
> > +    {
> > +        entry = &d->arch.e820[d->arch.nr_e820 - i];
> > +        if ( entry->type != E820_RAM ||
> > +             entry->size < PAGE_SIZE + HVM_VM86_TSS_SIZE )
> > +            continue;
> > +
> > +        entry->size -= PAGE_SIZE + HVM_VM86_TSS_SIZE;
> > +        gaddr = entry->addr + entry->size;
> > +        break;
> > +    }
> > +
> > +    if ( gaddr == 0 || gaddr < MB(1) )
> > +    {
> > +        printk("Unable to find memory to stash the identity map and TSS\n");
> > +        return -ENOMEM;
> 
> One function up you panic() on error - please be consistent. Also for
> one of the other patches I think we figured that the TSS isn't really
> required, so please only warn in that case.

The allocation is done together for the ident PT and the TSS, and while 
the TSS is not mandatory, the identity page-table is (or else Dom0 kernel 
won't boot at all on this kind of system). In any case, it's almost 
impossible for this allocation to fail (because Xen is not actually 
allocating memory, just stealing a part of a RAM region that has already 
been populated).

Roger.
Jan Beulich Oct. 12, 2016, 11:51 a.m. UTC | #6
>>> On 11.10.16 at 16:01, <roger.pau@citrix.com> wrote:
> On Tue, Oct 04, 2016 at 05:16:09AM -0600, Jan Beulich wrote:
>> >>> On 04.10.16 at 11:12, <roger.pau@citrix.com> wrote:
>> > On Fri, Sep 30, 2016 at 09:52:56AM -0600, Jan Beulich wrote:
>> >> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote:
>> >> > @@ -336,7 +343,8 @@ static unsigned long __init compute_dom0_nr_pages(
>> >> >          avail -= dom0_paging_pages(d, nr_pages);
>> >> >      }
>> >> >  
>> >> > -    if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
>> >> > +    if ( is_pv_domain(d) &&
>> >> > +         (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
>> >> 
>> >> Perhaps better to simply force parms->p2m_base to UNSET_ADDR
>> >> earlier on?
>> > 
>> > p2m_base is already unconditionally set to UNSET_ADDR for PVHv2 domains, 
>> > hence the added is_pv_domain check in order to make sure that PVHv2 guests 
>> > don't get into that branch, which AFAICT is only relevant to PV guests.
>> 
>> This reads contradictory: If it's set to UNSET_ADDR, why the extra
>> check?
> 
> On PVHv2 p2m_base == UNSET_ADDR already, so the extra check is to make sure 
> PVHv2 guests don't execute the code inside of the if condition, which is 
> for classic PV guests (note that the check is not against != UNSET_ADDR). Or 
> maybe I'm missing something?

No, I have to apologize - I read things the wrong way round.
Thanks for bearing with me.

>> >> > @@ -1657,15 +1679,238 @@ out:
>> >> >      return rc;
>> >> >  }
>> >> >  
>> >> > +/* Populate an HVM memory range using the biggest possible order. */
>> >> > +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start,
>> >> > +                                             uint64_t size)
>> >> > +{
>> >> > +    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
>> >> > +    unsigned int order;
>> >> > +    struct page_info *page;
>> >> > +    int rc;
>> >> > +
>> >> > +    ASSERT(IS_ALIGNED(size, PAGE_SIZE) && IS_ALIGNED(start, PAGE_SIZE));
>> >> > +
>> >> > +    order = MAX_ORDER;
>> >> > +    while ( size != 0 )
>> >> > +    {
>> >> > +        order = min(get_order_from_bytes_floor(size), order);
>> >> > +        page = alloc_domheap_pages(d, order, memflags);
>> >> > +        if ( page == NULL )
>> >> > +        {
>> >> > +            if ( order == 0 && memflags )
>> >> > +            {
>> >> > +                /* Try again without any memflags. */
>> >> > +                memflags = 0;
>> >> > +                order = MAX_ORDER;
>> >> > +                continue;
>> >> > +            }
>> >> > +            if ( order == 0 )
>> >> > +                panic("Unable to allocate memory with order 0!\n");
>> >> > +            order--;
>> >> > +            continue;
>> >> > +        }
>> >> 
>> >> Is it not possible to utilize alloc_chunk() here?
>> > 
>> > Not really, alloc_chunk will do a ceil calculation of the number of needed 
>> > pages, which means we end up allocating bigger chunks than needed and then 
>> > free them. I prefer this approach, since we always request the exact memory 
>> > that's required, so there's no need to free leftover pages.
>> 
>> Hmm, in that case at least some of the shared logic would be nice to
>> get abstracted out.
> 
> TBH, I don't see much benefit in that, alloc_chunk works fine for PV guests 
> because Xen ends up walking the list of returned pages and mapping them one 
> to one. This is IMHO not what should be done for PVH guests, and instead the 
> caller needs to know the actual order of the allocated chunk, so it can pass 
> it to guest_physmap_add_page. ATM it's a very simple function that's clear; 
> if I start mixing up bits of alloc_chunk it's going to get more complex, and 
> I would like to avoid that, so that PVH Dom0 build doesn't end up like 
> current PV Dom0 build.

Hmm - I did say "abstract out", not "re-use". The way memflags
get set and decreasing orders get tried in your code looks suspiciously
similar to what alloc_chunk() does.

>> > In the RFC series we also spoke about placing the MADT in a different 
>> > position than the native one, which would mean that we would end up stealing 
>> > some space from a RAM region in order to place it, so that we wouldn't have 
>> > to do this accounting.
>> 
>> Putting the new MADT at the same address as the old one won't work
>> anyway, again because possibly vCPU-s > pCPU-s.
> 
> Yes, although from my tests doing CPU over-allocation on HVM guests works 
> very badly.

Interesting. I didn't try recently, but I recall having tried a longer
while ago without seeing severe issues.

Jan
Jan Beulich Oct. 12, 2016, 11:58 a.m. UTC | #7
>>> On 11.10.16 at 16:06, <roger.pau@citrix.com> wrote:
> On Fri, Sep 30, 2016 at 09:52:56AM -0600, Jan Beulich wrote:
>> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote:
>> > +     * VM86 TSS. Note that after this not all e820 regions will be aligned
>> > +     * to PAGE_SIZE.
>> > +     */
>> > +    for ( i = 1; i <= d->arch.nr_e820; i++ )
>> > +    {
>> > +        entry = &d->arch.e820[d->arch.nr_e820 - i];
>> > +        if ( entry->type != E820_RAM ||
>> > +             entry->size < PAGE_SIZE + HVM_VM86_TSS_SIZE )
>> > +            continue;
>> > +
>> > +        entry->size -= PAGE_SIZE + HVM_VM86_TSS_SIZE;
>> > +        gaddr = entry->addr + entry->size;
>> > +        break;
>> > +    }
>> > +
>> > +    if ( gaddr == 0 || gaddr < MB(1) )
>> > +    {
>> > +        printk("Unable to find memory to stash the identity map and TSS\n");
>> > +        return -ENOMEM;
>> 
>> One function up you panic() on error - please be consistent. Also for
>> one of the other patches I think we figured that the TSS isn't really
>> required, so please only warn in that case.
> 
> The allocation is done together for the ident PT and the TSS, and while 
> the TSS is not mandatory, the identity page-table is (or else Dom0 kernel 
> won't boot at all on this kind of system). In any case, it's almost 
> impossible for this allocation to fail (because Xen is not actually 
> allocating memory, just stealing a part of a RAM region that has already 
> been populated).

All fine, but I continue to think errors should be dealt with in a
consistent manner (no matter how [un]likely they are), and
warnings better get issued when there's any meaningful impact
due to whatever triggers the warning. Albeit - having looked at
the full patch again - it looks like I was wrong with naming this
"warning": Both here and further down you return an error if any
of the steps failed. The allocation being done in one go is fine to
be an error; failure of the mapping of the non-required TSS, otoh,
shouldn't cause the loading of Dom0 to fail (and I think that is
where I've been unsure the printk() is warranted).
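
To illustrate the distinction (sketch only, not meant as the exact code):

    tss = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)),
                         &mfn, &p2mt, 0, &rc);
    if ( tss == NULL )
        /* Non-fatal: Dom0 simply won't get the VM86 TSS set up. */
        printk(XENLOG_WARNING "Unable to map VM86 TSS area\n");
    else
    {
        memset(tss + (gaddr & ~PAGE_MASK), 0, HVM_VM86_TSS_SIZE);
        unmap_domain_page(tss);
        put_page(mfn_to_page(mfn_x(mfn)));
        d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] = gaddr;
    }
    gaddr += HVM_VM86_TSS_SIZE;
    /* The identity page table below, otoh, would remain a hard error. */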

Jan

Patch

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 982bb5f..c590c58 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -22,6 +22,7 @@ 
 #include <xen/compat.h>
 #include <xen/libelf.h>
 #include <xen/pfn.h>
+#include <xen/guest_access.h>
 #include <asm/regs.h>
 #include <asm/system.h>
 #include <asm/io.h>
@@ -43,6 +44,11 @@  static long __initdata dom0_nrpages;
 static long __initdata dom0_min_nrpages;
 static long __initdata dom0_max_nrpages = LONG_MAX;
 
+/* Size of the VM86 TSS for virtual 8086 mode to use. */
+#define HVM_VM86_TSS_SIZE   128
+
+static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1];
+
 /*
  * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
  * 
@@ -304,7 +310,8 @@  static unsigned long __init compute_dom0_nr_pages(
             avail -= max_pdx >> s;
     }
 
-    need_paging = opt_dom0_shadow || (is_pvh_domain(d) && !iommu_hap_pt_share);
+    need_paging = opt_dom0_shadow || (has_hvm_container_domain(d) &&
+                  (!iommu_hap_pt_share || !paging_mode_hap(d)));
     for ( ; ; need_paging = 0 )
     {
         nr_pages = dom0_nrpages;
@@ -336,7 +343,8 @@  static unsigned long __init compute_dom0_nr_pages(
         avail -= dom0_paging_pages(d, nr_pages);
     }
 
-    if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
+    if ( is_pv_domain(d) &&
+         (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
          ((dom0_min_nrpages <= 0) || (nr_pages > min_pages)) )
     {
         /*
@@ -547,11 +555,12 @@  static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages)
     ASSERT(nr_holes == 0);
 }
 
-static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
+static __init void hvm_setup_e820(struct domain *d, unsigned long nr_pages)
 {
     struct e820entry *entry, *entry_guest;
     unsigned int i;
     unsigned long pages, cur_pages = 0;
+    uint64_t start, end;
 
     /*
      * Craft the e820 memory map for Dom0 based on the hardware e820 map.
@@ -579,8 +588,19 @@  static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
             continue;
         }
 
-        *entry_guest = *entry;
-        pages = PFN_UP(entry_guest->size);
+        /*
+         * Make sure the start and length are aligned to PAGE_SIZE, because
+         * that's the minimum granularity of the 2nd stage translation.
+         */
+        start = ROUNDUP(entry->addr, PAGE_SIZE);
+        end = (entry->addr + entry->size) & PAGE_MASK;
+        if ( start >= end )
+            continue;
+
+        entry_guest->type = E820_RAM;
+        entry_guest->addr = start;
+        entry_guest->size = end - start;
+        pages = PFN_DOWN(entry_guest->size);
         if ( (cur_pages + pages) > nr_pages )
         {
             /* Truncate region */
@@ -591,6 +611,8 @@  static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
         {
             cur_pages += pages;
         }
+        ASSERT(IS_ALIGNED(entry_guest->addr, PAGE_SIZE) &&
+               IS_ALIGNED(entry_guest->size, PAGE_SIZE));
  next:
         d->arch.nr_e820++;
         entry_guest++;
@@ -1641,7 +1663,7 @@  static int __init construct_dom0_pv(
         dom0_update_physmap(d, pfn, mfn, 0);
 
         pvh_map_all_iomem(d, nr_pages);
-        pvh_setup_e820(d, nr_pages);
+        hvm_setup_e820(d, nr_pages);
     }
 
     if ( d->domain_id == hardware_domid )
@@ -1657,15 +1679,238 @@  out:
     return rc;
 }
 
+/* Populate an HVM memory range using the biggest possible order. */
+static void __init hvm_populate_memory_range(struct domain *d, uint64_t start,
+                                             uint64_t size)
+{
+    static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
+    unsigned int order;
+    struct page_info *page;
+    int rc;
+
+    ASSERT(IS_ALIGNED(size, PAGE_SIZE) && IS_ALIGNED(start, PAGE_SIZE));
+
+    order = MAX_ORDER;
+    while ( size != 0 )
+    {
+        order = min(get_order_from_bytes_floor(size), order);
+        page = alloc_domheap_pages(d, order, memflags);
+        if ( page == NULL )
+        {
+            if ( order == 0 && memflags )
+            {
+                /* Try again without any memflags. */
+                memflags = 0;
+                order = MAX_ORDER;
+                continue;
+            }
+            if ( order == 0 )
+                panic("Unable to allocate memory with order 0!\n");
+            order--;
+            continue;
+        }
+
+        hvm_mem_stats[order]++;
+        rc = guest_physmap_add_page(d, _gfn(PFN_DOWN(start)),
+                                    _mfn(page_to_mfn(page)), order);
+        if ( rc != 0 )
+            panic("Failed to populate memory: [%" PRIx64 " - %" PRIx64 "] %d\n",
+                  start, start + (((uint64_t)1) << (order + PAGE_SHIFT)), rc);
+        start += ((uint64_t)1) << (order + PAGE_SHIFT);
+        size -= ((uint64_t)1) << (order + PAGE_SHIFT);
+        if ( (size & 0xffffffff) == 0 )
+            process_pending_softirqs();
+    }
+
+}
+
+static int __init hvm_setup_vmx_unrestricted_guest(struct domain *d)
+{
+    struct e820entry *entry;
+    p2m_type_t p2mt;
+    uint32_t rc, *ident_pt;
+    uint8_t *tss;
+    mfn_t mfn;
+    paddr_t gaddr = 0;
+    int i;
+
+    /*
+     * Stole some space from the last found RAM region. One page will be
+     * used for the identify page tables, and the remaining space for the
+     * VM86 TSS. Note that after this not all e820 regions will be aligned
+     * to PAGE_SIZE.
+     */
+    for ( i = 1; i <= d->arch.nr_e820; i++ )
+    {
+        entry = &d->arch.e820[d->arch.nr_e820 - i];
+        if ( entry->type != E820_RAM ||
+             entry->size < PAGE_SIZE + HVM_VM86_TSS_SIZE )
+            continue;
+
+        entry->size -= PAGE_SIZE + HVM_VM86_TSS_SIZE;
+        gaddr = entry->addr + entry->size;
+        break;
+    }
+
+    if ( gaddr == 0 || gaddr < MB(1) )
+    {
+        printk("Unable to find memory to stash the identity map and TSS\n");
+        return -ENOMEM;
+    }
+
+    /*
+     * Identity-map page table is required for running with CR0.PG=0
+     * when using Intel EPT. Create a 32-bit non-PAE page directory of
+     * superpages.
+     */
+    tss = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)),
+                         &mfn, &p2mt, 0, &rc);
+    if ( tss == NULL )
+    {
+        printk("Unable to map VM86 TSS area\n");
+        return -ENOMEM;
+    }
+    tss += (gaddr & ~PAGE_MASK);
+    memset(tss, 0, HVM_VM86_TSS_SIZE);
+    unmap_domain_page(tss);
+    put_page(mfn_to_page(mfn_x(mfn)));
+    d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] = gaddr;
+    gaddr += HVM_VM86_TSS_SIZE;
+    ASSERT(IS_ALIGNED(gaddr, PAGE_SIZE));
+
+    ident_pt = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)),
+                              &mfn, &p2mt, 0, &rc);
+    if ( ident_pt == NULL )
+    {
+        printk("Unable to map identity page tables\n");
+        return -ENOMEM;
+    }
+    for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
+        ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+                       _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+    unmap_domain_page(ident_pt);
+    put_page(mfn_to_page(mfn_x(mfn)));
+    d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] = gaddr;
+
+    return 0;
+}
+
+static int __init hvm_setup_p2m(struct domain *d)
+{
+    struct vcpu *saved_current, *v = d->vcpu[0];
+    unsigned long nr_pages;
+    int i, rc, preempted;
+
+    printk("** Preparing memory map **\n");
+
+    /*
+     * Subtract one page for the EPT identity page table and two pages
+     * for the MADT replacement.
+     */
+    nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3;
+
+    hvm_setup_e820(d, nr_pages);
+    do {
+        preempted = 0;
+        paging_set_allocation(d, dom0_paging_pages(d, nr_pages),
+                              &preempted);
+        process_pending_softirqs();
+    } while ( preempted );
+
+    /*
+     * Special treatment for memory < 1MB:
+     *  - Copy the data in e820 regions marked as RAM (BDA, EBDA...).
+     *  - Map everything else as 1:1.
+     * NB: all this only makes sense if booted from legacy BIOSes.
+     */
+    rc = modify_mmio_11(d, 0, PFN_DOWN(MB(1)), true);
+    if ( rc )
+    {
+        printk("Failed to map low 1MB 1:1: %d\n", rc);
+        return rc;
+    }
+
+    printk("** Populating memory map **\n");
+    /* Populate memory map. */
+    for ( i = 0; i < d->arch.nr_e820; i++ )
+    {
+        if ( d->arch.e820[i].type != E820_RAM )
+            continue;
+
+        hvm_populate_memory_range(d, d->arch.e820[i].addr,
+                                  d->arch.e820[i].size);
+        if ( d->arch.e820[i].addr < MB(1) )
+        {
+            unsigned long end = min_t(unsigned long,
+                            d->arch.e820[i].addr + d->arch.e820[i].size, MB(1));
+
+            saved_current = current;
+            set_current(v);
+            rc = hvm_copy_to_guest_phys(d->arch.e820[i].addr,
+                                        maddr_to_virt(d->arch.e820[i].addr),
+                                        end - d->arch.e820[i].addr);
+            set_current(saved_current);
+            if ( rc != HVMCOPY_okay )
+            {
+                printk("Unable to copy RAM region %#lx - %#lx\n",
+                       d->arch.e820[i].addr, end);
+                return -EFAULT;
+            }
+        }
+    }
+
+    printk("Memory allocation stats:\n");
+    for ( i = 0; i <= MAX_ORDER; i++ )
+    {
+        if ( hvm_mem_stats[MAX_ORDER - i] != 0 )
+            printk("Order %2u: %pZ\n", MAX_ORDER - i,
+                   _p(((uint64_t)1 << (MAX_ORDER - i + PAGE_SHIFT)) *
+                      hvm_mem_stats[MAX_ORDER - i]));
+    }
+
+    if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
+    {
+        /*
+         * Since Dom0 cannot be migrated, we will only setup the
+         * unrestricted guest helpers if they are needed by the current
+         * hardware we are running on.
+         */
+        rc = hvm_setup_vmx_unrestricted_guest(d);
+        if ( rc )
+            return rc;
+    }
+
+    printk("Dom0 memory map:\n");
+    print_e820_memory_map(d->arch.e820, d->arch.nr_e820);
+
+    return 0;
+}
+
 static int __init construct_dom0_hvm(struct domain *d, const module_t *image,
                                      unsigned long image_headroom,
                                      module_t *initrd,
                                      void *(*bootstrap_map)(const module_t *),
                                      char *cmdline)
 {
+    int rc;
 
     printk("** Building a PVH Dom0 **\n");
 
+    /* Sanity! */
+    BUG_ON(d->domain_id != 0);
+    BUG_ON(d->vcpu[0] == NULL);
+
+    process_pending_softirqs();
+
+    iommu_hwdom_init(d);
+
+    rc = hvm_setup_p2m(d);
+    if ( rc )
+    {
+        printk("Failed to setup Dom0 physical memory map\n");
+        return rc;
+    }
+
     return 0;
 }