Message ID: 1474991845-27962-16-git-send-email-roger.pau@citrix.com
>>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote: > @@ -43,6 +44,11 @@ static long __initdata dom0_nrpages; > static long __initdata dom0_min_nrpages; > static long __initdata dom0_max_nrpages = LONG_MAX; > > +/* Size of the VM86 TSS for virtual 8086 mode to use. */ > +#define HVM_VM86_TSS_SIZE 128 > + > +static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1]; This is for your debugging only I suppose? > @@ -336,7 +343,8 @@ static unsigned long __init compute_dom0_nr_pages( > avail -= dom0_paging_pages(d, nr_pages); > } > > - if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && > + if ( is_pv_domain(d) && > + (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && Perhaps better to simply force parms->p2m_base to UNSET_ADDR earlier on? > @@ -579,8 +588,19 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages) > continue; > } > > - *entry_guest = *entry; > - pages = PFN_UP(entry_guest->size); > + /* > + * Make sure the start and length are aligned to PAGE_SIZE, because > + * that's the minimum granularity of the 2nd stage translation. > + */ > + start = ROUNDUP(entry->addr, PAGE_SIZE); > + end = (entry->addr + entry->size) & PAGE_MASK; > + if ( start >= end ) > + continue; > + > + entry_guest->type = E820_RAM; > + entry_guest->addr = start; > + entry_guest->size = end - start; > + pages = PFN_DOWN(entry_guest->size); > if ( (cur_pages + pages) > nr_pages ) > { > /* Truncate region */ > @@ -591,6 +611,8 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages) > { > cur_pages += pages; > } > + ASSERT(IS_ALIGNED(entry_guest->addr, PAGE_SIZE) && > + IS_ALIGNED(entry_guest->size, PAGE_SIZE)); What does this guard against? Your addition arranges for things to be page aligned, and the only adjustment done until we get here is one that obviously also doesn't violate that requirement. I'm all for assertions when they check state which is not obviously right, but here I don't see the need. > @@ -1657,15 +1679,238 @@ out: > return rc; > } > > +/* Populate an HVM memory range using the biggest possible order. */ > +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start, > + uint64_t size) > +{ > + static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node; > + unsigned int order; > + struct page_info *page; > + int rc; > + > + ASSERT(IS_ALIGNED(size, PAGE_SIZE) && IS_ALIGNED(start, PAGE_SIZE)); > + > + order = MAX_ORDER; > + while ( size != 0 ) > + { > + order = min(get_order_from_bytes_floor(size), order); > + page = alloc_domheap_pages(d, order, memflags); > + if ( page == NULL ) > + { > + if ( order == 0 && memflags ) > + { > + /* Try again without any memflags. */ > + memflags = 0; > + order = MAX_ORDER; > + continue; > + } > + if ( order == 0 ) > + panic("Unable to allocate memory with order 0!\n"); > + order--; > + continue; > + } Is it not possible to utilize alloc_chunk() here? > + hvm_mem_stats[order]++; > + rc = guest_physmap_add_page(d, _gfn(PFN_DOWN(start)), > + _mfn(page_to_mfn(page)), order); > + if ( rc != 0 ) > + panic("Failed to populate memory: [%" PRIx64 " - %" PRIx64 "] %d\n", [<start>,<end>) please. > + start, start + (((uint64_t)1) << (order + PAGE_SHIFT)), rc); > + start += ((uint64_t)1) << (order + PAGE_SHIFT); > + size -= ((uint64_t)1) << (order + PAGE_SHIFT); Please prefer 1ULL over (uint64_t)1. > + if ( (size & 0xffffffff) == 0 ) > + process_pending_softirqs(); That's 4Gb at a time - isn't that a little too much? > + } > + > +} Stray blank line. 
> +static int __init hvm_setup_vmx_unrestricted_guest(struct domain *d) > +{ > + struct e820entry *entry; > + p2m_type_t p2mt; > + uint32_t rc, *ident_pt; > + uint8_t *tss; > + mfn_t mfn; > + paddr_t gaddr = 0; > + int i; unsigned int > + /* > + * Stole some space from the last found RAM region. One page will be Steal > + * used for the identify page tables, and the remaining space for the identity > + * VM86 TSS. Note that after this not all e820 regions will be aligned > + * to PAGE_SIZE. > + */ > + for ( i = 1; i <= d->arch.nr_e820; i++ ) > + { > + entry = &d->arch.e820[d->arch.nr_e820 - i]; > + if ( entry->type != E820_RAM || > + entry->size < PAGE_SIZE + HVM_VM86_TSS_SIZE ) > + continue; > + > + entry->size -= PAGE_SIZE + HVM_VM86_TSS_SIZE; > + gaddr = entry->addr + entry->size; > + break; > + } > + > + if ( gaddr == 0 || gaddr < MB(1) ) > + { > + printk("Unable to find memory to stash the identity map and TSS\n"); > + return -ENOMEM; One function up you panic() on error - please be consistent. Also for one of the other patches I think we figured that the TSS isn't really required, so please only warn in that case. > + } > + > + /* > + * Identity-map page table is required for running with CR0.PG=0 > + * when using Intel EPT. Create a 32-bit non-PAE page directory of > + * superpages. > + */ > + tss = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)), > + &mfn, &p2mt, 0, &rc); Comment and operation don't really fit together. > +static int __init hvm_setup_p2m(struct domain *d) > +{ > + struct vcpu *saved_current, *v = d->vcpu[0]; > + unsigned long nr_pages; > + int i, rc, preempted; > + > + printk("** Preparing memory map **\n"); Debugging leftover again? > + /* > + * Subtract one page for the EPT identity page table and two pages > + * for the MADT replacement. > + */ > + nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3; How do you know the MADT replacement requires two pages? Isn't that CPU-count dependent? And doesn't the partial page used for the TSS also need accounting for here? > + hvm_setup_e820(d, nr_pages); > + do { > + preempted = 0; > + paging_set_allocation(d, dom0_paging_pages(d, nr_pages), > + &preempted); > + process_pending_softirqs(); > + } while ( preempted ); > + > + /* > + * Special treatment for memory < 1MB: > + * - Copy the data in e820 regions marked as RAM (BDA, EBDA...). > + * - Map everything else as 1:1. > + * NB: all this only makes sense if booted from legacy BIOSes. > + */ > + rc = modify_mmio_11(d, 0, PFN_DOWN(MB(1)), true); > + if ( rc ) > + { > + printk("Failed to map low 1MB 1:1: %d\n", rc); > + return rc; > + } > + > + printk("** Populating memory map **\n"); > + /* Populate memory map. 
*/ > + for ( i = 0; i < d->arch.nr_e820; i++ ) > + { > + if ( d->arch.e820[i].type != E820_RAM ) > + continue; > + > + hvm_populate_memory_range(d, d->arch.e820[i].addr, > + d->arch.e820[i].size); > + if ( d->arch.e820[i].addr < MB(1) ) > + { > + unsigned long end = min_t(unsigned long, > + d->arch.e820[i].addr + d->arch.e820[i].size, MB(1)); > + > + saved_current = current; > + set_current(v); > + rc = hvm_copy_to_guest_phys(d->arch.e820[i].addr, > + maddr_to_virt(d->arch.e820[i].addr), > + end - d->arch.e820[i].addr); > + set_current(saved_current); > + if ( rc != HVMCOPY_okay ) > + { > + printk("Unable to copy RAM region %#lx - %#lx\n", > + d->arch.e820[i].addr, end); > + return -EFAULT; > + } > + } > + } > + > + printk("Memory allocation stats:\n"); > + for ( i = 0; i <= MAX_ORDER; i++ ) > + { > + if ( hvm_mem_stats[MAX_ORDER - i] != 0 ) > + printk("Order %2u: %pZ\n", MAX_ORDER - i, > + _p(((uint64_t)1 << (MAX_ORDER - i + PAGE_SHIFT)) * > + hvm_mem_stats[MAX_ORDER - i])); > + } > + > + if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) ) > + { > + /* > + * Since Dom0 cannot be migrated, we will only setup the > + * unrestricted guest helpers if they are needed by the current > + * hardware we are running on. > + */ > + rc = hvm_setup_vmx_unrestricted_guest(d); Calling a function of this name inside an if() checking for !vmx_unrestricted_guest() is, well, odd. > static int __init construct_dom0_hvm(struct domain *d, const module_t *image, > unsigned long image_headroom, > module_t *initrd, > void *(*bootstrap_map)(const module_t *), > char *cmdline) > { > + int rc; > > printk("** Building a PVH Dom0 **\n"); > > + /* Sanity! */ > + BUG_ON(d->domain_id != 0); > + BUG_ON(d->vcpu[0] == NULL); May I suggest BUG_ON(d->domain_id); BUG_ON(!d->vcpu[0]); in cases like this? > + process_pending_softirqs(); Why, outside of any loop? Jan
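[For reference: Jan's two style remarks on hvm_populate_memory_range() above (report the range as a half-open interval and prefer 1ULL over (uint64_t)1) amount to a small change; a sketch of the adjusted tail of the allocation loop, using the names from the posted patch:]

    rc = guest_physmap_add_page(d, _gfn(PFN_DOWN(start)),
                                _mfn(page_to_mfn(page)), order);
    if ( rc != 0 )
        /* Report the failed range as a half-open interval [start, end). */
        panic("Failed to populate memory: [%" PRIx64 ", %" PRIx64 ") %d\n",
              start, start + (1ULL << (order + PAGE_SHIFT)), rc);
    start += 1ULL << (order + PAGE_SHIFT);
    size -= 1ULL << (order + PAGE_SHIFT);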
On Fri, Sep 30, 2016 at 09:52:56AM -0600, Jan Beulich wrote: > >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote: > > @@ -43,6 +44,11 @@ static long __initdata dom0_nrpages; > > static long __initdata dom0_min_nrpages; > > static long __initdata dom0_max_nrpages = LONG_MAX; > > > > +/* Size of the VM86 TSS for virtual 8086 mode to use. */ > > +#define HVM_VM86_TSS_SIZE 128 > > + > > +static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1]; > > This is for your debugging only I suppose? Probably, I wasn't sure if it was relevant so I left it here. Would it make sense to only print this for debug builds of the hypervisor? Or better to just remove it? > > @@ -336,7 +343,8 @@ static unsigned long __init compute_dom0_nr_pages( > > avail -= dom0_paging_pages(d, nr_pages); > > } > > > > - if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && > > + if ( is_pv_domain(d) && > > + (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && > > Perhaps better to simply force parms->p2m_base to UNSET_ADDR > earlier on? p2m_base is already unconditionally set to UNSET_ADDR for PVHv2 domains, hence the added is_pv_domain check in order to make sure that PVHv2 guests don't get into that branch, which AFAICT is only relevant to PV guests. > > @@ -579,8 +588,19 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages) > > continue; > > } > > > > - *entry_guest = *entry; > > - pages = PFN_UP(entry_guest->size); > > + /* > > + * Make sure the start and length are aligned to PAGE_SIZE, because > > + * that's the minimum granularity of the 2nd stage translation. > > + */ > > + start = ROUNDUP(entry->addr, PAGE_SIZE); > > + end = (entry->addr + entry->size) & PAGE_MASK; > > + if ( start >= end ) > > + continue; > > + > > + entry_guest->type = E820_RAM; > > + entry_guest->addr = start; > > + entry_guest->size = end - start; > > + pages = PFN_DOWN(entry_guest->size); > > if ( (cur_pages + pages) > nr_pages ) > > { > > /* Truncate region */ > > @@ -591,6 +611,8 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages) > > { > > cur_pages += pages; > > } > > + ASSERT(IS_ALIGNED(entry_guest->addr, PAGE_SIZE) && > > + IS_ALIGNED(entry_guest->size, PAGE_SIZE)); > > What does this guard against? Your addition arranges for things to > be page aligned, and the only adjustment done until we get here is > one that obviously also doesn't violate that requirement. I'm all for > assertions when they check state which is not obviously right, but > here I don't see the need. Right, I'm going to remove it. I've added more seat belts than needed when testing this and forgot to remove them. > > @@ -1657,15 +1679,238 @@ out: > > return rc; > > } > > > > +/* Populate an HVM memory range using the biggest possible order. */ > > +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start, > > + uint64_t size) > > +{ > > + static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node; > > + unsigned int order; > > + struct page_info *page; > > + int rc; > > + > > + ASSERT(IS_ALIGNED(size, PAGE_SIZE) && IS_ALIGNED(start, PAGE_SIZE)); > > + > > + order = MAX_ORDER; > > + while ( size != 0 ) > > + { > > + order = min(get_order_from_bytes_floor(size), order); > > + page = alloc_domheap_pages(d, order, memflags); > > + if ( page == NULL ) > > + { > > + if ( order == 0 && memflags ) > > + { > > + /* Try again without any memflags. 
*/ > > + memflags = 0; > > + order = MAX_ORDER; > > + continue; > > + } > > + if ( order == 0 ) > > + panic("Unable to allocate memory with order 0!\n"); > > + order--; > > + continue; > > + } > > Is it not possible to utilize alloc_chunk() here? Not really, alloc_chunk will do a ceil calculation of the number of needed pages, which means we end up allocating bigger chunks than needed and then free them. I prefer this approach, since we always request the exact memory that's required, so there's no need to free leftover pages. > > + hvm_mem_stats[order]++; > > + rc = guest_physmap_add_page(d, _gfn(PFN_DOWN(start)), > > + _mfn(page_to_mfn(page)), order); > > + if ( rc != 0 ) > > + panic("Failed to populate memory: [%" PRIx64 " - %" PRIx64 "] %d\n", > > [<start>,<end>) please. > > > + start, start + (((uint64_t)1) << (order + PAGE_SHIFT)), rc); > > + start += ((uint64_t)1) << (order + PAGE_SHIFT); > > + size -= ((uint64_t)1) << (order + PAGE_SHIFT); > > Please prefer 1ULL over (uint64_t)1. > > > + if ( (size & 0xffffffff) == 0 ) > > + process_pending_softirqs(); > > That's 4Gb at a time - isn't that a little too much? Hm, it's the same that's used in pvh_add_mem_mapping AFAICT. I could reduce it to 0xfffffff, but I'm also wondering if it makes sense to just call it on each iteration, since we are possibly mapping regions with big orders here. > > + } > > + > > +} > > Stray blank line. > > > +static int __init hvm_setup_vmx_unrestricted_guest(struct domain *d) > > +{ > > + struct e820entry *entry; > > + p2m_type_t p2mt; > > + uint32_t rc, *ident_pt; > > + uint8_t *tss; > > + mfn_t mfn; > > + paddr_t gaddr = 0; > > + int i; > > unsigned int > > > + /* > > + * Stole some space from the last found RAM region. One page will be > > Steal > > > + * used for the identify page tables, and the remaining space for the > > identity > > > + * VM86 TSS. Note that after this not all e820 regions will be aligned > > + * to PAGE_SIZE. > > + */ > > + for ( i = 1; i <= d->arch.nr_e820; i++ ) > > + { > > + entry = &d->arch.e820[d->arch.nr_e820 - i]; > > + if ( entry->type != E820_RAM || > > + entry->size < PAGE_SIZE + HVM_VM86_TSS_SIZE ) > > + continue; > > + > > + entry->size -= PAGE_SIZE + HVM_VM86_TSS_SIZE; > > + gaddr = entry->addr + entry->size; > > + break; > > + } > > + > > + if ( gaddr == 0 || gaddr < MB(1) ) > > + { > > + printk("Unable to find memory to stash the identity map and TSS\n"); > > + return -ENOMEM; > > One function up you panic() on error - please be consistent. Also for > one of the other patches I think we figured that the TSS isn't really > required, so please only warn in that case. > > > + } > > + > > + /* > > + * Identity-map page table is required for running with CR0.PG=0 > > + * when using Intel EPT. Create a 32-bit non-PAE page directory of > > + * superpages. > > + */ > > + tss = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)), > > + &mfn, &p2mt, 0, &rc); > > Comment and operation don't really fit together. > > > +static int __init hvm_setup_p2m(struct domain *d) > > +{ > > + struct vcpu *saved_current, *v = d->vcpu[0]; > > + unsigned long nr_pages; > > + int i, rc, preempted; > > + > > + printk("** Preparing memory map **\n"); > > Debugging leftover again? Should this be merged with the next label, so that it reads as "Preparing and populating the memory map"? > > + /* > > + * Subtract one page for the EPT identity page table and two pages > > + * for the MADT replacement. 
> > + */ > > + nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3; > > How do you know the MADT replacement requires two pages? Isn't > that CPU-count dependent? And doesn't the partial page used for > the TSS also need accounting for here? Yes, it's CPU-count dependent. This is just an approximation, since we only support up to 256 CPUs on HVM guests, and each Processor Local APIC entry is 8 bytes, this means that the CPU-related data is going to use up to 2048 bytes of data, which still leaves plenty of space for the IO APIC and the Interrupt Source Override entries. We requests two pages in case the original MADT crosses a page boundary. FWIW, I could also fetch the original MADT size earlier and use that as the upper bound here for memory reservation. In the RFC series we also spoke about placing the MADT in a different position that the native one, which would mean that we would end up stealing some space from a RAM region in order to place it, so that we wouldn't have to do this accounting. In fact this is slightly wrong, and it doesn't need to account for the identity page table or the VM86 TSS, since we end up stealing this space from populated RAM regions. At the moment we only need to account for the 2 pages that could be used by the MADT. > > + hvm_setup_e820(d, nr_pages); > > + do { > > + preempted = 0; > > + paging_set_allocation(d, dom0_paging_pages(d, nr_pages), > > + &preempted); > > + process_pending_softirqs(); > > + } while ( preempted ); > > + > > + /* > > + * Special treatment for memory < 1MB: > > + * - Copy the data in e820 regions marked as RAM (BDA, EBDA...). > > + * - Map everything else as 1:1. > > + * NB: all this only makes sense if booted from legacy BIOSes. > > + */ > > + rc = modify_mmio_11(d, 0, PFN_DOWN(MB(1)), true); > > + if ( rc ) > > + { > > + printk("Failed to map low 1MB 1:1: %d\n", rc); > > + return rc; > > + } > > + > > + printk("** Populating memory map **\n"); > > + /* Populate memory map. */ > > + for ( i = 0; i < d->arch.nr_e820; i++ ) > > + { > > + if ( d->arch.e820[i].type != E820_RAM ) > > + continue; > > + > > + hvm_populate_memory_range(d, d->arch.e820[i].addr, > > + d->arch.e820[i].size); > > + if ( d->arch.e820[i].addr < MB(1) ) > > + { > > + unsigned long end = min_t(unsigned long, > > + d->arch.e820[i].addr + d->arch.e820[i].size, MB(1)); > > + > > + saved_current = current; > > + set_current(v); > > + rc = hvm_copy_to_guest_phys(d->arch.e820[i].addr, > > + maddr_to_virt(d->arch.e820[i].addr), > > + end - d->arch.e820[i].addr); > > + set_current(saved_current); > > + if ( rc != HVMCOPY_okay ) > > + { > > + printk("Unable to copy RAM region %#lx - %#lx\n", > > + d->arch.e820[i].addr, end); > > + return -EFAULT; > > + } > > + } > > + } > > + > > + printk("Memory allocation stats:\n"); > > + for ( i = 0; i <= MAX_ORDER; i++ ) > > + { > > + if ( hvm_mem_stats[MAX_ORDER - i] != 0 ) > > + printk("Order %2u: %pZ\n", MAX_ORDER - i, > > + _p(((uint64_t)1 << (MAX_ORDER - i + PAGE_SHIFT)) * > > + hvm_mem_stats[MAX_ORDER - i])); > > + } > > + > > + if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) ) > > + { > > + /* > > + * Since Dom0 cannot be migrated, we will only setup the > > + * unrestricted guest helpers if they are needed by the current > > + * hardware we are running on. > > + */ > > + rc = hvm_setup_vmx_unrestricted_guest(d); > > Calling a function of this name inside an if() checking for > !vmx_unrestricted_guest() is, well, odd. Yes, that's right. What about calling it hvm_setup_vmx_realmode_helpers? 
> > static int __init construct_dom0_hvm(struct domain *d, const module_t *image, > > unsigned long image_headroom, > > module_t *initrd, > > void *(*bootstrap_map)(const module_t *), > > char *cmdline) > > { > > + int rc; > > > > printk("** Building a PVH Dom0 **\n"); > > > > + /* Sanity! */ > > + BUG_ON(d->domain_id != 0); > > + BUG_ON(d->vcpu[0] == NULL); > > May I suggest > > BUG_ON(d->domain_id); > BUG_ON(!d->vcpu[0]); > > in cases like this? Yes, I have the tendency to not use '!' or perform direct checks unless it's a boolean type. > > + process_pending_softirqs(); > > Why, outside of any loop? It's the same that's done in construct_dom0_pv, so I though that it was a good idea to drain any pending softirqs before starting domain build. Roger.
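[For reference: Roger's two-page estimate for the MADT replacement above rests on simple arithmetic; an illustrative sketch of that bound, where the structure sizes come from the ACPI spec, the 16-entry override limit is an assumption, and later messages in this thread drop the accounting altogether:]

    /* Illustrative upper bound only; not part of the posted patch. */
    #define MADT_FIXED_SIZE      44   /* ACPI header + local APIC address + flags */
    #define LAPIC_ENTRY_SIZE      8   /* Processor Local APIC structure */
    #define IOAPIC_ENTRY_SIZE    12   /* IO APIC structure */
    #define INT_OVERRIDE_SIZE    10   /* Interrupt Source Override structure */
    #define MAX_HVM_VCPUS       256   /* HVM vCPU limit cited in the thread */

    unsigned int madt_bound = MADT_FIXED_SIZE +
                              MAX_HVM_VCPUS * LAPIC_ENTRY_SIZE +  /* 2048 bytes */
                              IOAPIC_ENTRY_SIZE +
                              16 * INT_OVERRIDE_SIZE;             /* assumed limit */
    /*
     * Roughly 2.3KiB in total: it fits in a single page, but two pages are
     * reserved in case the copied table straddles a page boundary.
     */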
>>> On 04.10.16 at 11:12, <roger.pau@citrix.com> wrote: > On Fri, Sep 30, 2016 at 09:52:56AM -0600, Jan Beulich wrote: >> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote: >> > @@ -43,6 +44,11 @@ static long __initdata dom0_nrpages; >> > static long __initdata dom0_min_nrpages; >> > static long __initdata dom0_max_nrpages = LONG_MAX; >> > >> > +/* Size of the VM86 TSS for virtual 8086 mode to use. */ >> > +#define HVM_VM86_TSS_SIZE 128 >> > + >> > +static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1]; >> >> This is for your debugging only I suppose? > > Probably, I wasn't sure if it was relevant so I left it here. Would it make > sense to only print this for debug builds of the hypervisor? Or better to > just remove it? Statistics in debug builds often aren't really meaningful, so my order of preference would be to remove it, then to keep it for all cases but default it to be silent in non-debug builds (with a command line option to enable). >> > @@ -336,7 +343,8 @@ static unsigned long __init compute_dom0_nr_pages( >> > avail -= dom0_paging_pages(d, nr_pages); >> > } >> > >> > - if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && >> > + if ( is_pv_domain(d) && >> > + (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && >> >> Perhaps better to simply force parms->p2m_base to UNSET_ADDR >> earlier on? > > p2m_base is already unconditionally set to UNSET_ADDR for PVHv2 domains, > hence the added is_pv_domain check in order to make sure that PVHv2 guests > don't get into that branch, which AFAICT is only relevant to PV guests. This reads contradictory: If it's set to UNSET_ADDR, why the extra check? >> > @@ -1657,15 +1679,238 @@ out: >> > return rc; >> > } >> > >> > +/* Populate an HVM memory range using the biggest possible order. */ >> > +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start, >> > + uint64_t size) >> > +{ >> > + static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node; >> > + unsigned int order; >> > + struct page_info *page; >> > + int rc; >> > + >> > + ASSERT(IS_ALIGNED(size, PAGE_SIZE) && IS_ALIGNED(start, PAGE_SIZE)); >> > + >> > + order = MAX_ORDER; >> > + while ( size != 0 ) >> > + { >> > + order = min(get_order_from_bytes_floor(size), order); >> > + page = alloc_domheap_pages(d, order, memflags); >> > + if ( page == NULL ) >> > + { >> > + if ( order == 0 && memflags ) >> > + { >> > + /* Try again without any memflags. */ >> > + memflags = 0; >> > + order = MAX_ORDER; >> > + continue; >> > + } >> > + if ( order == 0 ) >> > + panic("Unable to allocate memory with order 0!\n"); >> > + order--; >> > + continue; >> > + } >> >> Is it not possible to utilize alloc_chunk() here? > > Not really, alloc_chunk will do a ceil calculation of the number of needed > pages, which means we end up allocating bigger chunks than needed and then > free them. I prefer this approach, since we always request the exact memory > that's required, so there's no need to free leftover pages. Hmm, in that case at least some of the shared logic would be nice to get abstracted out. >> > + if ( (size & 0xffffffff) == 0 ) >> > + process_pending_softirqs(); >> >> That's 4Gb at a time - isn't that a little too much? > > Hm, it's the same that's used in pvh_add_mem_mapping AFAICT. I could reduce > it to 0xfffffff, but I'm also wondering if it makes sense to just call it on > each iteration, since we are possibly mapping regions with big orders here. 
Iteration could is all that matters here really; the size of the mapping wouldn't normally (as long as its one of the hardware supported sizes). Doing the check on every iteration may be a little much (you may want to check whether there's noticeable extra overhead), but doing the check on like every 64 iterations may limit overhead enough to be acceptable without making this more sophisticated. >> > +static int __init hvm_setup_p2m(struct domain *d) >> > +{ >> > + struct vcpu *saved_current, *v = d->vcpu[0]; >> > + unsigned long nr_pages; >> > + int i, rc, preempted; >> > + >> > + printk("** Preparing memory map **\n"); >> >> Debugging leftover again? > > Should this be merged with the next label, so that it reads as "Preparing > and populating the memory map"? No, I'd rather see the other(s) gone too. >> > + /* >> > + * Subtract one page for the EPT identity page table and two pages >> > + * for the MADT replacement. >> > + */ >> > + nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3; >> >> How do you know the MADT replacement requires two pages? Isn't >> that CPU-count dependent? And doesn't the partial page used for >> the TSS also need accounting for here? > > Yes, it's CPU-count dependent. This is just an approximation, since we only > support up to 256 CPUs on HVM guests, and each Processor Local APIC entry is > > 8 bytes, this means that the CPU-related data is going to use up to 2048 > bytes of data, which still leaves plenty of space for the IO APIC and the > Interrupt Source Override entries. We requests two pages in case the > original MADT crosses a page boundary. FWIW, I could also fetch the original > MADT size earlier and use that as the upper bound here for memory > reservation. That wouldn't help in case someone wants more vCPU-s than there are pCPU-s. And baking in another assumption of there being <= 128 vCPU-s when there's already work being done to eliminate that limit is likely not too good an idea. > In the RFC series we also spoke about placing the MADT in a different > position that the native one, which would mean that we would end up stealing > some space from a RAM region in order to place it, so that we wouldn't have > to do this accounting. Putting the new MADT at the same address as the old one won't work anyway, again because possibly vCPU-s > pCPU-s. >> > + if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) ) >> > + { >> > + /* >> > + * Since Dom0 cannot be migrated, we will only setup the >> > + * unrestricted guest helpers if they are needed by the current >> > + * hardware we are running on. >> > + */ >> > + rc = hvm_setup_vmx_unrestricted_guest(d); >> >> Calling a function of this name inside an if() checking for >> !vmx_unrestricted_guest() is, well, odd. > > Yes, that's right. What about calling it hvm_setup_vmx_realmode_helpers? Sounds quite a bit more reasonable. >> > + process_pending_softirqs(); >> >> Why, outside of any loop? > > It's the same that's done in construct_dom0_pv, so I though that it was a > good idea to drain any pending softirqs before starting domain build. Perhaps in that case it should be pulled out of there into the wrapper? Jan
On Tue, Oct 04, 2016 at 05:16:09AM -0600, Jan Beulich wrote: > >>> On 04.10.16 at 11:12, <roger.pau@citrix.com> wrote: > > On Fri, Sep 30, 2016 at 09:52:56AM -0600, Jan Beulich wrote: > >> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote: > >> > @@ -336,7 +343,8 @@ static unsigned long __init compute_dom0_nr_pages( > >> > avail -= dom0_paging_pages(d, nr_pages); > >> > } > >> > > >> > - if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && > >> > + if ( is_pv_domain(d) && > >> > + (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && > >> > >> Perhaps better to simply force parms->p2m_base to UNSET_ADDR > >> earlier on? > > > > p2m_base is already unconditionally set to UNSET_ADDR for PVHv2 domains, > > hence the added is_pv_domain check in order to make sure that PVHv2 guests > > don't get into that branch, which AFAICT is only relevant to PV guests. > > This reads contradictory: If it's set to UNSET_ADDR, why the extra > check? On PVHv2 p2m_base == UNSET_ADDR already, so the extra check is to make sure PVHv2 guests don't execute the code inside of the if condition, which is for classic PV guests (note that the check is not against != UNSET_ADDR). Or maybe I'm missing something? > >> > @@ -1657,15 +1679,238 @@ out: > >> > return rc; > >> > } > >> > > >> > +/* Populate an HVM memory range using the biggest possible order. */ > >> > +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start, > >> > + uint64_t size) > >> > +{ > >> > + static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node; > >> > + unsigned int order; > >> > + struct page_info *page; > >> > + int rc; > >> > + > >> > + ASSERT(IS_ALIGNED(size, PAGE_SIZE) && IS_ALIGNED(start, PAGE_SIZE)); > >> > + > >> > + order = MAX_ORDER; > >> > + while ( size != 0 ) > >> > + { > >> > + order = min(get_order_from_bytes_floor(size), order); > >> > + page = alloc_domheap_pages(d, order, memflags); > >> > + if ( page == NULL ) > >> > + { > >> > + if ( order == 0 && memflags ) > >> > + { > >> > + /* Try again without any memflags. */ > >> > + memflags = 0; > >> > + order = MAX_ORDER; > >> > + continue; > >> > + } > >> > + if ( order == 0 ) > >> > + panic("Unable to allocate memory with order 0!\n"); > >> > + order--; > >> > + continue; > >> > + } > >> > >> Is it not possible to utilize alloc_chunk() here? > > > > Not really, alloc_chunk will do a ceil calculation of the number of needed > > pages, which means we end up allocating bigger chunks than needed and then > > free them. I prefer this approach, since we always request the exact memory > > that's required, so there's no need to free leftover pages. > > Hmm, in that case at least some of the shared logic would be nice to > get abstracted out. TBH, I don't see much benefit in that, alloc_chunk works fine for PV guests because Xen ends up walking the list of returned pages and mapping them one to one. This is IMHO not what should be done for PVH guests, and instead the caller needs to know the actual order of the allocated chunk, so it can pass it to guest_physmap_add_page. ATM it's a very simple function that's clear, if I start mixing up bits of alloc_chunk it's going to get more complex, and I would like to avoid that, so that PVH Dom0 build doesn't end up like current PV Dom0 build. > >> > + if ( (size & 0xffffffff) == 0 ) > >> > + process_pending_softirqs(); > >> > >> That's 4Gb at a time - isn't that a little too much? > > > > Hm, it's the same that's used in pvh_add_mem_mapping AFAICT. 
I could reduce > > it to 0xfffffff, but I'm also wondering if it makes sense to just call it on > > each iteration, since we are possibly mapping regions with big orders here. > > Iteration could is all that matters here really; the size of the mapping > wouldn't normally (as long as its one of the hardware supported sizes). > Doing the check on every iteration may be a little much (you may want > to check whether there's noticeable extra overhead), but doing the > check on like every 64 iterations may limit overhead enough to be > acceptable without making this more sophisticated. Ack, I've done it using a define, so we can always change that 64 to something else if the need arises. > >> > +static int __init hvm_setup_p2m(struct domain *d) > >> > +{ > >> > + struct vcpu *saved_current, *v = d->vcpu[0]; > >> > + unsigned long nr_pages; > >> > + int i, rc, preempted; > >> > + > >> > + printk("** Preparing memory map **\n"); > >> > >> Debugging leftover again? > > > > Should this be merged with the next label, so that it reads as "Preparing > > and populating the memory map"? > > No, I'd rather see the other(s) gone too. Done, I've just left the initial "Building a PVH Dom0" message. > >> > + /* > >> > + * Subtract one page for the EPT identity page table and two pages > >> > + * for the MADT replacement. > >> > + */ > >> > + nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3; > >> > >> How do you know the MADT replacement requires two pages? Isn't > >> that CPU-count dependent? And doesn't the partial page used for > >> the TSS also need accounting for here? > > > > Yes, it's CPU-count dependent. This is just an approximation, since we only > > support up to 256 CPUs on HVM guests, and each Processor Local APIC entry is > > > > 8 bytes, this means that the CPU-related data is going to use up to 2048 > > bytes of data, which still leaves plenty of space for the IO APIC and the > > Interrupt Source Override entries. We requests two pages in case the > > original MADT crosses a page boundary. FWIW, I could also fetch the original > > MADT size earlier and use that as the upper bound here for memory > > reservation. > > That wouldn't help in case someone wants more vCPU-s than there > are pCPU-s. And baking in another assumption of there being <= > 128 vCPU-s when there's already work being done to eliminate that > limit is likely not too good an idea. Right, I've now removed all this, since for the MADT we are going to steal RAM pages. > > In the RFC series we also spoke about placing the MADT in a different > > position that the native one, which would mean that we would end up stealing > > some space from a RAM region in order to place it, so that we wouldn't have > > to do this accounting. > > Putting the new MADT at the same address as the old one won't work > anyway, again because possibly vCPU-s > pCPU-s. Yes, although from my tests doing CPU over-allocation on HVM guests works very badly. > >> > + process_pending_softirqs(); > >> > >> Why, outside of any loop? > > > > It's the same that's done in construct_dom0_pv, so I though that it was a > > good idea to drain any pending softirqs before starting domain build. > > Perhaps in that case it should be pulled out of there into the > wrapper? Yes, good point. Thanks, Roger.
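[For reference: Roger says above that the per-iteration softirq check is now controlled by a define; the name and value below are illustrative, not taken from the thread:]

    /* Process pending softirqs every so many allocation iterations. */
    #define MEMPOP_SOFTIRQ_ITER 64

        unsigned int iter = 0;

        while ( size != 0 )
        {
            /* ... allocate and map one chunk, as in the posted patch ... */
            if ( !(++iter % MEMPOP_SOFTIRQ_ITER) )
                process_pending_softirqs();
        }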
It seems like I've forgotten to answer one of your comments in a previous email, sorry. On Fri, Sep 30, 2016 at 09:52:56AM -0600, Jan Beulich wrote: > >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote: > > + * VM86 TSS. Note that after this not all e820 regions will be aligned > > + * to PAGE_SIZE. > > + */ > > + for ( i = 1; i <= d->arch.nr_e820; i++ ) > > + { > > + entry = &d->arch.e820[d->arch.nr_e820 - i]; > > + if ( entry->type != E820_RAM || > > + entry->size < PAGE_SIZE + HVM_VM86_TSS_SIZE ) > > + continue; > > + > > + entry->size -= PAGE_SIZE + HVM_VM86_TSS_SIZE; > > + gaddr = entry->addr + entry->size; > > + break; > > + } > > + > > + if ( gaddr == 0 || gaddr < MB(1) ) > > + { > > + printk("Unable to find memory to stash the identity map and TSS\n"); > > + return -ENOMEM; > > One function up you panic() on error - please be consistent. Also for > one of the other patches I think we figured that the TSS isn't really > required, so please only warn in that case. The allocation is done together for the ident PT and the TSS, and while the TSS is not mandatory the identity page-table is (or else Dom0 kernel won't boot at all on this kind of systems). In any case, it's almost impossible for this allocation to fail (because Xen is not actually allocating memory, just stealing a part of a RAM region that has already been populated). Roger.
>>> On 11.10.16 at 16:01, <roger.pau@citrix.com> wrote: > On Tue, Oct 04, 2016 at 05:16:09AM -0600, Jan Beulich wrote: >> >>> On 04.10.16 at 11:12, <roger.pau@citrix.com> wrote: >> > On Fri, Sep 30, 2016 at 09:52:56AM -0600, Jan Beulich wrote: >> >> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote: >> >> > @@ -336,7 +343,8 @@ static unsigned long __init compute_dom0_nr_pages( >> >> > avail -= dom0_paging_pages(d, nr_pages); >> >> > } >> >> > >> >> > - if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && >> >> > + if ( is_pv_domain(d) && >> >> > + (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && >> >> >> >> Perhaps better to simply force parms->p2m_base to UNSET_ADDR >> >> earlier on? >> > >> > p2m_base is already unconditionally set to UNSET_ADDR for PVHv2 domains, >> > hence the added is_pv_domain check in order to make sure that PVHv2 guests >> > don't get into that branch, which AFAICT is only relevant to PV guests. >> >> This reads contradictory: If it's set to UNSET_ADDR, why the extra >> check? > > On PVHv2 p2m_base == UNSET_ADDR already, so the extra check is to make sure > PVHv2 guests don't execute the code inside of the if condition, which is > for classic PV guests (note that the check is not against != UNSET_ADDR). Or > maybe I'm missing something? No, I have to apologize - I read things the wrong way round. Thanks for bearing with me. >> >> > @@ -1657,15 +1679,238 @@ out: >> >> > return rc; >> >> > } >> >> > >> >> > +/* Populate an HVM memory range using the biggest possible order. */ >> >> > +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start, >> >> > + uint64_t size) >> >> > +{ >> >> > + static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node; >> >> > + unsigned int order; >> >> > + struct page_info *page; >> >> > + int rc; >> >> > + >> >> > + ASSERT(IS_ALIGNED(size, PAGE_SIZE) && IS_ALIGNED(start, PAGE_SIZE)); >> >> > + >> >> > + order = MAX_ORDER; >> >> > + while ( size != 0 ) >> >> > + { >> >> > + order = min(get_order_from_bytes_floor(size), order); >> >> > + page = alloc_domheap_pages(d, order, memflags); >> >> > + if ( page == NULL ) >> >> > + { >> >> > + if ( order == 0 && memflags ) >> >> > + { >> >> > + /* Try again without any memflags. */ >> >> > + memflags = 0; >> >> > + order = MAX_ORDER; >> >> > + continue; >> >> > + } >> >> > + if ( order == 0 ) >> >> > + panic("Unable to allocate memory with order 0!\n"); >> >> > + order--; >> >> > + continue; >> >> > + } >> >> >> >> Is it not possible to utilize alloc_chunk() here? >> > >> > Not really, alloc_chunk will do a ceil calculation of the number of needed >> > pages, which means we end up allocating bigger chunks than needed and then >> > free them. I prefer this approach, since we always request the exact memory >> > that's required, so there's no need to free leftover pages. >> >> Hmm, in that case at least some of the shared logic would be nice to >> get abstracted out. > > TBH, I don't see much benefit in that, alloc_chunk works fine for PV guests > because Xen ends up walking the list of returned pages and mapping them one > to one. This is IMHO not what should be done for PVH guests, and instead the > caller needs to know the actual order of the allocated chunk, so it can pass > it to guest_physmap_add_page. ATM it's a very simple function that's clear, > if I start mixing up bits of alloc_chunk it's going to get more complex, and > I would like to avoid that, so that PVH Dom0 build doesn't end up like > current PV Dom0 build. 
Hmm - I did say "abstract out", not "re-use". The way how memflags get set and decreasing orders get tried in your code looks suspiciously similar to what alloc_chunk() does. >> > In the RFC series we also spoke about placing the MADT in a different >> > position that the native one, which would mean that we would end up stealing >> > some space from a RAM region in order to place it, so that we wouldn't have >> > to do this accounting. >> >> Putting the new MADT at the same address as the old one won't work >> anyway, again because possibly vCPU-s > pCPU-s. > > Yes, although from my tests doing CPU over-allocation on HVM guests works > very badly. Interesting. I didn't try recently, but I recall having tried a longer while ago without seeing severe issues. Jan
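[For reference: one way to read Jan's abstraction request is a small shared helper for the "largest order that fits, with a one-shot memflags fallback" pattern; a hedged sketch, where the helper name and exact factoring are not from the thread:]

    static struct page_info *__init alloc_largest_order(struct domain *d,
                                                        uint64_t size,
                                                        unsigned int *order,
                                                        unsigned int *memflags)
    {
        for ( ; ; )
        {
            struct page_info *pg;

            *order = min(get_order_from_bytes_floor(size), *order);
            pg = alloc_domheap_pages(d, *order, *memflags);
            if ( pg != NULL )
                return pg;
            if ( *order == 0 && *memflags )
            {
                /* Retry once from the top without placement restrictions. */
                *memflags = 0;
                *order = MAX_ORDER;
                continue;
            }
            if ( *order == 0 )
                return NULL;   /* caller decides whether to panic */
            --*order;
        }
    }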
>>> On 11.10.16 at 16:06, <roger.pau@citrix.com> wrote: > On Fri, Sep 30, 2016 at 09:52:56AM -0600, Jan Beulich wrote: >> >>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote: >> > + * VM86 TSS. Note that after this not all e820 regions will be aligned >> > + * to PAGE_SIZE. >> > + */ >> > + for ( i = 1; i <= d->arch.nr_e820; i++ ) >> > + { >> > + entry = &d->arch.e820[d->arch.nr_e820 - i]; >> > + if ( entry->type != E820_RAM || >> > + entry->size < PAGE_SIZE + HVM_VM86_TSS_SIZE ) >> > + continue; >> > + >> > + entry->size -= PAGE_SIZE + HVM_VM86_TSS_SIZE; >> > + gaddr = entry->addr + entry->size; >> > + break; >> > + } >> > + >> > + if ( gaddr == 0 || gaddr < MB(1) ) >> > + { >> > + printk("Unable to find memory to stash the identity map and TSS\n"); >> > + return -ENOMEM; >> >> One function up you panic() on error - please be consistent. Also for >> one of the other patches I think we figured that the TSS isn't really >> required, so please only warn in that case. > > The allocation is done together for the ident PT and the TSS, and while > the TSS is not mandatory the identity page-table is (or else Dom0 kernel > won't boot at all on this kind of systems). In any case, it's almost > impossible for this allocation to fail (because Xen is not actually > allocating memory, just stealing a part of a RAM region that has already > been populated). All fine, but I continue to think errors should be dealt with in a consistent manner (no matter how [un]likely they are), and warnings better get issued when there's any meaningful impact due to whatever triggers the warning. Albeit - having looked at the full patch again - it looks like I was wrong with naming this "warning": Both here and further down you return an error if any of the steps failed. The allocation being done in one go is fine to be an error; failure of the mapping of the non-required TSS, otoh, shouldn't cause the loading of Dom0 to fail (and I think that is where I've been unsure the printk() is warranted). Jan
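[For reference: following Jan's conclusion, the failure to map the optional TSS could be downgraded to a warning while the identity page table stays a hard error; a minimal sketch using the names from the posted patch:]

    tss = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)),
                         &mfn, &p2mt, 0, &rc);
    if ( tss == NULL )
        /* The TSS is optional: warn and carry on without it. */
        printk("Unable to map VM86 TSS area, continuing without it\n");
    else
    {
        memset(tss + (gaddr & ~PAGE_MASK), 0, HVM_VM86_TSS_SIZE);
        unmap_domain_page(tss);
        put_page(mfn_to_page(mfn_x(mfn)));
        d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] = gaddr;
    }
    gaddr += HVM_VM86_TSS_SIZE;
    ASSERT(IS_ALIGNED(gaddr, PAGE_SIZE));
    /* The identity page table mapping that follows remains a hard error. */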
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index 982bb5f..c590c58 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -22,6 +22,7 @@ #include <xen/compat.h> #include <xen/libelf.h> #include <xen/pfn.h> +#include <xen/guest_access.h> #include <asm/regs.h> #include <asm/system.h> #include <asm/io.h> @@ -43,6 +44,11 @@ static long __initdata dom0_nrpages; static long __initdata dom0_min_nrpages; static long __initdata dom0_max_nrpages = LONG_MAX; +/* Size of the VM86 TSS for virtual 8086 mode to use. */ +#define HVM_VM86_TSS_SIZE 128 + +static unsigned int __initdata hvm_mem_stats[MAX_ORDER + 1]; + /* * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>] * @@ -304,7 +310,8 @@ static unsigned long __init compute_dom0_nr_pages( avail -= max_pdx >> s; } - need_paging = opt_dom0_shadow || (is_pvh_domain(d) && !iommu_hap_pt_share); + need_paging = opt_dom0_shadow || (has_hvm_container_domain(d) && + (!iommu_hap_pt_share || !paging_mode_hap(d))); for ( ; ; need_paging = 0 ) { nr_pages = dom0_nrpages; @@ -336,7 +343,8 @@ static unsigned long __init compute_dom0_nr_pages( avail -= dom0_paging_pages(d, nr_pages); } - if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && + if ( is_pv_domain(d) && + (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && ((dom0_min_nrpages <= 0) || (nr_pages > min_pages)) ) { /* @@ -547,11 +555,12 @@ static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages) ASSERT(nr_holes == 0); } -static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages) +static __init void hvm_setup_e820(struct domain *d, unsigned long nr_pages) { struct e820entry *entry, *entry_guest; unsigned int i; unsigned long pages, cur_pages = 0; + uint64_t start, end; /* * Craft the e820 memory map for Dom0 based on the hardware e820 map. @@ -579,8 +588,19 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages) continue; } - *entry_guest = *entry; - pages = PFN_UP(entry_guest->size); + /* + * Make sure the start and length are aligned to PAGE_SIZE, because + * that's the minimum granularity of the 2nd stage translation. + */ + start = ROUNDUP(entry->addr, PAGE_SIZE); + end = (entry->addr + entry->size) & PAGE_MASK; + if ( start >= end ) + continue; + + entry_guest->type = E820_RAM; + entry_guest->addr = start; + entry_guest->size = end - start; + pages = PFN_DOWN(entry_guest->size); if ( (cur_pages + pages) > nr_pages ) { /* Truncate region */ @@ -591,6 +611,8 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages) { cur_pages += pages; } + ASSERT(IS_ALIGNED(entry_guest->addr, PAGE_SIZE) && + IS_ALIGNED(entry_guest->size, PAGE_SIZE)); next: d->arch.nr_e820++; entry_guest++; @@ -1641,7 +1663,7 @@ static int __init construct_dom0_pv( dom0_update_physmap(d, pfn, mfn, 0); pvh_map_all_iomem(d, nr_pages); - pvh_setup_e820(d, nr_pages); + hvm_setup_e820(d, nr_pages); } if ( d->domain_id == hardware_domid ) @@ -1657,15 +1679,238 @@ out: return rc; } +/* Populate an HVM memory range using the biggest possible order. 
*/ +static void __init hvm_populate_memory_range(struct domain *d, uint64_t start, + uint64_t size) +{ + static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node; + unsigned int order; + struct page_info *page; + int rc; + + ASSERT(IS_ALIGNED(size, PAGE_SIZE) && IS_ALIGNED(start, PAGE_SIZE)); + + order = MAX_ORDER; + while ( size != 0 ) + { + order = min(get_order_from_bytes_floor(size), order); + page = alloc_domheap_pages(d, order, memflags); + if ( page == NULL ) + { + if ( order == 0 && memflags ) + { + /* Try again without any memflags. */ + memflags = 0; + order = MAX_ORDER; + continue; + } + if ( order == 0 ) + panic("Unable to allocate memory with order 0!\n"); + order--; + continue; + } + + hvm_mem_stats[order]++; + rc = guest_physmap_add_page(d, _gfn(PFN_DOWN(start)), + _mfn(page_to_mfn(page)), order); + if ( rc != 0 ) + panic("Failed to populate memory: [%" PRIx64 " - %" PRIx64 "] %d\n", + start, start + (((uint64_t)1) << (order + PAGE_SHIFT)), rc); + start += ((uint64_t)1) << (order + PAGE_SHIFT); + size -= ((uint64_t)1) << (order + PAGE_SHIFT); + if ( (size & 0xffffffff) == 0 ) + process_pending_softirqs(); + } + +} + +static int __init hvm_setup_vmx_unrestricted_guest(struct domain *d) +{ + struct e820entry *entry; + p2m_type_t p2mt; + uint32_t rc, *ident_pt; + uint8_t *tss; + mfn_t mfn; + paddr_t gaddr = 0; + int i; + + /* + * Stole some space from the last found RAM region. One page will be + * used for the identify page tables, and the remaining space for the + * VM86 TSS. Note that after this not all e820 regions will be aligned + * to PAGE_SIZE. + */ + for ( i = 1; i <= d->arch.nr_e820; i++ ) + { + entry = &d->arch.e820[d->arch.nr_e820 - i]; + if ( entry->type != E820_RAM || + entry->size < PAGE_SIZE + HVM_VM86_TSS_SIZE ) + continue; + + entry->size -= PAGE_SIZE + HVM_VM86_TSS_SIZE; + gaddr = entry->addr + entry->size; + break; + } + + if ( gaddr == 0 || gaddr < MB(1) ) + { + printk("Unable to find memory to stash the identity map and TSS\n"); + return -ENOMEM; + } + + /* + * Identity-map page table is required for running with CR0.PG=0 + * when using Intel EPT. Create a 32-bit non-PAE page directory of + * superpages. + */ + tss = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)), + &mfn, &p2mt, 0, &rc); + if ( tss == NULL ) + { + printk("Unable to map VM86 TSS area\n"); + return -ENOMEM; + } + tss += (gaddr & ~PAGE_MASK); + memset(tss, 0, HVM_VM86_TSS_SIZE); + unmap_domain_page(tss); + put_page(mfn_to_page(mfn_x(mfn))); + d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] = gaddr; + gaddr += HVM_VM86_TSS_SIZE; + ASSERT(IS_ALIGNED(gaddr, PAGE_SIZE)); + + ident_pt = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)), + &mfn, &p2mt, 0, &rc); + if ( ident_pt == NULL ) + { + printk("Unable to map identity page tables\n"); + return -ENOMEM; + } + for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ ) + ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER | + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); + unmap_domain_page(ident_pt); + put_page(mfn_to_page(mfn_x(mfn))); + d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] = gaddr; + + return 0; +} + +static int __init hvm_setup_p2m(struct domain *d) +{ + struct vcpu *saved_current, *v = d->vcpu[0]; + unsigned long nr_pages; + int i, rc, preempted; + + printk("** Preparing memory map **\n"); + + /* + * Subtract one page for the EPT identity page table and two pages + * for the MADT replacement. 
+ */ + nr_pages = compute_dom0_nr_pages(d, NULL, 0) - 3; + + hvm_setup_e820(d, nr_pages); + do { + preempted = 0; + paging_set_allocation(d, dom0_paging_pages(d, nr_pages), + &preempted); + process_pending_softirqs(); + } while ( preempted ); + + /* + * Special treatment for memory < 1MB: + * - Copy the data in e820 regions marked as RAM (BDA, EBDA...). + * - Map everything else as 1:1. + * NB: all this only makes sense if booted from legacy BIOSes. + */ + rc = modify_mmio_11(d, 0, PFN_DOWN(MB(1)), true); + if ( rc ) + { + printk("Failed to map low 1MB 1:1: %d\n", rc); + return rc; + } + + printk("** Populating memory map **\n"); + /* Populate memory map. */ + for ( i = 0; i < d->arch.nr_e820; i++ ) + { + if ( d->arch.e820[i].type != E820_RAM ) + continue; + + hvm_populate_memory_range(d, d->arch.e820[i].addr, + d->arch.e820[i].size); + if ( d->arch.e820[i].addr < MB(1) ) + { + unsigned long end = min_t(unsigned long, + d->arch.e820[i].addr + d->arch.e820[i].size, MB(1)); + + saved_current = current; + set_current(v); + rc = hvm_copy_to_guest_phys(d->arch.e820[i].addr, + maddr_to_virt(d->arch.e820[i].addr), + end - d->arch.e820[i].addr); + set_current(saved_current); + if ( rc != HVMCOPY_okay ) + { + printk("Unable to copy RAM region %#lx - %#lx\n", + d->arch.e820[i].addr, end); + return -EFAULT; + } + } + } + + printk("Memory allocation stats:\n"); + for ( i = 0; i <= MAX_ORDER; i++ ) + { + if ( hvm_mem_stats[MAX_ORDER - i] != 0 ) + printk("Order %2u: %pZ\n", MAX_ORDER - i, + _p(((uint64_t)1 << (MAX_ORDER - i + PAGE_SHIFT)) * + hvm_mem_stats[MAX_ORDER - i])); + } + + if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) ) + { + /* + * Since Dom0 cannot be migrated, we will only setup the + * unrestricted guest helpers if they are needed by the current + * hardware we are running on. + */ + rc = hvm_setup_vmx_unrestricted_guest(d); + if ( rc ) + return rc; + } + + printk("Dom0 memory map:\n"); + print_e820_memory_map(d->arch.e820, d->arch.nr_e820); + + return 0; +} + static int __init construct_dom0_hvm(struct domain *d, const module_t *image, unsigned long image_headroom, module_t *initrd, void *(*bootstrap_map)(const module_t *), char *cmdline) { + int rc; printk("** Building a PVH Dom0 **\n"); + /* Sanity! */ + BUG_ON(d->domain_id != 0); + BUG_ON(d->vcpu[0] == NULL); + + process_pending_softirqs(); + + iommu_hwdom_init(d); + + rc = hvm_setup_p2m(d); + if ( rc ) + { + printk("Failed to setup Dom0 physical memory map\n"); + return rc; + } + return 0; }
Craft the Dom0 e820 memory map and populate it.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
---
Changes since RFC:
 - Use IS_ALIGNED instead of checking with PAGE_MASK.
 - Use the new %pB specifier in order to print sizes in human readable form.
 - Create a VM86 TSS for hardware that doesn't support unrestricted mode.
 - Subtract guest RAM for the identity page table and the VM86 TSS.
 - Split the creation of the unrestricted mode helper structures to a separate function.
 - Use preemption with paging_set_allocation.
 - Use get_order_from_bytes_floor.
---
 xen/arch/x86/domain_build.c | 257 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 251 insertions(+), 6 deletions(-)