Message ID | 56A658FE02000078000CAC3D@prv-mh.provo.novell.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon, 2016-01-25 at 09:18 -0700, Jan Beulich wrote: > When mapping large BARs (e.g. the frame buffer of a graphics card) the > overhead of establishing such mappings using only 4k pages has, > particularly after the XSA-125 fix, become unacceptable. Alter the > XEN_DOMCTL_memory_mapping semantics once again, so that there's no > longer a fixed amount of guest frames that represents the upper limit > of what a single invocation can map. Instead bound execution time by > limiting the number of iterations (regardless of page size). > > Signed-off-by: Jan Beulich <jbeulich@suse.com> Tools and domctl API side: Acked-by: Ian Campbell <ian.campbell@citrix.com> > --- > Open issues (perhaps for subsequent changes): > - ARM side unimplemented (and hence libxc for now made cope with both > models), the main issue (besides my inability to test any change > there) being the many internal uses of map_mmio_regions()) > - iommu_{,un}map_page() interfaces don't support "order" (hence > mmio_order() for now returns zero when !iommu_hap_pt_share, which in > particular means the AMD side isn't being taken care of just yet, but > note that this also has the intended effect of suppressing non-zero > order mappings in the shadow mode case) > --- > v5: Refine comment in domctl.h. > v4: Move cleanup duty entirely to the caller of the hypercall. Move > return value description to from commit message to domctl.h. > v3: Re-base on top of "x86/hvm: fold opt_hap_{2mb,1gb} into > hap_capabilities". Extend description to spell out new return value > meaning. Add a couple of code comments. Use PAGE_ORDER_4K instead > of literal 0. Take into consideration r/o MMIO pages. > v2: Produce valid entries for large p2m_mmio_direct mappings in > p2m_pt_set_entry(). Don't open code iommu_use_hap_pt() in > mmio_order(). Update function comment of set_typed_p2m_entry() and > clear_mmio_p2m_entry(). Use PRI_mfn. Add ASSERT()s to > {,un}map_mmio_regions() to detect otherwise endless loops. 
> > --- a/tools/libxc/xc_domain.c > +++ b/tools/libxc/xc_domain.c > @@ -2174,7 +2174,7 @@ int xc_domain_memory_mapping( > { > DECLARE_DOMCTL; > xc_dominfo_t info; > - int ret = 0, err; > + int ret = 0, rc; > unsigned long done = 0, nr, max_batch_sz; > > if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 || > @@ -2199,19 +2199,24 @@ int xc_domain_memory_mapping( > domctl.u.memory_mapping.nr_mfns = nr; > domctl.u.memory_mapping.first_gfn = first_gfn + done; > domctl.u.memory_mapping.first_mfn = first_mfn + done; > - err = do_domctl(xch, &domctl); > - if ( err && errno == E2BIG ) > + rc = do_domctl(xch, &domctl); > + if ( rc < 0 && errno == E2BIG ) > { > if ( max_batch_sz <= 1 ) > break; > max_batch_sz >>= 1; > continue; > } > + if ( rc > 0 ) > + { > + done += rc; > + continue; > + } > /* Save the first error... */ > if ( !ret ) > - ret = err; > + ret = rc; > /* .. and ignore the rest of them when removing. */ > - if ( err && add_mapping != DPCI_REMOVE_MAPPING ) > + if ( rc && add_mapping != DPCI_REMOVE_MAPPING ) > break; > > done += nr; > --- a/xen/arch/x86/domain_build.c > +++ b/xen/arch/x86/domain_build.c > @@ -436,7 +436,8 @@ static __init void pvh_add_mem_mapping(s > else > a = p2m_access_rw; > > - if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), a)) ) > + if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), > + PAGE_ORDER_4K, a)) ) > panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n", > gfn, mfn, i, rc); > if ( !(i & 0xfffff) ) > --- a/xen/arch/x86/hvm/vmx/vmx.c > +++ b/xen/arch/x86/hvm/vmx/vmx.c > @@ -2491,7 +2491,7 @@ static int vmx_alloc_vlapic_mapping(stru > share_xen_page_with_guest(pg, d, XENSHARE_writable); > d->arch.hvm_domain.vmx.apic_access_mfn = mfn; > set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), > _mfn(mfn), > - p2m_get_hostp2m(d)->default_access); > + PAGE_ORDER_4K, p2m_get_hostp2m(d)- > >default_access); > > return 0; > } > --- a/xen/arch/x86/mm/p2m.c > +++ b/xen/arch/x86/mm/p2m.c > @@ -899,48 +899,62 @@ void 
p2m_change_type_range(struct domain > p2m_unlock(p2m); > } > > -/* Returns: 0 for success, -errno for failure */ > +/* > + * Returns: > + * 0 for success > + * -errno for failure > + * order+1 for caller to retry with order (guaranteed smaller than > + * the order value passed in) > + */ > static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, > mfn_t mfn, > - p2m_type_t gfn_p2mt, p2m_access_t access) > + unsigned int order, p2m_type_t gfn_p2mt, > + p2m_access_t access) > { > int rc = 0; > p2m_access_t a; > p2m_type_t ot; > mfn_t omfn; > + unsigned int cur_order = 0; > struct p2m_domain *p2m = p2m_get_hostp2m(d); > > if ( !paging_mode_translate(d) ) > return -EIO; > > - gfn_lock(p2m, gfn, 0); > - omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL); > + gfn_lock(p2m, gfn, order); > + omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL); > + if ( cur_order < order ) > + { > + gfn_unlock(p2m, gfn, order); > + return cur_order + 1; > + } > if ( p2m_is_grant(ot) || p2m_is_foreign(ot) ) > { > - gfn_unlock(p2m, gfn, 0); > + gfn_unlock(p2m, gfn, order); > domain_crash(d); > return -ENOENT; > } > else if ( p2m_is_ram(ot) ) > { > + unsigned long i; > + > ASSERT(mfn_valid(omfn)); > - set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); > + for ( i = 0; i < (1UL << order); ++i ) > + set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY); > } > > P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn)); > - rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt, > - access); > + rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access); > if ( rc ) > - gdprintk(XENLOG_ERR, > - "p2m_set_entry failed! 
mfn=%08lx rc:%d\n", > - mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)), > rc); > + gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d > (0x%"PRI_mfn")\n", > + gfn, order, rc, mfn_x(mfn)); > else if ( p2m_is_pod(ot) ) > { > pod_lock(p2m); > - p2m->pod.entry_count--; > + p2m->pod.entry_count -= 1UL << order; > BUG_ON(p2m->pod.entry_count < 0); > pod_unlock(p2m); > } > - gfn_unlock(p2m, gfn, 0); > + gfn_unlock(p2m, gfn, order); > > return rc; > } > @@ -949,14 +963,21 @@ static int set_typed_p2m_entry(struct do > static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn, > mfn_t mfn) > { > - return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign, > + return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, > p2m_map_foreign, > p2m_get_hostp2m(d)->default_access); > } > > int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, > - p2m_access_t access) > + unsigned int order, p2m_access_t access) > { > - return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access); > + if ( order && > + rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn), > + mfn_x(mfn) + (1UL << order) - 1) && > + !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn), > + mfn_x(mfn) + (1UL << order) - 1) ) > + return order; > + > + return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct, > access); > } > > int set_identity_p2m_entry(struct domain *d, unsigned long gfn, > @@ -1009,20 +1030,33 @@ int set_identity_p2m_entry(struct domain > return ret; > } > > -/* Returns: 0 for success, -errno for failure */ > -int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) > +/* > + * Returns: > + * 0 for success > + * -errno for failure > + * order+1 for caller to retry with order (guaranteed smaller than > + * the order value passed in) > + */ > +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, > + unsigned int order) > { > int rc = -EINVAL; > mfn_t actual_mfn; > p2m_access_t a; > p2m_type_t t; > + unsigned int cur_order = 0; > 
struct p2m_domain *p2m = p2m_get_hostp2m(d); > > if ( !paging_mode_translate(d) ) > return -EIO; > > - gfn_lock(p2m, gfn, 0); > - actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL); > + gfn_lock(p2m, gfn, order); > + actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL); > + if ( cur_order < order ) > + { > + rc = cur_order + 1; > + goto out; > + } > > /* Do not use mfn_valid() here as it will usually fail for MMIO > pages. */ > if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) ) > @@ -1035,11 +1069,11 @@ int clear_mmio_p2m_entry(struct domain * > gdprintk(XENLOG_WARNING, > "no mapping between mfn %08lx and gfn %08lx\n", > mfn_x(mfn), gfn); > - rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, > p2m_invalid, > + rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid, > p2m->default_access); > > out: > - gfn_unlock(p2m, gfn, 0); > + gfn_unlock(p2m, gfn, order); > > return rc; > } > @@ -2095,6 +2129,25 @@ void *map_domain_gfn(struct p2m_domain * > return map_domain_page(*mfn); > } > > +static unsigned int mmio_order(const struct domain *d, > + unsigned long start_fn, unsigned long nr) > +{ > + if ( !need_iommu(d) || !iommu_use_hap_pt(d) || > + (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> > PAGE_ORDER_2M) ) > + return 0; > + > + if ( !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> > PAGE_ORDER_1G) && > + hap_has_1gb ) > + return PAGE_ORDER_1G; > + > + if ( hap_has_2mb ) > + return PAGE_ORDER_2M; > + > + return 0; > +} > + > +#define MAP_MMIO_MAX_ITER 64 /* pretty arbitrary */ > + > int map_mmio_regions(struct domain *d, > unsigned long start_gfn, > unsigned long nr, > @@ -2102,22 +2155,29 @@ int map_mmio_regions(struct domain *d, > { > int ret = 0; > unsigned long i; > + unsigned int iter, order; > > if ( !paging_mode_translate(d) ) > return 0; > > - for ( i = 0; !ret && i < nr; i++ ) > + for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER; > + i += 1UL << order, ++iter ) > { > - ret = 
set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), > - p2m_get_hostp2m(d)->default_access); > - if ( ret ) > + /* OR'ing gfn and mfn values will return an order suitable to > both. */ > + for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - > i); ; > + order = ret - 1 ) > { > - unmap_mmio_regions(d, start_gfn, i, mfn); > - break; > + ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), > order, > + p2m_get_hostp2m(d)- > >default_access); > + if ( ret <= 0 ) > + break; > + ASSERT(ret <= order); > } > + if ( ret < 0 ) > + break; > } > > - return ret; > + return i == nr ? 0 : i ?: ret; > } > > int unmap_mmio_regions(struct domain *d, > @@ -2125,20 +2185,30 @@ int unmap_mmio_regions(struct domain *d, > unsigned long nr, > unsigned long mfn) > { > - int err = 0; > + int ret = 0; > unsigned long i; > + unsigned int iter, order; > > if ( !paging_mode_translate(d) ) > return 0; > > - for ( i = 0; i < nr; i++ ) > + for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER; > + i += 1UL << order, ++iter ) > { > - int ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i)); > - if ( ret ) > - err = ret; > + /* OR'ing gfn and mfn values will return an order suitable to > both. */ > + for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - > i); ; > + order = ret - 1 ) > + { > + ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), > order); > + if ( ret <= 0 ) > + break; > + ASSERT(ret <= order); > + } > + if ( ret < 0 ) > + break; > } > > - return err; > + return i == nr ? 
0 : i ?: ret; > } > > unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp) > --- a/xen/arch/x86/mm/p2m-ept.c > +++ b/xen/arch/x86/mm/p2m-ept.c > @@ -136,6 +136,7 @@ static void ept_p2m_type_to_flags(struct > entry->r = entry->x = 1; > entry->w = !rangeset_contains_singleton(mmio_ro_ranges, > entry->mfn); > + ASSERT(entry->w || !is_epte_superpage(entry)); > entry->a = !!cpu_has_vmx_ept_ad; > entry->d = entry->w && cpu_has_vmx_ept_ad; > break; > --- a/xen/arch/x86/mm/p2m-pt.c > +++ b/xen/arch/x86/mm/p2m-pt.c > @@ -72,7 +72,8 @@ static const unsigned long pgt[] = { > PGT_l3_page_table > }; > > -static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) > +static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn, > + unsigned int level) > { > unsigned long flags; > /* > @@ -107,6 +108,8 @@ static unsigned long p2m_type_to_flags(p > case p2m_mmio_direct: > if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) ) > flags |= _PAGE_RW; > + else > + ASSERT(!level); > return flags | P2M_BASE_FLAGS | _PAGE_PCD; > } > } > @@ -436,7 +439,7 @@ static int do_recalc(struct p2m_domain * > p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn > | ~mask) > ? p2m_ram_logdirty : p2m_ram_rw; > unsigned long mfn = l1e_get_pfn(e); > - unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn)); > + unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn), > level); > > if ( level ) > { > @@ -573,7 +576,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, > ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct); > l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) > ? 
l3e_from_pfn(mfn_x(mfn), > - p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE) > + p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE) > : l3e_empty(); > entry_content.l1 = l3e_content.l3; > > @@ -609,7 +612,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, > > if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) ) > entry_content = p2m_l1e_from_pfn(mfn_x(mfn), > - p2m_type_to_flags(p2mt, > mfn)); > + p2m_type_to_flags(p2mt, > mfn, 0)); > else > entry_content = l1e_empty(); > > @@ -645,7 +648,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, > ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct); > if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) ) > l2e_content = l2e_from_pfn(mfn_x(mfn), > - p2m_type_to_flags(p2mt, mfn) | > + p2m_type_to_flags(p2mt, mfn, 1) | > _PAGE_PSE); > else > l2e_content = l2e_empty(); > --- a/xen/common/domctl.c > +++ b/xen/common/domctl.c > @@ -1046,10 +1046,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe > (gfn + nr_mfns - 1) < gfn ) /* wrap? */ > break; > > +#ifndef CONFIG_X86 /* XXX ARM!? */ > ret = -E2BIG; > /* Must break hypercall up as this could take a while. 
*/ > if ( nr_mfns > 64 ) > break; > +#endif > > ret = -EPERM; > if ( !iomem_access_permitted(current->domain, mfn, mfn_end) || > @@ -1067,7 +1069,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe > d->domain_id, gfn, mfn, nr_mfns); > > ret = map_mmio_regions(d, gfn, nr_mfns, mfn); > - if ( ret ) > + if ( ret < 0 ) > printk(XENLOG_G_WARNING > "memory_map:fail: dom%d gfn=%lx mfn=%lx nr=%lx > ret:%ld\n", > d->domain_id, gfn, mfn, nr_mfns, ret); > @@ -1079,7 +1081,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe > d->domain_id, gfn, mfn, nr_mfns); > > ret = unmap_mmio_regions(d, gfn, nr_mfns, mfn); > - if ( ret && is_hardware_domain(current->domain) ) > + if ( ret < 0 && is_hardware_domain(current->domain) ) > printk(XENLOG_ERR > "memory_map: error %ld removing dom%d access to > [%lx,%lx]\n", > ret, d->domain_id, mfn, mfn_end); > --- a/xen/common/memory.c > +++ b/xen/common/memory.c > @@ -259,7 +259,7 @@ int guest_remove_page(struct domain *d, > } > if ( p2mt == p2m_mmio_direct ) > { > - clear_mmio_p2m_entry(d, gmfn, _mfn(mfn)); > + clear_mmio_p2m_entry(d, gmfn, _mfn(mfn), 0); > put_gfn(d, gmfn); > return 1; > } > --- a/xen/include/asm-x86/p2m.h > +++ b/xen/include/asm-x86/p2m.h > @@ -574,8 +574,9 @@ int p2m_is_logdirty_range(struct p2m_dom > > /* Set mmio addresses in the p2m table (for pass-through) */ > int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, > - p2m_access_t access); > -int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t > mfn); > + unsigned int order, p2m_access_t access); > +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, > + unsigned int order); > > /* Set identity addresses in the p2m table (for pass-through) */ > int set_identity_p2m_entry(struct domain *d, unsigned long gfn, > --- a/xen/include/public/domctl.h > +++ b/xen/include/public/domctl.h > @@ -542,8 +542,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_ > > > /* Bind machine I/O address range -> HVM address range. 
*/ > -/* If this returns -E2BIG lower nr_mfns value. */ > /* XEN_DOMCTL_memory_mapping */ > +/* Returns > + - zero success, everything done > + - -E2BIG passed in nr_mfns value too large for the implementation > + - positive partial success for the first <result> page frames (with > + <result> less than nr_mfns), requiring re-invocation by > the > + caller after updating inputs > + - negative error; other than -E2BIG > +*/ > #define DPCI_ADD_MAPPING 1 > #define DPCI_REMOVE_MAPPING 0 > struct xen_domctl_memory_mapping { > >
> From: Jan Beulich [mailto:JBeulich@suse.com] > Sent: Tuesday, January 26, 2016 12:19 AM > > When mapping large BARs (e.g. the frame buffer of a graphics card) the > overhead of establishing such mappings using only 4k pages has, > particularly after the XSA-125 fix, become unacceptable. Alter the > XEN_DOMCTL_memory_mapping semantics once again, so that there's no > longer a fixed amount of guest frames that represents the upper limit > of what a single invocation can map. Instead bound execution time by > limiting the number of iterations (regardless of page size). > > Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Kevin Tian <kevin.tian@intel.com> for VMX part. Curious. When you say "become unacceptable", how bad is it? mostly impact the boot time? Thanks Kevin
>>> On 26.01.16 at 23:35, <kevin.tian@intel.com> wrote: >> From: Jan Beulich [mailto:JBeulich@suse.com] >> Sent: Tuesday, January 26, 2016 12:19 AM >> >> When mapping large BARs (e.g. the frame buffer of a graphics card) the >> overhead of establishing such mappings using only 4k pages has, >> particularly after the XSA-125 fix, become unacceptable. Alter the >> XEN_DOMCTL_memory_mapping semantics once again, so that there's no >> longer a fixed amount of guest frames that represents the upper limit >> of what a single invocation can map. Instead bound execution time by >> limiting the number of iterations (regardless of page size). >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> > > Acked-by: Kevin Tian <kevin.tian@intel.com> for VMX part. > > Curious. When you say "become unacceptable", how bad is it? mostly > impact the boot time? Yes, guest boot time. I don't have a reference to the original report at hand, but that was what someone (Konrad?) had reported. I've never seen the issue myself, largely because I've never made any attempt at GPU pass-through. Jan
On 27/01/16 10:22, Jan Beulich wrote: >>>> On 26.01.16 at 23:35, <kevin.tian@intel.com> wrote: >>> From: Jan Beulich [mailto:JBeulich@suse.com] >>> Sent: Tuesday, January 26, 2016 12:19 AM >>> >>> When mapping large BARs (e.g. the frame buffer of a graphics card) the >>> overhead of establishing such mappings using only 4k pages has, >>> particularly after the XSA-125 fix, become unacceptable. Alter the >>> XEN_DOMCTL_memory_mapping semantics once again, so that there's no >>> longer a fixed amount of guest frames that represents the upper limit >>> of what a single invocation can map. Instead bound execution time by >>> limiting the number of iterations (regardless of page size). >>> >>> Signed-off-by: Jan Beulich <jbeulich@suse.com> >> Acked-by: Kevin Tian <kevin.tian@intel.com> for VMX part. >> >> Curious. When you say "become unacceptable", how bad is it? mostly >> impact the boot time? > Yes, guest boot time. I don't have a reference to the original report > at hand, but that was what someone (Konrad?) had reported. I've > never seen the issue myself, largely because I've never made any > attempt at GPU pass-through. From XenServer testing, with a 1GB GPU BAR, XSA-125 caused an additional 70s of guest boot time. Naturally, we had to work around this, partly by upping the repeat limit and deferring VT-d flushes. ~Andrew
On 25/01/16 16:18, Jan Beulich wrote: > --- a/xen/arch/x86/hvm/vmx/vmx.c > +++ b/xen/arch/x86/hvm/vmx/vmx.c > @@ -2491,7 +2491,7 @@ static int vmx_alloc_vlapic_mapping(stru > share_xen_page_with_guest(pg, d, XENSHARE_writable); > d->arch.hvm_domain.vmx.apic_access_mfn = mfn; > set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn), > - p2m_get_hostp2m(d)->default_access); > + PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access); > This should ASSERT() success, in case we make further changes to the error handling. > return 0; > } > --- a/xen/arch/x86/mm/p2m.c > +++ b/xen/arch/x86/mm/p2m.c > @@ -899,48 +899,62 @@ void p2m_change_type_range(struct domain > p2m_unlock(p2m); > } > > -/* Returns: 0 for success, -errno for failure */ > +/* > + * Returns: > + * 0 for success > + * -errno for failure > + * order+1 for caller to retry with order (guaranteed smaller than > + * the order value passed in) > + */ > static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, > - p2m_type_t gfn_p2mt, p2m_access_t access) > + unsigned int order, p2m_type_t gfn_p2mt, > + p2m_access_t access) > { > int rc = 0; > p2m_access_t a; > p2m_type_t ot; > mfn_t omfn; > + unsigned int cur_order = 0; > struct p2m_domain *p2m = p2m_get_hostp2m(d); > > if ( !paging_mode_translate(d) ) > return -EIO; > > - gfn_lock(p2m, gfn, 0); > - omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL); > + gfn_lock(p2m, gfn, order); > + omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL); > + if ( cur_order < order ) > + { > + gfn_unlock(p2m, gfn, order); > + return cur_order + 1; Your comment states that the return value is guarenteed to be less than the passed-in order, but this is not the case here. cur_order could, in principle, be only 1 less than order, at which point your documentation is incorrect. Does this rely on the x86 architectural orders to function as documented? 
> + } > if ( p2m_is_grant(ot) || p2m_is_foreign(ot) ) > { > - gfn_unlock(p2m, gfn, 0); > + gfn_unlock(p2m, gfn, order); > domain_crash(d); > return -ENOENT; > } > else if ( p2m_is_ram(ot) ) > { > + unsigned long i; > + > ASSERT(mfn_valid(omfn)); Shouldn't this check should be extended to the top of the order? > - set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); > + for ( i = 0; i < (1UL << order); ++i ) > + set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY); > } > > P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn)); > - rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt, > - access); > + rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access); > if ( rc ) > - gdprintk(XENLOG_ERR, > - "p2m_set_entry failed! mfn=%08lx rc:%d\n", > - mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)), rc); > + gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n", > + gfn, order, rc, mfn_x(mfn)); > else if ( p2m_is_pod(ot) ) > { > pod_lock(p2m); > - p2m->pod.entry_count--; > + p2m->pod.entry_count -= 1UL << order; > BUG_ON(p2m->pod.entry_count < 0); > pod_unlock(p2m); > } > - gfn_unlock(p2m, gfn, 0); > + gfn_unlock(p2m, gfn, order); > > return rc; > } > @@ -949,14 +963,21 @@ static int set_typed_p2m_entry(struct do > static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn, > mfn_t mfn) > { > - return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign, > + return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign, > p2m_get_hostp2m(d)->default_access); > } > > int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, > - p2m_access_t access) > + unsigned int order, p2m_access_t access) > { > - return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access); > + if ( order && > + rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn), > + mfn_x(mfn) + (1UL << order) - 1) && > + !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn), > + mfn_x(mfn) + (1UL << order) - 1) ) > + return order; Should this not be a hard 
error? Even retrying with a lower order is going fail. > + > + return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct, access); > } > > int set_identity_p2m_entry(struct domain *d, unsigned long gfn, > @@ -1009,20 +1030,33 @@ int set_identity_p2m_entry(struct domain > return ret; > } > > -/* Returns: 0 for success, -errno for failure */ > -int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) > +/* > + * Returns: > + * 0 for success > + * -errno for failure > + * order+1 for caller to retry with order (guaranteed smaller than > + * the order value passed in) > + */ > +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, > + unsigned int order) > { > int rc = -EINVAL; > mfn_t actual_mfn; > p2m_access_t a; > p2m_type_t t; > + unsigned int cur_order = 0; > struct p2m_domain *p2m = p2m_get_hostp2m(d); > > if ( !paging_mode_translate(d) ) > return -EIO; > > - gfn_lock(p2m, gfn, 0); > - actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL); > + gfn_lock(p2m, gfn, order); > + actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL); > + if ( cur_order < order ) > + { > + rc = cur_order + 1; > + goto out; > + } > > /* Do not use mfn_valid() here as it will usually fail for MMIO pages. 
*/ > if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) ) > @@ -1035,11 +1069,11 @@ int clear_mmio_p2m_entry(struct domain * > gdprintk(XENLOG_WARNING, > "no mapping between mfn %08lx and gfn %08lx\n", > mfn_x(mfn), gfn); > - rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid, > + rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid, > p2m->default_access); > > out: > - gfn_unlock(p2m, gfn, 0); > + gfn_unlock(p2m, gfn, order); > > return rc; > } > @@ -2095,6 +2129,25 @@ void *map_domain_gfn(struct p2m_domain * > return map_domain_page(*mfn); > } > > +static unsigned int mmio_order(const struct domain *d, > + unsigned long start_fn, unsigned long nr) > +{ > + if ( !need_iommu(d) || !iommu_use_hap_pt(d) || > + (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) ) > + return 0; Perhaps PAGE_ORDER_4K for consistency? ~Andrew
>>> On 27.01.16 at 13:32, <andrew.cooper3@citrix.com> wrote: > On 25/01/16 16:18, Jan Beulich wrote: >> --- a/xen/arch/x86/hvm/vmx/vmx.c >> +++ b/xen/arch/x86/hvm/vmx/vmx.c >> @@ -2491,7 +2491,7 @@ static int vmx_alloc_vlapic_mapping(stru >> share_xen_page_with_guest(pg, d, XENSHARE_writable); >> d->arch.hvm_domain.vmx.apic_access_mfn = mfn; >> set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn), >> - p2m_get_hostp2m(d)->default_access); >> + PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access); >> > > This should ASSERT() success, in case we make further changes to the > error handling. Maybe, but since it didn't before I don't see why this couldn't / shouldn't be an independent future patch. >> --- a/xen/arch/x86/mm/p2m.c >> +++ b/xen/arch/x86/mm/p2m.c >> @@ -899,48 +899,62 @@ void p2m_change_type_range(struct domain >> p2m_unlock(p2m); >> } >> >> -/* Returns: 0 for success, -errno for failure */ >> +/* >> + * Returns: >> + * 0 for success >> + * -errno for failure >> + * order+1 for caller to retry with order (guaranteed smaller than >> + * the order value passed in) >> + */ >> static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, >> - p2m_type_t gfn_p2mt, p2m_access_t access) >> + unsigned int order, p2m_type_t gfn_p2mt, >> + p2m_access_t access) >> { >> int rc = 0; >> p2m_access_t a; >> p2m_type_t ot; >> mfn_t omfn; >> + unsigned int cur_order = 0; >> struct p2m_domain *p2m = p2m_get_hostp2m(d); >> >> if ( !paging_mode_translate(d) ) >> return -EIO; >> >> - gfn_lock(p2m, gfn, 0); >> - omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL); >> + gfn_lock(p2m, gfn, order); >> + omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL); >> + if ( cur_order < order ) >> + { >> + gfn_unlock(p2m, gfn, order); >> + return cur_order + 1; > > Your comment states that the return value is guarenteed to be less than > the passed-in order, but this is not the case here. 
cur_order could, in > principle, be only 1 less than order, at which point your documentation > is incorrect. > > Does this rely on the x86 architectural orders to function as documented? No. Maybe the comment text is ambiguous, but I don't see how to improve it without making it too lengthy: The return value is <order>+1, telling the caller to retry with <order>, which is guaranteed to be less than the order that got passed in. I.e. taking the variable naming above, the caller would have to retry with cur_order, which - due to the if() - is smaller than order. >> + } >> if ( p2m_is_grant(ot) || p2m_is_foreign(ot) ) >> { >> - gfn_unlock(p2m, gfn, 0); >> + gfn_unlock(p2m, gfn, order); >> domain_crash(d); >> return -ENOENT; >> } >> else if ( p2m_is_ram(ot) ) >> { >> + unsigned long i; >> + >> ASSERT(mfn_valid(omfn)); > > Shouldn't this check should be extended to the top of the order? Well, yes, perhaps better to move it into ... >> - set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); >> + for ( i = 0; i < (1UL << order); ++i ) >> + set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY); ... the body of the for(). But I'll wait with v6 until we settled on the other aspects you raise. >> int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, >> - p2m_access_t access) >> + unsigned int order, p2m_access_t access) >> { >> - return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access); >> + if ( order && >> + rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn), >> + mfn_x(mfn) + (1UL << order) - 1) && >> + !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn), >> + mfn_x(mfn) + (1UL << order) - 1) ) >> + return order; > > Should this not be a hard error? Even retrying with a lower order is > going fail. Why? The latest when order == 0, rangeset_overlaps_range() will return the same as rangeset_contains_range(), and hence the condition above will always be false (one of the two reasons for checking order first here). 
>> @@ -2095,6 +2129,25 @@ void *map_domain_gfn(struct p2m_domain * >> return map_domain_page(*mfn); >> } >> >> +static unsigned int mmio_order(const struct domain *d, >> + unsigned long start_fn, unsigned long nr) >> +{ >> + if ( !need_iommu(d) || !iommu_use_hap_pt(d) || >> + (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) ) >> + return 0; > > Perhaps PAGE_ORDER_4K for consistency? Oh, indeed. Jan
On 27/01/16 13:37, Jan Beulich wrote: >>>> On 27.01.16 at 13:32, <andrew.cooper3@citrix.com> wrote: >> On 25/01/16 16:18, Jan Beulich wrote: >>> --- a/xen/arch/x86/hvm/vmx/vmx.c >>> +++ b/xen/arch/x86/hvm/vmx/vmx.c >>> @@ -2491,7 +2491,7 @@ static int vmx_alloc_vlapic_mapping(stru >>> share_xen_page_with_guest(pg, d, XENSHARE_writable); >>> d->arch.hvm_domain.vmx.apic_access_mfn = mfn; >>> set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn), >>> - p2m_get_hostp2m(d)->default_access); >>> + PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access); >>> >> This should ASSERT() success, in case we make further changes to the >> error handling. > Maybe, but since it didn't before I don't see why this couldn't / > shouldn't be an independent future patch. Can be. IMO it is a bug that it isn't already checked. (-ENOMEM when allocating p2m leaves perhaps?) > >>> --- a/xen/arch/x86/mm/p2m.c >>> +++ b/xen/arch/x86/mm/p2m.c >>> @@ -899,48 +899,62 @@ void p2m_change_type_range(struct domain >>> p2m_unlock(p2m); >>> } >>> >>> -/* Returns: 0 for success, -errno for failure */ >>> +/* >>> + * Returns: >>> + * 0 for success >>> + * -errno for failure >>> + * order+1 for caller to retry with order (guaranteed smaller than >>> + * the order value passed in) >>> + */ >>> static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, >>> - p2m_type_t gfn_p2mt, p2m_access_t access) >>> + unsigned int order, p2m_type_t gfn_p2mt, >>> + p2m_access_t access) >>> { >>> int rc = 0; >>> p2m_access_t a; >>> p2m_type_t ot; >>> mfn_t omfn; >>> + unsigned int cur_order = 0; >>> struct p2m_domain *p2m = p2m_get_hostp2m(d); >>> >>> if ( !paging_mode_translate(d) ) >>> return -EIO; >>> >>> - gfn_lock(p2m, gfn, 0); >>> - omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL); >>> + gfn_lock(p2m, gfn, order); >>> + omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL); >>> + if ( cur_order < order ) >>> + { >>> + gfn_unlock(p2m, gfn, order); >>> + return cur_order 
+ 1; >> Your comment states that the return value is guaranteed to be less than >> the passed-in order, but this is not the case here. cur_order could, in >> principle, be only 1 less than order, at which point your documentation >> is incorrect. >> >> Does this rely on the x86 architectural orders to function as documented? > No. Maybe the comment text is ambiguous, but I don't see how to > improve it without making it too lengthy: The return value is > <order>+1, telling the caller to retry with <order>, which is > guaranteed to be less than the order that got passed in. I.e. taking > the variable naming above, the caller would have to retry with > cur_order, which - due to the if() - is smaller than order. Ah - I see. The text is indeed confusing. How about: "1 + new order: for caller to retry with smaller order (guaranteed to be smaller than order passed in)" > >>> + } >>> if ( p2m_is_grant(ot) || p2m_is_foreign(ot) ) >>> { >>> - gfn_unlock(p2m, gfn, 0); >>> + gfn_unlock(p2m, gfn, order); >>> domain_crash(d); >>> return -ENOENT; >>> } >>> else if ( p2m_is_ram(ot) ) >>> { >>> + unsigned long i; >>> + >>> ASSERT(mfn_valid(omfn)); >> Shouldn't this check be extended to the top of the order? > Well, yes, perhaps better to move it into ... > >>> - set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); >>> + for ( i = 0; i < (1UL << order); ++i ) >>> + set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY); > ... the body of the for(). But I'll wait with v6 until we've settled on > the other aspects you raise. 
> >>> int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, >>> - p2m_access_t access) >>> + unsigned int order, p2m_access_t access) >>> { >>> - return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access); >>> + if ( order && >>> + rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn), >>> + mfn_x(mfn) + (1UL << order) - 1) && >>> + !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn), >>> + mfn_x(mfn) + (1UL << order) - 1) ) >>> + return order; >> Should this not be a hard error? Even retrying with a lower order is >> going to fail. > Why? At the latest when order == 0, rangeset_overlaps_range() > will return the same as rangeset_contains_range(), and hence > the condition above will always be false (one of the two reasons > for checking order first here). It isn't the order check which is an issue. One way or another, if the original (mfn/order) fails the rangeset checks, the overall call is going to fail, but it will be re-executed repeatedly with an order decreasing to 0. Wouldn't it be better just to short-circuit this back&forth? Relatedly, is there actually anything wrong with making a superpage read-only mapping over some scattered read-only 4K pages? ~Andrew
>>> On 27.01.16 at 15:28, <andrew.cooper3@citrix.com> wrote: > On 27/01/16 13:37, Jan Beulich wrote: >>>>> On 27.01.16 at 13:32, <andrew.cooper3@citrix.com> wrote: >>> On 25/01/16 16:18, Jan Beulich wrote: >>>> --- a/xen/arch/x86/hvm/vmx/vmx.c >>>> +++ b/xen/arch/x86/hvm/vmx/vmx.c >>>> @@ -2491,7 +2491,7 @@ static int vmx_alloc_vlapic_mapping(stru >>>> share_xen_page_with_guest(pg, d, XENSHARE_writable); >>>> d->arch.hvm_domain.vmx.apic_access_mfn = mfn; >>>> set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn), >>>> - p2m_get_hostp2m(d)->default_access); >>>> + PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access); >>>> >>> This should ASSERT() success, in case we make further changes to the >>> error handling. >> Maybe, but since it didn't before I don't see why this couldn't / >> shouldn't be an independent future patch. > > Can be. IMO it is a bug that it isn't already checked. (-ENOMEM when > allocating p2m leaves perhaps?) Indeed, albeit that means ASSERT() wouldn't be right anyway. I hope the VMX maintainers monitor this and will prepare a patch... 
>>>> --- a/xen/arch/x86/mm/p2m.c >>>> +++ b/xen/arch/x86/mm/p2m.c >>>> @@ -899,48 +899,62 @@ void p2m_change_type_range(struct domain >>>> p2m_unlock(p2m); >>>> } >>>> >>>> -/* Returns: 0 for success, -errno for failure */ >>>> +/* >>>> + * Returns: >>>> + * 0 for success >>>> + * -errno for failure >>>> + * order+1 for caller to retry with order (guaranteed smaller than >>>> + * the order value passed in) >>>> + */ >>>> static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, >>>> - p2m_type_t gfn_p2mt, p2m_access_t access) >>>> + unsigned int order, p2m_type_t gfn_p2mt, >>>> + p2m_access_t access) >>>> { >>>> int rc = 0; >>>> p2m_access_t a; >>>> p2m_type_t ot; >>>> mfn_t omfn; >>>> + unsigned int cur_order = 0; >>>> struct p2m_domain *p2m = p2m_get_hostp2m(d); >>>> >>>> if ( !paging_mode_translate(d) ) >>>> return -EIO; >>>> >>>> - gfn_lock(p2m, gfn, 0); >>>> - omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL); >>>> + gfn_lock(p2m, gfn, order); >>>> + omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL); >>>> + if ( cur_order < order ) >>>> + { >>>> + gfn_unlock(p2m, gfn, order); >>>> + return cur_order + 1; >>> Your comment states that the return value is guarenteed to be less than >>> the passed-in order, but this is not the case here. cur_order could, in >>> principle, be only 1 less than order, at which point your documentation >>> is incorrect. >>> >>> Does this rely on the x86 architectural orders to function as documented? >> No. Maybe the comment text is ambiguous, but I don't see how to >> improve it without making it too lengthy: The return value is >> <order>+1, telling the caller to retry with <order>, which is >> guaranteed to be less than the order that got passed in. I.e. taking >> the variable naming above, the caller would have to retry with >> cur_order, which - due to the if() - is smaller than order. > > Ah - I see. The text is indeed confusing. 
How about: > > "1 + new order: for caller to retry with smaller order (guaranteed to be > smaller than order passed in)" Okay. >>>> int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, >>>> - p2m_access_t access) >>>> + unsigned int order, p2m_access_t access) >>>> { >>>> - return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access); >>>> + if ( order && >>>> + rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn), >>>> + mfn_x(mfn) + (1UL << order) - 1) && >>>> + !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn), >>>> + mfn_x(mfn) + (1UL << order) - 1) ) >>>> + return order; >>> Should this not be a hard error? Even retrying with a lower order is >>> going to fail. >> Why? At the latest when order == 0, rangeset_overlaps_range() >> will return the same as rangeset_contains_range(), and hence >> the condition above will always be false (one of the two reasons >> for checking order first here). > > It isn't the order check which is an issue. > > One way or another, if the original (mfn/order) fails the rangeset > checks, the overall call is going to fail, but it will be re-executed > repeatedly with an order decreasing to 0. Wouldn't it be better just to > short-circuit this back&forth? But this won't necessarily go down to order 0. Short-circuiting would mean taking PAGE_ORDER_2M and PAGE_ORDER_1G into account here, which would imo severely hamper readability. > Relatedly, is there actually anything wrong with making a superpage > read-only mapping over some scattered read-only 4K pages? I'm afraid I don't understand: "scattered pages" and "superpage mapping" don't seem to fit together for me. Jan
On 27/01/16 14:40, Jan Beulich wrote: > >>>>> int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, >>>>> - p2m_access_t access) >>>>> + unsigned int order, p2m_access_t access) >>>>> { >>>>> - return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access); >>>>> + if ( order && >>>>> + rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn), >>>>> + mfn_x(mfn) + (1UL << order) - 1) && >>>>> + !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn), >>>>> + mfn_x(mfn) + (1UL << order) - 1) ) >>>>> + return order; >>>> Should this not be a hard error? Even retrying with a lower order is >>>> going to fail. >>> Why? At the latest when order == 0, rangeset_overlaps_range() >>> will return the same as rangeset_contains_range(), and hence >>> the condition above will always be false (one of the two reasons >>> for checking order first here). >> It isn't the order check which is an issue. >> >> One way or another, if the original (mfn/order) fails the rangeset >> checks, the overall call is going to fail, but it will be re-executed >> repeatedly with an order decreasing to 0. Wouldn't it be better just to >> short-circuit this back&forth? > But this won't necessarily go down to order 0. Short-circuiting > would mean taking PAGE_ORDER_2M and PAGE_ORDER_1G into > account here, which would imo severely hamper readability. Even when this check starts passing, the subsequent set_typed_p2m_entry() will fail for writeable mappings, after having constructed small pages up to the boundary of the RO region. > >> Relatedly, is there actually anything wrong with making a superpage >> read-only mapping over some scattered read-only 4K pages? > I'm afraid I don't understand: "scattered pages" and "superpage > mapping" don't seem to fit together for me. If there is a single 4K page in the RO region, and the caller attempts to create a RO 2M superpage which includes the 4K region, these checks will force the use of 4K mappings even though the 2M mapping would be fine. ~Andrew
>>> On 27.01.16 at 15:51, <andrew.cooper3@citrix.com> wrote: > On 27/01/16 14:40, Jan Beulich wrote: >> >>>>>> int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, >>>>>> - p2m_access_t access) >>>>>> + unsigned int order, p2m_access_t access) >>>>>> { >>>>>> - return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access); >>>>>> + if ( order && >>>>>> + rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn), >>>>>> + mfn_x(mfn) + (1UL << order) - 1) && >>>>>> + !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn), >>>>>> + mfn_x(mfn) + (1UL << order) - 1) ) >>>>>> + return order; >>>>> Should this not be a hard error? Even retrying with a lower order is >>>>> going to fail. >>>> Why? At the latest when order == 0, rangeset_overlaps_range() >>>> will return the same as rangeset_contains_range(), and hence >>>> the condition above will always be false (one of the two reasons >>>> for checking order first here). >>> It isn't the order check which is an issue. >>> >>> One way or another, if the original (mfn/order) fails the rangeset >>> checks, the overall call is going to fail, but it will be re-executed >>> repeatedly with an order decreasing to 0. Wouldn't it be better just to >>> short-circuit this back&forth? >> But this won't necessarily go down to order 0. Short-circuiting >> would mean taking PAGE_ORDER_2M and PAGE_ORDER_1G into >> account here, which would imo severely hamper readability. > > Even when this check starts passing, the subsequent > set_typed_p2m_entry() will fail for writeable mappings, after having > constructed small pages up to the boundary of the RO region. I don't see where such failure would come from: {ept_,}p2m_type_to_flags() silently suppress the mapping becoming writable. What am I overlooking? >>> Relatedly, is there actually anything wrong with making a superpage >>> read-only mapping over some scattered read-only 4K pages? 
>> I'm afraid I don't understand: "scattered pages" and "superpage >> mapping" don't seem to fit together for me. > > If there is a single 4K page in the RO region, and the caller attempts > to create a RO 2M superpage which includes the 4K region, these checks > will force the use of 4K mappings even though the 2M mapping would be fine. Oh, so you want "access" to also be taken into account. Not sure that's worth it right now - r/o MMIO mappings shouldn't occur very often (and map_mmio_regions() passes ->default_access anyway). Jan
--- a/tools/libxc/xc_domain.c +++ b/tools/libxc/xc_domain.c @@ -2174,7 +2174,7 @@ int xc_domain_memory_mapping( { DECLARE_DOMCTL; xc_dominfo_t info; - int ret = 0, err; + int ret = 0, rc; unsigned long done = 0, nr, max_batch_sz; if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 || @@ -2199,19 +2199,24 @@ int xc_domain_memory_mapping( domctl.u.memory_mapping.nr_mfns = nr; domctl.u.memory_mapping.first_gfn = first_gfn + done; domctl.u.memory_mapping.first_mfn = first_mfn + done; - err = do_domctl(xch, &domctl); - if ( err && errno == E2BIG ) + rc = do_domctl(xch, &domctl); + if ( rc < 0 && errno == E2BIG ) { if ( max_batch_sz <= 1 ) break; max_batch_sz >>= 1; continue; } + if ( rc > 0 ) + { + done += rc; + continue; + } /* Save the first error... */ if ( !ret ) - ret = err; + ret = rc; /* .. and ignore the rest of them when removing. */ - if ( err && add_mapping != DPCI_REMOVE_MAPPING ) + if ( rc && add_mapping != DPCI_REMOVE_MAPPING ) break; done += nr; --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -436,7 +436,8 @@ static __init void pvh_add_mem_mapping(s else a = p2m_access_rw; - if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), a)) ) + if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), + PAGE_ORDER_4K, a)) ) panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n", gfn, mfn, i, rc); if ( !(i & 0xfffff) ) --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -2491,7 +2491,7 @@ static int vmx_alloc_vlapic_mapping(stru share_xen_page_with_guest(pg, d, XENSHARE_writable); d->arch.hvm_domain.vmx.apic_access_mfn = mfn; set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn), - p2m_get_hostp2m(d)->default_access); + PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access); return 0; } --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -899,48 +899,62 @@ void p2m_change_type_range(struct domain p2m_unlock(p2m); } -/* Returns: 0 for success, -errno for failure */ +/* + * Returns: + * 0 for success + * 
-errno for failure + * order+1 for caller to retry with order (guaranteed smaller than + * the order value passed in) + */ static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, - p2m_type_t gfn_p2mt, p2m_access_t access) + unsigned int order, p2m_type_t gfn_p2mt, + p2m_access_t access) { int rc = 0; p2m_access_t a; p2m_type_t ot; mfn_t omfn; + unsigned int cur_order = 0; struct p2m_domain *p2m = p2m_get_hostp2m(d); if ( !paging_mode_translate(d) ) return -EIO; - gfn_lock(p2m, gfn, 0); - omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL); + gfn_lock(p2m, gfn, order); + omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL); + if ( cur_order < order ) + { + gfn_unlock(p2m, gfn, order); + return cur_order + 1; + } if ( p2m_is_grant(ot) || p2m_is_foreign(ot) ) { - gfn_unlock(p2m, gfn, 0); + gfn_unlock(p2m, gfn, order); domain_crash(d); return -ENOENT; } else if ( p2m_is_ram(ot) ) { + unsigned long i; + ASSERT(mfn_valid(omfn)); - set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); + for ( i = 0; i < (1UL << order); ++i ) + set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY); } P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn)); - rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt, - access); + rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access); if ( rc ) - gdprintk(XENLOG_ERR, - "p2m_set_entry failed! 
mfn=%08lx rc:%d\n", - mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)), rc); + gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n", + gfn, order, rc, mfn_x(mfn)); else if ( p2m_is_pod(ot) ) { pod_lock(p2m); - p2m->pod.entry_count--; + p2m->pod.entry_count -= 1UL << order; BUG_ON(p2m->pod.entry_count < 0); pod_unlock(p2m); } - gfn_unlock(p2m, gfn, 0); + gfn_unlock(p2m, gfn, order); return rc; } @@ -949,14 +963,21 @@ static int set_typed_p2m_entry(struct do static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) { - return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign, + return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign, p2m_get_hostp2m(d)->default_access); } int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, - p2m_access_t access) + unsigned int order, p2m_access_t access) { - return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access); + if ( order && + rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn), + mfn_x(mfn) + (1UL << order) - 1) && + !rangeset_contains_range(mmio_ro_ranges, mfn_x(mfn), + mfn_x(mfn) + (1UL << order) - 1) ) + return order; + + return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct, access); } int set_identity_p2m_entry(struct domain *d, unsigned long gfn, @@ -1009,20 +1030,33 @@ int set_identity_p2m_entry(struct domain return ret; } -/* Returns: 0 for success, -errno for failure */ -int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) +/* + * Returns: + * 0 for success + * -errno for failure + * order+1 for caller to retry with order (guaranteed smaller than + * the order value passed in) + */ +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, + unsigned int order) { int rc = -EINVAL; mfn_t actual_mfn; p2m_access_t a; p2m_type_t t; + unsigned int cur_order = 0; struct p2m_domain *p2m = p2m_get_hostp2m(d); if ( !paging_mode_translate(d) ) return -EIO; - gfn_lock(p2m, gfn, 0); - actual_mfn = 
p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL); + gfn_lock(p2m, gfn, order); + actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL); + if ( cur_order < order ) + { + rc = cur_order + 1; + goto out; + } /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */ if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) ) @@ -1035,11 +1069,11 @@ int clear_mmio_p2m_entry(struct domain * gdprintk(XENLOG_WARNING, "no mapping between mfn %08lx and gfn %08lx\n", mfn_x(mfn), gfn); - rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid, + rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid, p2m->default_access); out: - gfn_unlock(p2m, gfn, 0); + gfn_unlock(p2m, gfn, order); return rc; } @@ -2095,6 +2129,25 @@ void *map_domain_gfn(struct p2m_domain * return map_domain_page(*mfn); } +static unsigned int mmio_order(const struct domain *d, + unsigned long start_fn, unsigned long nr) +{ + if ( !need_iommu(d) || !iommu_use_hap_pt(d) || + (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) ) + return 0; + + if ( !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> PAGE_ORDER_1G) && + hap_has_1gb ) + return PAGE_ORDER_1G; + + if ( hap_has_2mb ) + return PAGE_ORDER_2M; + + return 0; +} + +#define MAP_MMIO_MAX_ITER 64 /* pretty arbitrary */ + int map_mmio_regions(struct domain *d, unsigned long start_gfn, unsigned long nr, @@ -2102,22 +2155,29 @@ int map_mmio_regions(struct domain *d, { int ret = 0; unsigned long i; + unsigned int iter, order; if ( !paging_mode_translate(d) ) return 0; - for ( i = 0; !ret && i < nr; i++ ) + for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER; + i += 1UL << order, ++iter ) { - ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), - p2m_get_hostp2m(d)->default_access); - if ( ret ) + /* OR'ing gfn and mfn values will return an order suitable to both. 
*/ + for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ; + order = ret - 1 ) { - unmap_mmio_regions(d, start_gfn, i, mfn); - break; + ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order, + p2m_get_hostp2m(d)->default_access); + if ( ret <= 0 ) + break; + ASSERT(ret <= order); } + if ( ret < 0 ) + break; } - return ret; + return i == nr ? 0 : i ?: ret; } int unmap_mmio_regions(struct domain *d, @@ -2125,20 +2185,30 @@ int unmap_mmio_regions(struct domain *d, unsigned long nr, unsigned long mfn) { - int err = 0; + int ret = 0; unsigned long i; + unsigned int iter, order; if ( !paging_mode_translate(d) ) return 0; - for ( i = 0; i < nr; i++ ) + for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER; + i += 1UL << order, ++iter ) { - int ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i)); - if ( ret ) - err = ret; + /* OR'ing gfn and mfn values will return an order suitable to both. */ + for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ; + order = ret - 1 ) + { + ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order); + if ( ret <= 0 ) + break; + ASSERT(ret <= order); + } + if ( ret < 0 ) + break; } - return err; + return i == nr ? 
0 : i ?: ret; } unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp) --- a/xen/arch/x86/mm/p2m-ept.c +++ b/xen/arch/x86/mm/p2m-ept.c @@ -136,6 +136,7 @@ static void ept_p2m_type_to_flags(struct entry->r = entry->x = 1; entry->w = !rangeset_contains_singleton(mmio_ro_ranges, entry->mfn); + ASSERT(entry->w || !is_epte_superpage(entry)); entry->a = !!cpu_has_vmx_ept_ad; entry->d = entry->w && cpu_has_vmx_ept_ad; break; --- a/xen/arch/x86/mm/p2m-pt.c +++ b/xen/arch/x86/mm/p2m-pt.c @@ -72,7 +72,8 @@ static const unsigned long pgt[] = { PGT_l3_page_table }; -static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) +static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn, + unsigned int level) { unsigned long flags; /* @@ -107,6 +108,8 @@ static unsigned long p2m_type_to_flags(p case p2m_mmio_direct: if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) ) flags |= _PAGE_RW; + else + ASSERT(!level); return flags | P2M_BASE_FLAGS | _PAGE_PCD; } } @@ -436,7 +439,7 @@ static int do_recalc(struct p2m_domain * p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) ? p2m_ram_logdirty : p2m_ram_rw; unsigned long mfn = l1e_get_pfn(e); - unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn)); + unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn), level); if ( level ) { @@ -573,7 +576,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct); l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) ? 
l3e_from_pfn(mfn_x(mfn), - p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE) + p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE) : l3e_empty(); entry_content.l1 = l3e_content.l3; @@ -609,7 +612,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) ) entry_content = p2m_l1e_from_pfn(mfn_x(mfn), - p2m_type_to_flags(p2mt, mfn)); + p2m_type_to_flags(p2mt, mfn, 0)); else entry_content = l1e_empty(); @@ -645,7 +648,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct); if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) ) l2e_content = l2e_from_pfn(mfn_x(mfn), - p2m_type_to_flags(p2mt, mfn) | + p2m_type_to_flags(p2mt, mfn, 1) | _PAGE_PSE); else l2e_content = l2e_empty(); --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -1046,10 +1046,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe (gfn + nr_mfns - 1) < gfn ) /* wrap? */ break; +#ifndef CONFIG_X86 /* XXX ARM!? */ ret = -E2BIG; /* Must break hypercall up as this could take a while. 
*/ if ( nr_mfns > 64 ) break; +#endif ret = -EPERM; if ( !iomem_access_permitted(current->domain, mfn, mfn_end) || @@ -1067,7 +1069,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe d->domain_id, gfn, mfn, nr_mfns); ret = map_mmio_regions(d, gfn, nr_mfns, mfn); - if ( ret ) + if ( ret < 0 ) printk(XENLOG_G_WARNING "memory_map:fail: dom%d gfn=%lx mfn=%lx nr=%lx ret:%ld\n", d->domain_id, gfn, mfn, nr_mfns, ret); @@ -1079,7 +1081,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe d->domain_id, gfn, mfn, nr_mfns); ret = unmap_mmio_regions(d, gfn, nr_mfns, mfn); - if ( ret && is_hardware_domain(current->domain) ) + if ( ret < 0 && is_hardware_domain(current->domain) ) printk(XENLOG_ERR "memory_map: error %ld removing dom%d access to [%lx,%lx]\n", ret, d->domain_id, mfn, mfn_end); --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -259,7 +259,7 @@ int guest_remove_page(struct domain *d, } if ( p2mt == p2m_mmio_direct ) { - clear_mmio_p2m_entry(d, gmfn, _mfn(mfn)); + clear_mmio_p2m_entry(d, gmfn, _mfn(mfn), 0); put_gfn(d, gmfn); return 1; } --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -574,8 +574,9 @@ int p2m_is_logdirty_range(struct p2m_dom /* Set mmio addresses in the p2m table (for pass-through) */ int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, - p2m_access_t access); -int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn); + unsigned int order, p2m_access_t access); +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, + unsigned int order); /* Set identity addresses in the p2m table (for pass-through) */ int set_identity_p2m_entry(struct domain *d, unsigned long gfn, --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -542,8 +542,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_ /* Bind machine I/O address range -> HVM address range. */ -/* If this returns -E2BIG lower nr_mfns value. 
*/ /* XEN_DOMCTL_memory_mapping */ +/* Returns + - zero success, everything done + - -E2BIG passed in nr_mfns value too large for the implementation + - positive partial success for the first <result> page frames (with + <result> less than nr_mfns), requiring re-invocation by the + caller after updating inputs + - negative error; other than -E2BIG +*/ #define DPCI_ADD_MAPPING 1 #define DPCI_REMOVE_MAPPING 0 struct xen_domctl_memory_mapping {