Message ID | 56BB3B9F02000078000D0856@prv-mh.provo.novell.com (mailing list archive) |
---|---|
State | New, archived |
On 10/02/16 12:31, Jan Beulich wrote:
> When mapping large BARs (e.g. the frame buffer of a graphics card) the
> overhead of establishing such mappings using only 4k pages has,
> particularly after the XSA-125 fix, become unacceptable. Alter the
> XEN_DOMCTL_memory_mapping semantics once again, so that there's no
> longer a fixed amount of guest frames that represents the upper limit
> of what a single invocation can map. Instead bound execution time by
> limiting the number of iterations (regardless of page size).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>
> Acked-by: Kevin Tian <kevin.tian@intel.com>
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

Reviewed-by: George Dunlap <george.dunlap@citrix.com>

> ---
> Open issues (perhaps for subsequent changes):
> - ARM side unimplemented (and hence libxc for now made cope with both
>   models), the main issue (besides my inability to test any change
>   there) being the many internal uses of map_mmio_regions())
> - iommu_{,un}map_page() interfaces don't support "order" (see the
>   comment in mmio_order())
> ---
> v8: Fix set_mmio_p2m_entry(): Mappings overlapping with r/o MMIO
>     regions must be strictly 4k only for now. Comment the
>     !iommu_use_hap_pt() in mmio_order().
> v7: Only allow use of 2M mappings for the time being.
> v6: Move an mfn_valid() assertion to cover the full MFN range. Use
>     PAGE_ORDER_4K in mmio_order(). Improve the return value description
>     of set_typed_p2m_entry().
> v5: Refine comment in domctl.h.
> v4: Move cleanup duty entirely to the caller of the hypercall. Move
>     return value description from commit message to domctl.h.
> v3: Re-base on top of "x86/hvm: fold opt_hap_{2mb,1gb} into
>     hap_capabilities". Extend description to spell out new return value
>     meaning. Add a couple of code comments. Use PAGE_ORDER_4K instead
>     of literal 0. Take into consideration r/o MMIO pages.
> v2: Produce valid entries for large p2m_mmio_direct mappings in
>     p2m_pt_set_entry(). Don't open code iommu_use_hap_pt() in
>     mmio_order(). Update function comment of set_typed_p2m_entry() and
>     clear_mmio_p2m_entry(). Use PRI_mfn. Add ASSERT()s to
>     {,un}map_mmio_regions() to detect otherwise endless loops.
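Before the patch itself, here is a minimal, self-contained sketch (not part of the patch) of the caller-side retry convention the description above introduces: zero means everything was mapped, a positive value is the number of frames actually handled, and a negative value is an error. issue_mapping() is a hypothetical stand-in for the real do_domctl()/XEN_DOMCTL_memory_mapping invocation.

#include <stdio.h>

/*
 * Hypothetical stand-in for the real hypercall: returns 0 when everything
 * requested was mapped, a positive count of frames handled on partial
 * success, or a negative value on error.  Here it simply pretends the
 * hypervisor caps each invocation at 64 frames.
 */
static long issue_mapping(unsigned long gfn, unsigned long mfn,
                          unsigned long nr)
{
    (void)gfn;
    (void)mfn;
    return nr > 64 ? 64 : 0;
}

/* Retry loop matching the new convention: advance by the returned count. */
static int map_range(unsigned long gfn, unsigned long mfn, unsigned long nr)
{
    unsigned long done = 0;

    while ( done < nr )
    {
        long rc = issue_mapping(gfn + done, mfn + done, nr - done);

        if ( rc > 0 )            /* partial success: advance and re-invoke */
        {
            done += (unsigned long)rc;
            continue;
        }
        if ( rc < 0 )            /* error (including -E2BIG from old Xen) */
            return (int)rc;
        break;                   /* 0: everything remaining was mapped */
    }
    return 0;
}

int main(void)
{
    printf("rc=%d\n", map_range(0x100000, 0x200000, 1000));
    return 0;
}

The libxc change in the patch below follows the same pattern (done += rc; continue;), while keeping the old halve-the-batch fallback for hypervisors that still return -E2BIG.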
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -2229,7 +2229,7 @@ int xc_domain_memory_mapping(
 {
     DECLARE_DOMCTL;
     xc_dominfo_t info;
-    int ret = 0, err;
+    int ret = 0, rc;
     unsigned long done = 0, nr, max_batch_sz;
 
     if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 ||
@@ -2254,19 +2254,24 @@ int xc_domain_memory_mapping(
         domctl.u.memory_mapping.nr_mfns = nr;
         domctl.u.memory_mapping.first_gfn = first_gfn + done;
         domctl.u.memory_mapping.first_mfn = first_mfn + done;
-        err = do_domctl(xch, &domctl);
-        if ( err && errno == E2BIG )
+        rc = do_domctl(xch, &domctl);
+        if ( rc < 0 && errno == E2BIG )
         {
             if ( max_batch_sz <= 1 )
                 break;
             max_batch_sz >>= 1;
             continue;
         }
+        if ( rc > 0 )
+        {
+            done += rc;
+            continue;
+        }
         /* Save the first error... */
         if ( !ret )
-            ret = err;
+            ret = rc;
         /* .. and ignore the rest of them when removing. */
-        if ( err && add_mapping != DPCI_REMOVE_MAPPING )
+        if ( rc && add_mapping != DPCI_REMOVE_MAPPING )
             break;
 
         done += nr;
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -436,7 +436,8 @@ static __init void pvh_add_mem_mapping(s
         else
             a = p2m_access_rw;
 
-        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), a)) )
+        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i),
+                                      PAGE_ORDER_4K, a)) )
             panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n",
                   gfn, mfn, i, rc);
         if ( !(i & 0xfffff) )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2504,7 +2504,7 @@ static int vmx_alloc_vlapic_mapping(stru
     share_xen_page_with_guest(pg, d, XENSHARE_writable);
     d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
     set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
-                       p2m_get_hostp2m(d)->default_access);
+                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
 
     return 0;
 }
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -900,48 +900,64 @@ void p2m_change_type_range(struct domain
     p2m_unlock(p2m);
 }
 
-/* Returns: 0 for success, -errno for failure */
+/*
+ * Returns:
+ *    0              for success
+ *    -errno         for failure
+ *    1 + new order  for caller to retry with smaller order (guaranteed
+ *                   to be smaller than order passed in)
+ */
 static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                               p2m_type_t gfn_p2mt, p2m_access_t access)
+                               unsigned int order, p2m_type_t gfn_p2mt,
+                               p2m_access_t access)
 {
     int rc = 0;
     p2m_access_t a;
     p2m_type_t ot;
     mfn_t omfn;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        gfn_unlock(p2m, gfn, order);
+        return cur_order + 1;
+    }
     if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
     {
-        gfn_unlock(p2m, gfn, 0);
+        gfn_unlock(p2m, gfn, order);
         domain_crash(d);
         return -ENOENT;
     }
     else if ( p2m_is_ram(ot) )
     {
-        ASSERT(mfn_valid(omfn));
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        unsigned long i;
+
+        for ( i = 0; i < (1UL << order); ++i )
+        {
+            ASSERT(mfn_valid(_mfn(mfn_x(omfn) + i)));
+            set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
+        }
     }
 
     P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn));
-    rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt,
-                       access);
+    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
     if ( rc )
-        gdprintk(XENLOG_ERR,
-                 "p2m_set_entry failed! mfn=%08lx rc:%d\n",
-                 mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)), rc);
+        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
+                 gfn, order, rc, mfn_x(mfn));
     else if ( p2m_is_pod(ot) )
     {
         pod_lock(p2m);
-        p2m->pod.entry_count--;
+        p2m->pod.entry_count -= 1UL << order;
         BUG_ON(p2m->pod.entry_count < 0);
         pod_unlock(p2m);
     }
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -950,14 +966,19 @@ static int set_typed_p2m_entry(struct do
 static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
                                  mfn_t mfn)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
+    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
                                p2m_get_hostp2m(d)->default_access);
 }
 
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access)
+                       unsigned int order, p2m_access_t access)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
+    if ( order > PAGE_ORDER_4K &&
+         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
+                                 mfn_x(mfn) + (1UL << order) - 1) )
+        return PAGE_ORDER_4K + 1;
+
+    return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct, access);
 }
 
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
@@ -1010,20 +1031,33 @@ int set_identity_p2m_entry(struct domain
     return ret;
 }
 
-/* Returns: 0 for success, -errno for failure */
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+/*
+ * Returns:
+ *    0        for success
+ *    -errno   for failure
+ *    order+1  for caller to retry with order (guaranteed smaller than
+ *             the order value passed in)
+ */
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order)
 {
     int rc = -EINVAL;
     mfn_t actual_mfn;
     p2m_access_t a;
     p2m_type_t t;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        rc = cur_order + 1;
+        goto out;
+    }
 
     /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */
     if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) )
@@ -1036,11 +1070,11 @@ int clear_mmio_p2m_entry(struct domain *
         gdprintk(XENLOG_WARNING,
                  "no mapping between mfn %08lx and gfn %08lx\n",
                  mfn_x(mfn), gfn);
-    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid,
+    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid,
                        p2m->default_access);
 
  out:
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -2062,6 +2096,37 @@ void *map_domain_gfn(struct p2m_domain *
     return map_domain_page(*mfn);
 }
 
+static unsigned int mmio_order(const struct domain *d,
+                               unsigned long start_fn, unsigned long nr)
+{
+    /*
+     * Note that the !iommu_use_hap_pt() here has three effects:
+     * - cover iommu_{,un}map_page() not having an "order" input yet,
+     * - exclude shadow mode (which doesn't support large MMIO mappings),
+     * - exclude PV guests, should execution reach this code for such.
+     * So be careful when altering this.
+     */
+    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
+         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) )
+        return PAGE_ORDER_4K;
+
+    if ( 0 /*
+            * Don't use 1Gb pages, to limit the iteration count in
+            * set_typed_p2m_entry() when it needs to zap M2P entries
+            * for a RAM range.
+            */ &&
+         !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> PAGE_ORDER_1G) &&
+         hap_has_1gb )
+        return PAGE_ORDER_1G;
+
+    if ( hap_has_2mb )
+        return PAGE_ORDER_2M;
+
+    return PAGE_ORDER_4K;
+}
+
+#define MAP_MMIO_MAX_ITER 64 /* pretty arbitrary */
+
 int map_mmio_regions(struct domain *d,
                      unsigned long start_gfn,
                      unsigned long nr,
@@ -2069,22 +2134,29 @@ int map_mmio_regions(struct domain *d,
 {
     int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; !ret && i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i),
-                                 p2m_get_hostp2m(d)->default_access);
-        if ( ret )
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
         {
-            unmap_mmio_regions(d, start_gfn, i, mfn);
-            break;
+            ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order,
+                                     p2m_get_hostp2m(d)->default_access);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
         }
+        if ( ret < 0 )
+            break;
     }
 
-    return ret;
+    return i == nr ? 0 : i ?: ret;
 }
 
 int unmap_mmio_regions(struct domain *d,
@@ -2092,20 +2164,30 @@ int unmap_mmio_regions(struct domain *d,
                        unsigned long nr,
                        unsigned long mfn)
 {
-    int err = 0;
+    int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        int ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i));
-        if ( ret )
-            err = ret;
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
+        {
+            ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
+        }
+        if ( ret < 0 )
+            break;
     }
 
-    return err;
+    return i == nr ? 0 : i ?: ret;
 }
 
 unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp)
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -136,6 +136,7 @@ static void ept_p2m_type_to_flags(struct
             entry->r = entry->x = 1;
             entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
                                                     entry->mfn);
+            ASSERT(entry->w || !is_epte_superpage(entry));
             entry->a = !!cpu_has_vmx_ept_ad;
             entry->d = entry->w && cpu_has_vmx_ept_ad;
             break;
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -72,7 +72,8 @@ static const unsigned long pgt[] = {
     PGT_l3_page_table
 };
 
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
+                                       unsigned int level)
 {
     unsigned long flags;
     /*
@@ -107,6 +108,8 @@ static unsigned long p2m_type_to_flags(p
     case p2m_mmio_direct:
         if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
             flags |= _PAGE_RW;
+        else
+            ASSERT(!level);
         return flags | P2M_BASE_FLAGS | _PAGE_PCD;
     }
 }
@@ -436,7 +439,7 @@ static int do_recalc(struct p2m_domain *
             p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
                               ? p2m_ram_logdirty : p2m_ram_rw;
             unsigned long mfn = l1e_get_pfn(e);
-            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn));
+            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn), level);
 
             if ( level )
             {
@@ -573,7 +576,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt)
             ? l3e_from_pfn(mfn_x(mfn),
-                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
+                           p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
 
@@ -609,7 +612,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
 
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
-                                             p2m_type_to_flags(p2mt, mfn));
+                                             p2m_type_to_flags(p2mt, mfn, 0));
         else
             entry_content = l1e_empty();
 
@@ -645,7 +648,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt, mfn) |
+                                       p2m_type_to_flags(p2mt, mfn, 1) |
                                        _PAGE_PSE);
         else
             l2e_content = l2e_empty();
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -1046,10 +1046,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
              (gfn + nr_mfns - 1) < gfn ) /* wrap? */
             break;
 
+#ifndef CONFIG_X86 /* XXX ARM!? */
         ret = -E2BIG;
         /* Must break hypercall up as this could take a while. */
         if ( nr_mfns > 64 )
             break;
+#endif
 
         ret = -EPERM;
         if ( !iomem_access_permitted(current->domain, mfn, mfn_end) ||
@@ -1067,7 +1069,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = map_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret )
+            if ( ret < 0 )
                 printk(XENLOG_G_WARNING
                        "memory_map:fail: dom%d gfn=%lx mfn=%lx nr=%lx ret:%ld\n",
                        d->domain_id, gfn, mfn, nr_mfns, ret);
@@ -1079,7 +1081,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = unmap_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret && is_hardware_domain(current->domain) )
+            if ( ret < 0 && is_hardware_domain(current->domain) )
                 printk(XENLOG_ERR
                        "memory_map: error %ld removing dom%d access to [%lx,%lx]\n",
                        ret, d->domain_id, mfn, mfn_end);
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -259,7 +259,7 @@ int guest_remove_page(struct domain *d,
         }
         if ( p2mt == p2m_mmio_direct )
         {
-            clear_mmio_p2m_entry(d, gmfn, _mfn(mfn));
+            clear_mmio_p2m_entry(d, gmfn, _mfn(mfn), 0);
             put_gfn(d, gmfn);
             return 1;
         }
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -574,8 +574,9 @@ int p2m_is_logdirty_range(struct p2m_dom
 
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access);
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
+                       unsigned int order, p2m_access_t access);
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order);
 
 /* Set identity addresses in the p2m table (for pass-through) */
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -545,8 +545,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_
 
 
 /* Bind machine I/O address range -> HVM address range. */
-/* If this returns -E2BIG lower nr_mfns value. */
 /* XEN_DOMCTL_memory_mapping */
+/* Returns
+   - zero     success, everything done
+   - -E2BIG   passed in nr_mfns value too large for the implementation
+   - positive partial success for the first <result> page frames (with
+              <result> less than nr_mfns), requiring re-invocation by the
+              caller after updating inputs
+   - negative error; other than -E2BIG
+*/
 #define DPCI_ADD_MAPPING         1
 #define DPCI_REMOVE_MAPPING      0
 struct xen_domctl_memory_mapping {
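As an aside on the "OR'ing gfn and mfn values will return an order suitable to both" comment in map_mmio_regions()/unmap_mmio_regions() above, here is a small standalone sketch showing why the OR of the two frame numbers captures the alignment constraint of both. It assumes PAGE_ORDER_* values matching Xen's (0, 9, 18); pick_order() is illustrative only and is not the patch's mmio_order(), which additionally checks IOMMU/HAP state and hap_has_{2mb,1gb}.

#include <stdio.h>

#define PAGE_ORDER_4K 0
#define PAGE_ORDER_2M 9
#define PAGE_ORDER_1G 18

/*
 * Largest order whose alignment both frame numbers satisfy and that does
 * not exceed the number of frames still to be mapped: a low bit set in
 * either gfn or mfn shows up in the OR and rules the larger order out.
 */
static unsigned int pick_order(unsigned long gfn, unsigned long mfn,
                               unsigned long nr)
{
    unsigned long both = gfn | mfn;

    if ( !(both & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> PAGE_ORDER_1G) )
        return PAGE_ORDER_1G;
    if ( !(both & ((1UL << PAGE_ORDER_2M) - 1)) && (nr >> PAGE_ORDER_2M) )
        return PAGE_ORDER_2M;
    return PAGE_ORDER_4K;
}

int main(void)
{
    /* 2M-aligned gfn but only 4k-aligned mfn: falls back to 4k pages. */
    printf("%u\n", pick_order(0x200, 0x201, 0x400));   /* prints 0 */
    /* Both 2M-aligned and at least 512 frames remaining: 2M pages.  */
    printf("%u\n", pick_order(0x200, 0x600, 0x400));   /* prints 9 */
    return 0;
}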