diff mbox

[v8] x86/p2m: use large pages for MMIO mappings

Message ID 56BB3B9F02000078000D0856@prv-mh.provo.novell.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Beulich Feb. 10, 2016, 12:31 p.m. UTC
When mapping large BARs (e.g. the frame buffer of a graphics card) the
overhead of establishing such mappings using only 4k pages has,
particularly after the XSA-125 fix, become unacceptable. Alter the
XEN_DOMCTL_memory_mapping semantics once again, so that there's no
longer a fixed amount of guest frames that represents the upper limit
of what a single invocation can map. Instead bound execution time by
limiting the number of iterations (regardless of page size).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
Open issues (perhaps for subsequent changes):
- ARM side unimplemented (and hence libxc for now made cope with both
  models), the main issue (besides my inability to test any change
  there) being the many internal uses of map_mmio_regions())
- iommu_{,un}map_page() interfaces don't support "order" (see the
  comment in mmio_order())
---
v8: Fix set_mmio_p2m_entry(): Mappings overlapping with r/o MMIO
    regions must be strictly 4k only for now. Comment the
    !iommu_use_hap_pt() in mmio_order().
v7: Only allow use of 2M mappings for the time being.
v6: Move an mfn_valid() assertion to cover the full MFN range. Use
    PAGE_ORDER_4K in mmio_order(). Improve the return value description
    of set_typed_p2m_entry().
v5: Refine comment in domctl.h.
v4: Move cleanup duty entirely to the caller of the hypercall. Move
    return value description to from commit message to domctl.h.
v3: Re-base on top of "x86/hvm: fold opt_hap_{2mb,1gb} into
    hap_capabilities". Extend description to spell out new return value
    meaning. Add a couple of code comments. Use PAGE_ORDER_4K instead
    of literal 0. Take into consideration r/o MMIO pages.
v2: Produce valid entries for large p2m_mmio_direct mappings in
    p2m_pt_set_entry(). Don't open code iommu_use_hap_pt() in
    mmio_order(). Update function comment of set_typed_p2m_entry() and
    clear_mmio_p2m_entry(). Use PRI_mfn. Add ASSERT()s to
    {,un}map_mmio_regions() to detect otherwise endless loops.
x86/p2m: use large pages for MMIO mappings

When mapping large BARs (e.g. the frame buffer of a graphics card) the
overhead of establishing such mappings using only 4k pages has,
particularly after the XSA-125 fix, become unacceptable. Alter the
XEN_DOMCTL_memory_mapping semantics once again, so that there's no
longer a fixed amount of guest frames that represents the upper limit
of what a single invocation can map. Instead bound execution time by
limiting the number of iterations (regardless of page size).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Acked-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
Open issues (perhaps for subsequent changes):
- ARM side unimplemented (and hence libxc for now made cope with both
  models), the main issue (besides my inability to test any change
  there) being the many internal uses of map_mmio_regions())
- iommu_{,un}map_page() interfaces don't support "order" (see the
  comment in mmio_order())
---
v8: Fix set_mmio_p2m_entry(): Mappings overlapping with r/o MMIO
    regions must be strictly 4k only for now. Comment the
    !iommu_use_hap_pt() in mmio_order().
v7: Only allow use of 2M mappings for the time being.
v6: Move an mfn_valid() assertion to cover the full MFN range. Use
    PAGE_ORDER_4K in mmio_order(). Improve the return value description
    of set_typed_p2m_entry().
v5: Refine comment in domctl.h.
v4: Move cleanup duty entirely to the caller of the hypercall. Move
    return value description to from commit message to domctl.h.
v3: Re-base on top of "x86/hvm: fold opt_hap_{2mb,1gb} into
    hap_capabilities". Extend description to spell out new return value
    meaning. Add a couple of code comments. Use PAGE_ORDER_4K instead
    of literal 0. Take into consideration r/o MMIO pages.
v2: Produce valid entries for large p2m_mmio_direct mappings in
    p2m_pt_set_entry(). Don't open code iommu_use_hap_pt() in
    mmio_order(). Update function comment of set_typed_p2m_entry() and
    clear_mmio_p2m_entry(). Use PRI_mfn. Add ASSERT()s to
    {,un}map_mmio_regions() to detect otherwise endless loops.

--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -2229,7 +2229,7 @@ int xc_domain_memory_mapping(
 {
     DECLARE_DOMCTL;
     xc_dominfo_t info;
-    int ret = 0, err;
+    int ret = 0, rc;
     unsigned long done = 0, nr, max_batch_sz;
 
     if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 ||
@@ -2254,19 +2254,24 @@ int xc_domain_memory_mapping(
         domctl.u.memory_mapping.nr_mfns = nr;
         domctl.u.memory_mapping.first_gfn = first_gfn + done;
         domctl.u.memory_mapping.first_mfn = first_mfn + done;
-        err = do_domctl(xch, &domctl);
-        if ( err && errno == E2BIG )
+        rc = do_domctl(xch, &domctl);
+        if ( rc < 0 && errno == E2BIG )
         {
             if ( max_batch_sz <= 1 )
                 break;
             max_batch_sz >>= 1;
             continue;
         }
+        if ( rc > 0 )
+        {
+            done += rc;
+            continue;
+        }
         /* Save the first error... */
         if ( !ret )
-            ret = err;
+            ret = rc;
         /* .. and ignore the rest of them when removing. */
-        if ( err && add_mapping != DPCI_REMOVE_MAPPING )
+        if ( rc && add_mapping != DPCI_REMOVE_MAPPING )
             break;
 
         done += nr;
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -436,7 +436,8 @@ static __init void pvh_add_mem_mapping(s
         else
             a = p2m_access_rw;
 
-        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), a)) )
+        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i),
+                                      PAGE_ORDER_4K, a)) )
             panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n",
                   gfn, mfn, i, rc);
         if ( !(i & 0xfffff) )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2504,7 +2504,7 @@ static int vmx_alloc_vlapic_mapping(stru
     share_xen_page_with_guest(pg, d, XENSHARE_writable);
     d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
     set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
-                       p2m_get_hostp2m(d)->default_access);
+                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
 
     return 0;
 }
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -900,48 +900,64 @@ void p2m_change_type_range(struct domain
     p2m_unlock(p2m);
 }
 
-/* Returns: 0 for success, -errno for failure */
+/*
+ * Returns:
+ *    0              for success
+ *    -errno         for failure
+ *    1 + new order  for caller to retry with smaller order (guaranteed
+ *                   to be smaller than order passed in)
+ */
 static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                               p2m_type_t gfn_p2mt, p2m_access_t access)
+                               unsigned int order, p2m_type_t gfn_p2mt,
+                               p2m_access_t access)
 {
     int rc = 0;
     p2m_access_t a;
     p2m_type_t ot;
     mfn_t omfn;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        gfn_unlock(p2m, gfn, order);
+        return cur_order + 1;
+    }
     if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
     {
-        gfn_unlock(p2m, gfn, 0);
+        gfn_unlock(p2m, gfn, order);
         domain_crash(d);
         return -ENOENT;
     }
     else if ( p2m_is_ram(ot) )
     {
-        ASSERT(mfn_valid(omfn));
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        unsigned long i;
+
+        for ( i = 0; i < (1UL << order); ++i )
+        {
+            ASSERT(mfn_valid(_mfn(mfn_x(omfn) + i)));
+            set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
+        }
     }
 
     P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn));
-    rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt,
-                       access);
+    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
     if ( rc )
-        gdprintk(XENLOG_ERR,
-                 "p2m_set_entry failed! mfn=%08lx rc:%d\n",
-                 mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)), rc);
+        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
+                 gfn, order, rc, mfn_x(mfn));
     else if ( p2m_is_pod(ot) )
     {
         pod_lock(p2m);
-        p2m->pod.entry_count--;
+        p2m->pod.entry_count -= 1UL << order;
         BUG_ON(p2m->pod.entry_count < 0);
         pod_unlock(p2m);
     }
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -950,14 +966,19 @@ static int set_typed_p2m_entry(struct do
 static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
                                  mfn_t mfn)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
+    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
                                p2m_get_hostp2m(d)->default_access);
 }
 
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access)
+                       unsigned int order, p2m_access_t access)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
+    if ( order > PAGE_ORDER_4K &&
+         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
+                                 mfn_x(mfn) + (1UL << order) - 1) )
+        return PAGE_ORDER_4K + 1;
+
+    return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct, access);
 }
 
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
@@ -1010,20 +1031,33 @@ int set_identity_p2m_entry(struct domain
     return ret;
 }
 
-/* Returns: 0 for success, -errno for failure */
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+/*
+ * Returns:
+ *    0        for success
+ *    -errno   for failure
+ *    order+1  for caller to retry with order (guaranteed smaller than
+ *             the order value passed in)
+ */
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order)
 {
     int rc = -EINVAL;
     mfn_t actual_mfn;
     p2m_access_t a;
     p2m_type_t t;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        rc = cur_order + 1;
+        goto out;
+    }
 
     /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */
     if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) )
@@ -1036,11 +1070,11 @@ int clear_mmio_p2m_entry(struct domain *
         gdprintk(XENLOG_WARNING,
                  "no mapping between mfn %08lx and gfn %08lx\n",
                  mfn_x(mfn), gfn);
-    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid,
+    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid,
                        p2m->default_access);
 
  out:
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -2062,6 +2096,37 @@ void *map_domain_gfn(struct p2m_domain *
     return map_domain_page(*mfn);
 }
 
+static unsigned int mmio_order(const struct domain *d,
+                               unsigned long start_fn, unsigned long nr)
+{
+    /*
+     * Note that the !iommu_use_hap_pt() here has three effects:
+     * - cover iommu_{,un}map_page() not having an "order" input yet,
+     * - exclude shadow mode (which doesn't support large MMIO mappings),
+     * - exclude PV guests, should execution reach this code for such.
+     * So be careful when altering this.
+     */
+    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
+         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) )
+        return PAGE_ORDER_4K;
+
+    if ( 0 /*
+            * Don't use 1Gb pages, to limit the iteration count in
+            * set_typed_p2m_entry() when it needs to zap M2P entries
+            * for a RAM range.
+            */ &&
+         !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> PAGE_ORDER_1G) &&
+         hap_has_1gb )
+        return PAGE_ORDER_1G;
+
+    if ( hap_has_2mb )
+        return PAGE_ORDER_2M;
+
+    return PAGE_ORDER_4K;
+}
+
+#define MAP_MMIO_MAX_ITER 64 /* pretty arbitrary */
+
 int map_mmio_regions(struct domain *d,
                      unsigned long start_gfn,
                      unsigned long nr,
@@ -2069,22 +2134,29 @@ int map_mmio_regions(struct domain *d,
 {
     int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; !ret && i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i),
-                                 p2m_get_hostp2m(d)->default_access);
-        if ( ret )
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
         {
-            unmap_mmio_regions(d, start_gfn, i, mfn);
-            break;
+            ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order,
+                                     p2m_get_hostp2m(d)->default_access);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
         }
+        if ( ret < 0 )
+            break;
     }
 
-    return ret;
+    return i == nr ? 0 : i ?: ret;
 }
 
 int unmap_mmio_regions(struct domain *d,
@@ -2092,20 +2164,30 @@ int unmap_mmio_regions(struct domain *d,
                        unsigned long nr,
                        unsigned long mfn)
 {
-    int err = 0;
+    int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        int ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i));
-        if ( ret )
-            err = ret;
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
+        {
+            ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
+        }
+        if ( ret < 0 )
+            break;
     }
 
-    return err;
+    return i == nr ? 0 : i ?: ret;
 }
 
 unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp)
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -136,6 +136,7 @@ static void ept_p2m_type_to_flags(struct
             entry->r = entry->x = 1;
             entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
                                                     entry->mfn);
+            ASSERT(entry->w || !is_epte_superpage(entry));
             entry->a = !!cpu_has_vmx_ept_ad;
             entry->d = entry->w && cpu_has_vmx_ept_ad;
             break;
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -72,7 +72,8 @@ static const unsigned long pgt[] = {
     PGT_l3_page_table
 };
 
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
+                                       unsigned int level)
 {
     unsigned long flags;
     /*
@@ -107,6 +108,8 @@ static unsigned long p2m_type_to_flags(p
     case p2m_mmio_direct:
         if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
             flags |= _PAGE_RW;
+        else
+            ASSERT(!level);
         return flags | P2M_BASE_FLAGS | _PAGE_PCD;
     }
 }
@@ -436,7 +439,7 @@ static int do_recalc(struct p2m_domain *
             p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
                               ? p2m_ram_logdirty : p2m_ram_rw;
             unsigned long mfn = l1e_get_pfn(e);
-            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn));
+            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn), level);
 
             if ( level )
             {
@@ -573,7 +576,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt)
             ? l3e_from_pfn(mfn_x(mfn),
-                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
+                           p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
 
@@ -609,7 +612,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
 
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
-                                             p2m_type_to_flags(p2mt, mfn));
+                                             p2m_type_to_flags(p2mt, mfn, 0));
         else
             entry_content = l1e_empty();
 
@@ -645,7 +648,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt, mfn) |
+                                       p2m_type_to_flags(p2mt, mfn, 1) |
                                        _PAGE_PSE);
         else
             l2e_content = l2e_empty();
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -1046,10 +1046,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
              (gfn + nr_mfns - 1) < gfn ) /* wrap? */
             break;
 
+#ifndef CONFIG_X86 /* XXX ARM!? */
         ret = -E2BIG;
         /* Must break hypercall up as this could take a while. */
         if ( nr_mfns > 64 )
             break;
+#endif
 
         ret = -EPERM;
         if ( !iomem_access_permitted(current->domain, mfn, mfn_end) ||
@@ -1067,7 +1069,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = map_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret )
+            if ( ret < 0 )
                 printk(XENLOG_G_WARNING
                        "memory_map:fail: dom%d gfn=%lx mfn=%lx nr=%lx ret:%ld\n",
                        d->domain_id, gfn, mfn, nr_mfns, ret);
@@ -1079,7 +1081,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = unmap_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret && is_hardware_domain(current->domain) )
+            if ( ret < 0 && is_hardware_domain(current->domain) )
                 printk(XENLOG_ERR
                        "memory_map: error %ld removing dom%d access to [%lx,%lx]\n",
                        ret, d->domain_id, mfn, mfn_end);
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -259,7 +259,7 @@ int guest_remove_page(struct domain *d,
     }
     if ( p2mt == p2m_mmio_direct )
     {
-        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn));
+        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn), 0);
         put_gfn(d, gmfn);
         return 1;
     }
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -574,8 +574,9 @@ int p2m_is_logdirty_range(struct p2m_dom
 
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access);
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
+                       unsigned int order, p2m_access_t access);
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order);
 
 /* Set identity addresses in the p2m table (for pass-through) */
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -545,8 +545,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_
 
 
 /* Bind machine I/O address range -> HVM address range. */
-/* If this returns -E2BIG lower nr_mfns value. */
 /* XEN_DOMCTL_memory_mapping */
+/* Returns
+   - zero     success, everything done
+   - -E2BIG   passed in nr_mfns value too large for the implementation
+   - positive partial success for the first <result> page frames (with
+              <result> less than nr_mfns), requiring re-invocation by the
+              caller after updating inputs
+   - negative error; other than -E2BIG
+*/
 #define DPCI_ADD_MAPPING         1
 #define DPCI_REMOVE_MAPPING      0
 struct xen_domctl_memory_mapping {

Comments

George Dunlap Feb. 10, 2016, 2:16 p.m. UTC | #1
On 10/02/16 12:31, Jan Beulich wrote:
> When mapping large BARs (e.g. the frame buffer of a graphics card) the
> overhead of establishing such mappings using only 4k pages has,
> particularly after the XSA-125 fix, become unacceptable. Alter the
> XEN_DOMCTL_memory_mapping semantics once again, so that there's no
> longer a fixed amount of guest frames that represents the upper limit
> of what a single invocation can map. Instead bound execution time by
> limiting the number of iterations (regardless of page size).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>
> Acked-by: Kevin Tian <kevin.tian@intel.com>
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

Reviewed-by: George Dunlap <george.dunlap@citrix.com>

> ---
> Open issues (perhaps for subsequent changes):
> - ARM side unimplemented (and hence libxc for now made cope with both
>   models), the main issue (besides my inability to test any change
>   there) being the many internal uses of map_mmio_regions())
> - iommu_{,un}map_page() interfaces don't support "order" (see the
>   comment in mmio_order())
> ---
> v8: Fix set_mmio_p2m_entry(): Mappings overlapping with r/o MMIO
>     regions must be strictly 4k only for now. Comment the
>     !iommu_use_hap_pt() in mmio_order().
> v7: Only allow use of 2M mappings for the time being.
> v6: Move an mfn_valid() assertion to cover the full MFN range. Use
>     PAGE_ORDER_4K in mmio_order(). Improve the return value description
>     of set_typed_p2m_entry().
> v5: Refine comment in domctl.h.
> v4: Move cleanup duty entirely to the caller of the hypercall. Move
>     return value description to from commit message to domctl.h.
> v3: Re-base on top of "x86/hvm: fold opt_hap_{2mb,1gb} into
>     hap_capabilities". Extend description to spell out new return value
>     meaning. Add a couple of code comments. Use PAGE_ORDER_4K instead
>     of literal 0. Take into consideration r/o MMIO pages.
> v2: Produce valid entries for large p2m_mmio_direct mappings in
>     p2m_pt_set_entry(). Don't open code iommu_use_hap_pt() in
>     mmio_order(). Update function comment of set_typed_p2m_entry() and
>     clear_mmio_p2m_entry(). Use PRI_mfn. Add ASSERT()s to
>     {,un}map_mmio_regions() to detect otherwise endless loops.
> 
> --- a/tools/libxc/xc_domain.c
> +++ b/tools/libxc/xc_domain.c
> @@ -2229,7 +2229,7 @@ int xc_domain_memory_mapping(
>  {
>      DECLARE_DOMCTL;
>      xc_dominfo_t info;
> -    int ret = 0, err;
> +    int ret = 0, rc;
>      unsigned long done = 0, nr, max_batch_sz;
>  
>      if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 ||
> @@ -2254,19 +2254,24 @@ int xc_domain_memory_mapping(
>          domctl.u.memory_mapping.nr_mfns = nr;
>          domctl.u.memory_mapping.first_gfn = first_gfn + done;
>          domctl.u.memory_mapping.first_mfn = first_mfn + done;
> -        err = do_domctl(xch, &domctl);
> -        if ( err && errno == E2BIG )
> +        rc = do_domctl(xch, &domctl);
> +        if ( rc < 0 && errno == E2BIG )
>          {
>              if ( max_batch_sz <= 1 )
>                  break;
>              max_batch_sz >>= 1;
>              continue;
>          }
> +        if ( rc > 0 )
> +        {
> +            done += rc;
> +            continue;
> +        }
>          /* Save the first error... */
>          if ( !ret )
> -            ret = err;
> +            ret = rc;
>          /* .. and ignore the rest of them when removing. */
> -        if ( err && add_mapping != DPCI_REMOVE_MAPPING )
> +        if ( rc && add_mapping != DPCI_REMOVE_MAPPING )
>              break;
>  
>          done += nr;
> --- a/xen/arch/x86/domain_build.c
> +++ b/xen/arch/x86/domain_build.c
> @@ -436,7 +436,8 @@ static __init void pvh_add_mem_mapping(s
>          else
>              a = p2m_access_rw;
>  
> -        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), a)) )
> +        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i),
> +                                      PAGE_ORDER_4K, a)) )
>              panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n",
>                    gfn, mfn, i, rc);
>          if ( !(i & 0xfffff) )
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -2504,7 +2504,7 @@ static int vmx_alloc_vlapic_mapping(stru
>      share_xen_page_with_guest(pg, d, XENSHARE_writable);
>      d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
>      set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
> -                       p2m_get_hostp2m(d)->default_access);
> +                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
>  
>      return 0;
>  }
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -900,48 +900,64 @@ void p2m_change_type_range(struct domain
>      p2m_unlock(p2m);
>  }
>  
> -/* Returns: 0 for success, -errno for failure */
> +/*
> + * Returns:
> + *    0              for success
> + *    -errno         for failure
> + *    1 + new order  for caller to retry with smaller order (guaranteed
> + *                   to be smaller than order passed in)
> + */
>  static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> -                               p2m_type_t gfn_p2mt, p2m_access_t access)
> +                               unsigned int order, p2m_type_t gfn_p2mt,
> +                               p2m_access_t access)
>  {
>      int rc = 0;
>      p2m_access_t a;
>      p2m_type_t ot;
>      mfn_t omfn;
> +    unsigned int cur_order = 0;
>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
>  
>      if ( !paging_mode_translate(d) )
>          return -EIO;
>  
> -    gfn_lock(p2m, gfn, 0);
> -    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
> +    gfn_lock(p2m, gfn, order);
> +    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
> +    if ( cur_order < order )
> +    {
> +        gfn_unlock(p2m, gfn, order);
> +        return cur_order + 1;
> +    }
>      if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
>      {
> -        gfn_unlock(p2m, gfn, 0);
> +        gfn_unlock(p2m, gfn, order);
>          domain_crash(d);
>          return -ENOENT;
>      }
>      else if ( p2m_is_ram(ot) )
>      {
> -        ASSERT(mfn_valid(omfn));
> -        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
> +        unsigned long i;
> +
> +        for ( i = 0; i < (1UL << order); ++i )
> +        {
> +            ASSERT(mfn_valid(_mfn(mfn_x(omfn) + i)));
> +            set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
> +        }
>      }
>  
>      P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn));
> -    rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt,
> -                       access);
> +    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
>      if ( rc )
> -        gdprintk(XENLOG_ERR,
> -                 "p2m_set_entry failed! mfn=%08lx rc:%d\n",
> -                 mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)), rc);
> +        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
> +                 gfn, order, rc, mfn_x(mfn));
>      else if ( p2m_is_pod(ot) )
>      {
>          pod_lock(p2m);
> -        p2m->pod.entry_count--;
> +        p2m->pod.entry_count -= 1UL << order;
>          BUG_ON(p2m->pod.entry_count < 0);
>          pod_unlock(p2m);
>      }
> -    gfn_unlock(p2m, gfn, 0);
> +    gfn_unlock(p2m, gfn, order);
>  
>      return rc;
>  }
> @@ -950,14 +966,19 @@ static int set_typed_p2m_entry(struct do
>  static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
>                                   mfn_t mfn)
>  {
> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
> +    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
>                                 p2m_get_hostp2m(d)->default_access);
>  }
>  
>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> -                       p2m_access_t access)
> +                       unsigned int order, p2m_access_t access)
>  {
> -    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
> +    if ( order > PAGE_ORDER_4K &&
> +         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
> +                                 mfn_x(mfn) + (1UL << order) - 1) )
> +        return PAGE_ORDER_4K + 1;
> +
> +    return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct, access);
>  }
>  
>  int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
> @@ -1010,20 +1031,33 @@ int set_identity_p2m_entry(struct domain
>      return ret;
>  }
>  
> -/* Returns: 0 for success, -errno for failure */
> -int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
> +/*
> + * Returns:
> + *    0        for success
> + *    -errno   for failure
> + *    order+1  for caller to retry with order (guaranteed smaller than
> + *             the order value passed in)
> + */
> +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> +                         unsigned int order)
>  {
>      int rc = -EINVAL;
>      mfn_t actual_mfn;
>      p2m_access_t a;
>      p2m_type_t t;
> +    unsigned int cur_order = 0;
>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
>  
>      if ( !paging_mode_translate(d) )
>          return -EIO;
>  
> -    gfn_lock(p2m, gfn, 0);
> -    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
> +    gfn_lock(p2m, gfn, order);
> +    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL);
> +    if ( cur_order < order )
> +    {
> +        rc = cur_order + 1;
> +        goto out;
> +    }
>  
>      /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */
>      if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) )
> @@ -1036,11 +1070,11 @@ int clear_mmio_p2m_entry(struct domain *
>          gdprintk(XENLOG_WARNING,
>                   "no mapping between mfn %08lx and gfn %08lx\n",
>                   mfn_x(mfn), gfn);
> -    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid,
> +    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid,
>                         p2m->default_access);
>  
>   out:
> -    gfn_unlock(p2m, gfn, 0);
> +    gfn_unlock(p2m, gfn, order);
>  
>      return rc;
>  }
> @@ -2062,6 +2096,37 @@ void *map_domain_gfn(struct p2m_domain *
>      return map_domain_page(*mfn);
>  }
>  
> +static unsigned int mmio_order(const struct domain *d,
> +                               unsigned long start_fn, unsigned long nr)
> +{
> +    /*
> +     * Note that the !iommu_use_hap_pt() here has three effects:
> +     * - cover iommu_{,un}map_page() not having an "order" input yet,
> +     * - exclude shadow mode (which doesn't support large MMIO mappings),
> +     * - exclude PV guests, should execution reach this code for such.
> +     * So be careful when altering this.
> +     */
> +    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
> +         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) )
> +        return PAGE_ORDER_4K;
> +
> +    if ( 0 /*
> +            * Don't use 1Gb pages, to limit the iteration count in
> +            * set_typed_p2m_entry() when it needs to zap M2P entries
> +            * for a RAM range.
> +            */ &&
> +         !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> PAGE_ORDER_1G) &&
> +         hap_has_1gb )
> +        return PAGE_ORDER_1G;
> +
> +    if ( hap_has_2mb )
> +        return PAGE_ORDER_2M;
> +
> +    return PAGE_ORDER_4K;
> +}
> +
> +#define MAP_MMIO_MAX_ITER 64 /* pretty arbitrary */
> +
>  int map_mmio_regions(struct domain *d,
>                       unsigned long start_gfn,
>                       unsigned long nr,
> @@ -2069,22 +2134,29 @@ int map_mmio_regions(struct domain *d,
>  {
>      int ret = 0;
>      unsigned long i;
> +    unsigned int iter, order;
>  
>      if ( !paging_mode_translate(d) )
>          return 0;
>  
> -    for ( i = 0; !ret && i < nr; i++ )
> +    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
> +          i += 1UL << order, ++iter )
>      {
> -        ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i),
> -                                 p2m_get_hostp2m(d)->default_access);
> -        if ( ret )
> +        /* OR'ing gfn and mfn values will return an order suitable to both. */
> +        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
> +              order = ret - 1 )
>          {
> -            unmap_mmio_regions(d, start_gfn, i, mfn);
> -            break;
> +            ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order,
> +                                     p2m_get_hostp2m(d)->default_access);
> +            if ( ret <= 0 )
> +                break;
> +            ASSERT(ret <= order);
>          }
> +        if ( ret < 0 )
> +            break;
>      }
>  
> -    return ret;
> +    return i == nr ? 0 : i ?: ret;
>  }
>  
>  int unmap_mmio_regions(struct domain *d,
> @@ -2092,20 +2164,30 @@ int unmap_mmio_regions(struct domain *d,
>                         unsigned long nr,
>                         unsigned long mfn)
>  {
> -    int err = 0;
> +    int ret = 0;
>      unsigned long i;
> +    unsigned int iter, order;
>  
>      if ( !paging_mode_translate(d) )
>          return 0;
>  
> -    for ( i = 0; i < nr; i++ )
> +    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
> +          i += 1UL << order, ++iter )
>      {
> -        int ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i));
> -        if ( ret )
> -            err = ret;
> +        /* OR'ing gfn and mfn values will return an order suitable to both. */
> +        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
> +              order = ret - 1 )
> +        {
> +            ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order);
> +            if ( ret <= 0 )
> +                break;
> +            ASSERT(ret <= order);
> +        }
> +        if ( ret < 0 )
> +            break;
>      }
>  
> -    return err;
> +    return i == nr ? 0 : i ?: ret;
>  }
>  
>  unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp)
> --- a/xen/arch/x86/mm/p2m-ept.c
> +++ b/xen/arch/x86/mm/p2m-ept.c
> @@ -136,6 +136,7 @@ static void ept_p2m_type_to_flags(struct
>              entry->r = entry->x = 1;
>              entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
>                                                      entry->mfn);
> +            ASSERT(entry->w || !is_epte_superpage(entry));
>              entry->a = !!cpu_has_vmx_ept_ad;
>              entry->d = entry->w && cpu_has_vmx_ept_ad;
>              break;
> --- a/xen/arch/x86/mm/p2m-pt.c
> +++ b/xen/arch/x86/mm/p2m-pt.c
> @@ -72,7 +72,8 @@ static const unsigned long pgt[] = {
>      PGT_l3_page_table
>  };
>  
> -static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
> +static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
> +                                       unsigned int level)
>  {
>      unsigned long flags;
>      /*
> @@ -107,6 +108,8 @@ static unsigned long p2m_type_to_flags(p
>      case p2m_mmio_direct:
>          if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
>              flags |= _PAGE_RW;
> +        else
> +            ASSERT(!level);
>          return flags | P2M_BASE_FLAGS | _PAGE_PCD;
>      }
>  }
> @@ -436,7 +439,7 @@ static int do_recalc(struct p2m_domain *
>              p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
>                                ? p2m_ram_logdirty : p2m_ram_rw;
>              unsigned long mfn = l1e_get_pfn(e);
> -            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn));
> +            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn), level);
>  
>              if ( level )
>              {
> @@ -573,7 +576,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
>          ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
>          l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt)
>              ? l3e_from_pfn(mfn_x(mfn),
> -                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
> +                           p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE)
>              : l3e_empty();
>          entry_content.l1 = l3e_content.l3;
>  
> @@ -609,7 +612,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
>  
>          if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
>              entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
> -                                             p2m_type_to_flags(p2mt, mfn));
> +                                             p2m_type_to_flags(p2mt, mfn, 0));
>          else
>              entry_content = l1e_empty();
>  
> @@ -645,7 +648,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
>          ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
>          if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
>              l2e_content = l2e_from_pfn(mfn_x(mfn),
> -                                       p2m_type_to_flags(p2mt, mfn) |
> +                                       p2m_type_to_flags(p2mt, mfn, 1) |
>                                         _PAGE_PSE);
>          else
>              l2e_content = l2e_empty();
> --- a/xen/common/domctl.c
> +++ b/xen/common/domctl.c
> @@ -1046,10 +1046,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
>               (gfn + nr_mfns - 1) < gfn ) /* wrap? */
>              break;
>  
> +#ifndef CONFIG_X86 /* XXX ARM!? */
>          ret = -E2BIG;
>          /* Must break hypercall up as this could take a while. */
>          if ( nr_mfns > 64 )
>              break;
> +#endif
>  
>          ret = -EPERM;
>          if ( !iomem_access_permitted(current->domain, mfn, mfn_end) ||
> @@ -1067,7 +1069,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
>                     d->domain_id, gfn, mfn, nr_mfns);
>  
>              ret = map_mmio_regions(d, gfn, nr_mfns, mfn);
> -            if ( ret )
> +            if ( ret < 0 )
>                  printk(XENLOG_G_WARNING
>                         "memory_map:fail: dom%d gfn=%lx mfn=%lx nr=%lx ret:%ld\n",
>                         d->domain_id, gfn, mfn, nr_mfns, ret);
> @@ -1079,7 +1081,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
>                     d->domain_id, gfn, mfn, nr_mfns);
>  
>              ret = unmap_mmio_regions(d, gfn, nr_mfns, mfn);
> -            if ( ret && is_hardware_domain(current->domain) )
> +            if ( ret < 0 && is_hardware_domain(current->domain) )
>                  printk(XENLOG_ERR
>                         "memory_map: error %ld removing dom%d access to [%lx,%lx]\n",
>                         ret, d->domain_id, mfn, mfn_end);
> --- a/xen/common/memory.c
> +++ b/xen/common/memory.c
> @@ -259,7 +259,7 @@ int guest_remove_page(struct domain *d,
>      }
>      if ( p2mt == p2m_mmio_direct )
>      {
> -        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn));
> +        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn), 0);
>          put_gfn(d, gmfn);
>          return 1;
>      }
> --- a/xen/include/asm-x86/p2m.h
> +++ b/xen/include/asm-x86/p2m.h
> @@ -574,8 +574,9 @@ int p2m_is_logdirty_range(struct p2m_dom
>  
>  /* Set mmio addresses in the p2m table (for pass-through) */
>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> -                       p2m_access_t access);
> -int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
> +                       unsigned int order, p2m_access_t access);
> +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
> +                         unsigned int order);
>  
>  /* Set identity addresses in the p2m table (for pass-through) */
>  int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -545,8 +545,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_
>  
>  
>  /* Bind machine I/O address range -> HVM address range. */
> -/* If this returns -E2BIG lower nr_mfns value. */
>  /* XEN_DOMCTL_memory_mapping */
> +/* Returns
> +   - zero     success, everything done
> +   - -E2BIG   passed in nr_mfns value too large for the implementation
> +   - positive partial success for the first <result> page frames (with
> +              <result> less than nr_mfns), requiring re-invocation by the
> +              caller after updating inputs
> +   - negative error; other than -E2BIG
> +*/
>  #define DPCI_ADD_MAPPING         1
>  #define DPCI_REMOVE_MAPPING      0
>  struct xen_domctl_memory_mapping {
> 
>
diff mbox

Patch

--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -2229,7 +2229,7 @@  int xc_domain_memory_mapping(
 {
     DECLARE_DOMCTL;
     xc_dominfo_t info;
-    int ret = 0, err;
+    int ret = 0, rc;
     unsigned long done = 0, nr, max_batch_sz;
 
     if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 ||
@@ -2254,19 +2254,24 @@  int xc_domain_memory_mapping(
         domctl.u.memory_mapping.nr_mfns = nr;
         domctl.u.memory_mapping.first_gfn = first_gfn + done;
         domctl.u.memory_mapping.first_mfn = first_mfn + done;
-        err = do_domctl(xch, &domctl);
-        if ( err && errno == E2BIG )
+        rc = do_domctl(xch, &domctl);
+        if ( rc < 0 && errno == E2BIG )
         {
             if ( max_batch_sz <= 1 )
                 break;
             max_batch_sz >>= 1;
             continue;
         }
+        if ( rc > 0 )
+        {
+            done += rc;
+            continue;
+        }
         /* Save the first error... */
         if ( !ret )
-            ret = err;
+            ret = rc;
         /* .. and ignore the rest of them when removing. */
-        if ( err && add_mapping != DPCI_REMOVE_MAPPING )
+        if ( rc && add_mapping != DPCI_REMOVE_MAPPING )
             break;
 
         done += nr;
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -436,7 +436,8 @@  static __init void pvh_add_mem_mapping(s
         else
             a = p2m_access_rw;
 
-        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), a)) )
+        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i),
+                                      PAGE_ORDER_4K, a)) )
             panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n",
                   gfn, mfn, i, rc);
         if ( !(i & 0xfffff) )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2504,7 +2504,7 @@  static int vmx_alloc_vlapic_mapping(stru
     share_xen_page_with_guest(pg, d, XENSHARE_writable);
     d->arch.hvm_domain.vmx.apic_access_mfn = mfn;
     set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(mfn),
-                       p2m_get_hostp2m(d)->default_access);
+                       PAGE_ORDER_4K, p2m_get_hostp2m(d)->default_access);
 
     return 0;
 }
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -900,48 +900,64 @@  void p2m_change_type_range(struct domain
     p2m_unlock(p2m);
 }
 
-/* Returns: 0 for success, -errno for failure */
+/*
+ * Returns:
+ *    0              for success
+ *    -errno         for failure
+ *    1 + new order  for caller to retry with smaller order (guaranteed
+ *                   to be smaller than order passed in)
+ */
 static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                               p2m_type_t gfn_p2mt, p2m_access_t access)
+                               unsigned int order, p2m_type_t gfn_p2mt,
+                               p2m_access_t access)
 {
     int rc = 0;
     p2m_access_t a;
     p2m_type_t ot;
     mfn_t omfn;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        gfn_unlock(p2m, gfn, order);
+        return cur_order + 1;
+    }
     if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
     {
-        gfn_unlock(p2m, gfn, 0);
+        gfn_unlock(p2m, gfn, order);
         domain_crash(d);
         return -ENOENT;
     }
     else if ( p2m_is_ram(ot) )
     {
-        ASSERT(mfn_valid(omfn));
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        unsigned long i;
+
+        for ( i = 0; i < (1UL << order); ++i )
+        {
+            ASSERT(mfn_valid(_mfn(mfn_x(omfn) + i)));
+            set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
+        }
     }
 
     P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn));
-    rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt,
-                       access);
+    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
     if ( rc )
-        gdprintk(XENLOG_ERR,
-                 "p2m_set_entry failed! mfn=%08lx rc:%d\n",
-                 mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)), rc);
+        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
+                 gfn, order, rc, mfn_x(mfn));
     else if ( p2m_is_pod(ot) )
     {
         pod_lock(p2m);
-        p2m->pod.entry_count--;
+        p2m->pod.entry_count -= 1UL << order;
         BUG_ON(p2m->pod.entry_count < 0);
         pod_unlock(p2m);
     }
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -950,14 +966,19 @@  static int set_typed_p2m_entry(struct do
 static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
                                  mfn_t mfn)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
+    return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_map_foreign,
                                p2m_get_hostp2m(d)->default_access);
 }
 
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access)
+                       unsigned int order, p2m_access_t access)
 {
-    return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
+    if ( order > PAGE_ORDER_4K &&
+         rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
+                                 mfn_x(mfn) + (1UL << order) - 1) )
+        return PAGE_ORDER_4K + 1;
+
+    return set_typed_p2m_entry(d, gfn, mfn, order, p2m_mmio_direct, access);
 }
 
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
@@ -1010,20 +1031,33 @@  int set_identity_p2m_entry(struct domain
     return ret;
 }
 
-/* Returns: 0 for success, -errno for failure */
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+/*
+ * Returns:
+ *    0        for success
+ *    -errno   for failure
+ *    order+1  for caller to retry with order (guaranteed smaller than
+ *             the order value passed in)
+ */
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order)
 {
     int rc = -EINVAL;
     mfn_t actual_mfn;
     p2m_access_t a;
     p2m_type_t t;
+    unsigned int cur_order = 0;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     if ( !paging_mode_translate(d) )
         return -EIO;
 
-    gfn_lock(p2m, gfn, 0);
-    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
+    gfn_lock(p2m, gfn, order);
+    actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, &cur_order, NULL);
+    if ( cur_order < order )
+    {
+        rc = cur_order + 1;
+        goto out;
+    }
 
     /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */
     if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) )
@@ -1036,11 +1070,11 @@  int clear_mmio_p2m_entry(struct domain *
         gdprintk(XENLOG_WARNING,
                  "no mapping between mfn %08lx and gfn %08lx\n",
                  mfn_x(mfn), gfn);
-    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid,
+    rc = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), order, p2m_invalid,
                        p2m->default_access);
 
  out:
-    gfn_unlock(p2m, gfn, 0);
+    gfn_unlock(p2m, gfn, order);
 
     return rc;
 }
@@ -2062,6 +2096,37 @@  void *map_domain_gfn(struct p2m_domain *
     return map_domain_page(*mfn);
 }
 
+static unsigned int mmio_order(const struct domain *d,
+                               unsigned long start_fn, unsigned long nr)
+{
+    /*
+     * Note that the !iommu_use_hap_pt() here has three effects:
+     * - cover iommu_{,un}map_page() not having an "order" input yet,
+     * - exclude shadow mode (which doesn't support large MMIO mappings),
+     * - exclude PV guests, should execution reach this code for such.
+     * So be careful when altering this.
+     */
+    if ( !need_iommu(d) || !iommu_use_hap_pt(d) ||
+         (start_fn & ((1UL << PAGE_ORDER_2M) - 1)) || !(nr >> PAGE_ORDER_2M) )
+        return PAGE_ORDER_4K;
+
+    if ( 0 /*
+            * Don't use 1Gb pages, to limit the iteration count in
+            * set_typed_p2m_entry() when it needs to zap M2P entries
+            * for a RAM range.
+            */ &&
+         !(start_fn & ((1UL << PAGE_ORDER_1G) - 1)) && (nr >> PAGE_ORDER_1G) &&
+         hap_has_1gb )
+        return PAGE_ORDER_1G;
+
+    if ( hap_has_2mb )
+        return PAGE_ORDER_2M;
+
+    return PAGE_ORDER_4K;
+}
+
+#define MAP_MMIO_MAX_ITER 64 /* pretty arbitrary */
+
 int map_mmio_regions(struct domain *d,
                      unsigned long start_gfn,
                      unsigned long nr,
@@ -2069,22 +2134,29 @@  int map_mmio_regions(struct domain *d,
 {
     int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; !ret && i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i),
-                                 p2m_get_hostp2m(d)->default_access);
-        if ( ret )
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
         {
-            unmap_mmio_regions(d, start_gfn, i, mfn);
-            break;
+            ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order,
+                                     p2m_get_hostp2m(d)->default_access);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
         }
+        if ( ret < 0 )
+            break;
     }
 
-    return ret;
+    return i == nr ? 0 : i ?: ret;
 }
 
 int unmap_mmio_regions(struct domain *d,
@@ -2092,20 +2164,30 @@  int unmap_mmio_regions(struct domain *d,
                        unsigned long nr,
                        unsigned long mfn)
 {
-    int err = 0;
+    int ret = 0;
     unsigned long i;
+    unsigned int iter, order;
 
     if ( !paging_mode_translate(d) )
         return 0;
 
-    for ( i = 0; i < nr; i++ )
+    for ( iter = i = 0; i < nr && iter < MAP_MMIO_MAX_ITER;
+          i += 1UL << order, ++iter )
     {
-        int ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i));
-        if ( ret )
-            err = ret;
+        /* OR'ing gfn and mfn values will return an order suitable to both. */
+        for ( order = mmio_order(d, (start_gfn + i) | (mfn + i), nr - i); ;
+              order = ret - 1 )
+        {
+            ret = clear_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i), order);
+            if ( ret <= 0 )
+                break;
+            ASSERT(ret <= order);
+        }
+        if ( ret < 0 )
+            break;
     }
 
-    return err;
+    return i == nr ? 0 : i ?: ret;
 }
 
 unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp)
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -136,6 +136,7 @@  static void ept_p2m_type_to_flags(struct
             entry->r = entry->x = 1;
             entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
                                                     entry->mfn);
+            ASSERT(entry->w || !is_epte_superpage(entry));
             entry->a = !!cpu_has_vmx_ept_ad;
             entry->d = entry->w && cpu_has_vmx_ept_ad;
             break;
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -72,7 +72,8 @@  static const unsigned long pgt[] = {
     PGT_l3_page_table
 };
 
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
+                                       unsigned int level)
 {
     unsigned long flags;
     /*
@@ -107,6 +108,8 @@  static unsigned long p2m_type_to_flags(p
     case p2m_mmio_direct:
         if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
             flags |= _PAGE_RW;
+        else
+            ASSERT(!level);
         return flags | P2M_BASE_FLAGS | _PAGE_PCD;
     }
 }
@@ -436,7 +439,7 @@  static int do_recalc(struct p2m_domain *
             p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
                               ? p2m_ram_logdirty : p2m_ram_rw;
             unsigned long mfn = l1e_get_pfn(e);
-            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn));
+            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn), level);
 
             if ( level )
             {
@@ -573,7 +576,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt)
             ? l3e_from_pfn(mfn_x(mfn),
-                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
+                           p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
 
@@ -609,7 +612,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m,
 
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
-                                             p2m_type_to_flags(p2mt, mfn));
+                                             p2m_type_to_flags(p2mt, mfn, 0));
         else
             entry_content = l1e_empty();
 
@@ -645,7 +648,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt, mfn) |
+                                       p2m_type_to_flags(p2mt, mfn, 1) |
                                        _PAGE_PSE);
         else
             l2e_content = l2e_empty();
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -1046,10 +1046,12 @@  long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
              (gfn + nr_mfns - 1) < gfn ) /* wrap? */
             break;
 
+#ifndef CONFIG_X86 /* XXX ARM!? */
         ret = -E2BIG;
         /* Must break hypercall up as this could take a while. */
         if ( nr_mfns > 64 )
             break;
+#endif
 
         ret = -EPERM;
         if ( !iomem_access_permitted(current->domain, mfn, mfn_end) ||
@@ -1067,7 +1069,7 @@  long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = map_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret )
+            if ( ret < 0 )
                 printk(XENLOG_G_WARNING
                        "memory_map:fail: dom%d gfn=%lx mfn=%lx nr=%lx ret:%ld\n",
                        d->domain_id, gfn, mfn, nr_mfns, ret);
@@ -1079,7 +1081,7 @@  long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
                    d->domain_id, gfn, mfn, nr_mfns);
 
             ret = unmap_mmio_regions(d, gfn, nr_mfns, mfn);
-            if ( ret && is_hardware_domain(current->domain) )
+            if ( ret < 0 && is_hardware_domain(current->domain) )
                 printk(XENLOG_ERR
                        "memory_map: error %ld removing dom%d access to [%lx,%lx]\n",
                        ret, d->domain_id, mfn, mfn_end);
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -259,7 +259,7 @@  int guest_remove_page(struct domain *d,
     }
     if ( p2mt == p2m_mmio_direct )
     {
-        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn));
+        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn), 0);
         put_gfn(d, gmfn);
         return 1;
     }
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -574,8 +574,9 @@  int p2m_is_logdirty_range(struct p2m_dom
 
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
-                       p2m_access_t access);
-int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
+                       unsigned int order, p2m_access_t access);
+int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+                         unsigned int order);
 
 /* Set identity addresses in the p2m table (for pass-through) */
 int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -545,8 +545,15 @@  DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_
 
 
 /* Bind machine I/O address range -> HVM address range. */
-/* If this returns -E2BIG lower nr_mfns value. */
 /* XEN_DOMCTL_memory_mapping */
+/* Returns
+   - zero     success, everything done
+   - -E2BIG   passed in nr_mfns value too large for the implementation
+   - positive partial success for the first <result> page frames (with
+              <result> less than nr_mfns), requiring re-invocation by the
+              caller after updating inputs
+   - negative error; other than -E2BIG
+*/
 #define DPCI_ADD_MAPPING         1
 #define DPCI_REMOVE_MAPPING      0
 struct xen_domctl_memory_mapping {