
[5/5] x86 / iommu: create a dedicated pool of page-table pages

Message ID 20201005094905.2929-6-paul@xen.org (mailing list archive)
State New, archived
Series iommu page-table memory pool

Commit Message

Paul Durrant Oct. 5, 2020, 9:49 a.m. UTC
From: Paul Durrant <pdurrant@amazon.com>

This patch, in a way analogous to HAP page allocation, creates a pool of
pages for use as IOMMU page-tables. The size of the pool is configured
by a call to iommu_set_allocation(), which is made once the tool-stack
has calculated the size of the pool and issued the
XEN_DOMCTL_IOMMU_SET_ALLOCATION sub-operation of the XEN_DOMCTL_iommu_ctl
domctl. However, some IOMMU mappings must be set up during domain_create(),
before the tool-stack has had a chance to make its calculation and issue the
domctl. Hence an initial hard-coded pool size of 256 pages is set for domains
not sharing EPT during the call to arch_iommu_domain_init().

NOTE: No pool is configured for the hardware or quarantine domains. They
      continue to allocate page-table pages on demand.
      The prototype of iommu_free_pgtables() is changed to return void as
      it no longer needs to be re-startable.

Signed-off-by: Paul Durrant <pdurrant@amazon.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: "Roger Pau Monné" <roger.pau@citrix.com>
Cc: Wei Liu <wl@xen.org>
---
 xen/arch/x86/domain.c               |   4 +-
 xen/drivers/passthrough/x86/iommu.c | 129 ++++++++++++++++++++++++----
 xen/include/asm-x86/iommu.h         |   5 +-
 3 files changed, 116 insertions(+), 22 deletions(-)

Comments

Jan Beulich Oct. 30, 2020, 4:43 p.m. UTC | #1
On 05.10.2020 11:49, Paul Durrant wrote:
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -2304,7 +2304,9 @@ int domain_relinquish_resources(struct domain *d)
>  
>      PROGRESS(iommu_pagetables):
>  
> -        ret = iommu_free_pgtables(d);
> +        iommu_free_pgtables(d);
> +
> +        ret = iommu_set_allocation(d, 0);
>          if ( ret )
>              return ret;

There doesn't look to be a need to call iommu_free_pgtables()
more than once - how about you move it immediately ahead of
the (extended) case label?
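
For illustration, the reordering being suggested might look like this
(an untested sketch only):

        iommu_free_pgtables(d);

    PROGRESS(iommu_pagetables):

        ret = iommu_set_allocation(d, 0);
        if ( ret )
            return ret;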

> +static int set_allocation(struct domain *d, unsigned int nr_pages,
> +                          bool allow_preempt)

Why the allow_preempt parameter when the sole caller passes
"true"?

> +/*
> + * Some IOMMU mappings are set up during domain_create() before the tool-
> + * stack has a chance to calculate and set the appropriate page-table
> + * allocation. A hard-coded initial allocation covers this gap.
> + */
> +#define INITIAL_ALLOCATION 256

How did you arrive at this number? IOW how many pages do we
need in reality, and how much leeway have you added in?

As to the tool stack - why would it "have a chance" to do the
necessary calculations only pretty late? I wonder whether the
intended allocation wouldn't better be part of struct
xen_domctl_createdomain, without the need for a new sub-op.
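
For illustration only (the field below is hypothetical, not an existing
interface member), the allocation could then be passed at creation time:

    struct xen_domctl_createdomain {
        ...
        /* Hypothetical: IOMMU page-table pool size, in pages (0 => default). */
        uint32_t iommu_pgtable_pages;
        ...
    };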

> @@ -265,38 +350,45 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
>          return;
>  }
>  
> -int iommu_free_pgtables(struct domain *d)
> +void iommu_free_pgtables(struct domain *d)
>  {
>      struct domain_iommu *hd = dom_iommu(d);
> -    struct page_info *pg;
> -    unsigned int done = 0;
>  
> -    while ( (pg = page_list_remove_head(&hd->arch.pgtables.list)) )
> -    {
> -        free_domheap_page(pg);
> +    spin_lock(&hd->arch.pgtables.lock);
>  
> -        if ( !(++done & 0xff) && general_preempt_check() )
> -            return -ERESTART;
> -    }
> +    page_list_splice(&hd->arch.pgtables.list, &hd->arch.pgtables.free_list);
> +    INIT_PAGE_LIST_HEAD(&hd->arch.pgtables.list);
>  
> -    return 0;
> +    spin_unlock(&hd->arch.pgtables.lock);
>  }
>  
>  struct page_info *iommu_alloc_pgtable(struct domain *d)
>  {
>      struct domain_iommu *hd = dom_iommu(d);
> -    unsigned int memflags = 0;
>      struct page_info *pg;
>      void *p;
>  
> -#ifdef CONFIG_NUMA
> -    if ( hd->node != NUMA_NO_NODE )
> -        memflags = MEMF_node(hd->node);
> -#endif
> +    spin_lock(&hd->arch.pgtables.lock);
>  
> -    pg = alloc_domheap_page(NULL, memflags);
> + again:
> +    pg = page_list_remove_head(&hd->arch.pgtables.free_list);
>      if ( !pg )
> +    {
> +        /*
> +         * The hardware and quarantine domains are not subject to a quota
> +         * so create page-table pages on demand.
> +         */
> +        if ( is_hardware_domain(d) || d == dom_io )
> +        {
> +            int rc = create_pgtable(d);
> +
> +            if ( !rc )
> +                goto again;

This gives the appearance of a potentially infinite loop; it isn't
one, because the lock is being held (so the page just created by
create_pgtable() can't be taken by anyone else before the retry), but
I still wonder whether the impression this gives couldn't be avoided
by a slightly different code structure.

Also the downside of this is that the number of pages used by
hwdom will now never shrink.
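
For illustration, one structure that avoids the backwards goto (an
untested sketch, relying on the lock still being held across the retry)
might be:

    pg = page_list_remove_head(&hd->arch.pgtables.free_list);

    /*
     * The hardware and quarantine domains are not subject to a quota,
     * so top up the free list on demand and retry once.
     */
    if ( !pg && (is_hardware_domain(d) || d == dom_io) && !create_pgtable(d) )
        pg = page_list_remove_head(&hd->arch.pgtables.free_list);

    if ( !pg )
    {
        spin_unlock(&hd->arch.pgtables.lock);
        return NULL;
    }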

> @@ -306,7 +398,6 @@ struct page_info *iommu_alloc_pgtable(struct domain *d)
>  
>      unmap_domain_page(p);
>  
> -    spin_lock(&hd->arch.pgtables.lock);
>      page_list_add(pg, &hd->arch.pgtables.list);
>      spin_unlock(&hd->arch.pgtables.lock);

You want to drop the lock before the map/clear/unmap, and then
re-acquire it. Or, on the assumption that putting it on the
list earlier is fine (which I think it is), move the other two
lines here up as well.
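
I.e. (sketch of the second option, with the list insertion done while
the lock is still held and before the page is mapped):

    page_list_add(pg, &hd->arch.pgtables.list);
    spin_unlock(&hd->arch.pgtables.lock);

    p = __map_domain_page(pg);
    clear_page(p);
    /* ... cache maintenance elided ... */
    unmap_domain_page(p);

    return pg;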

Jan

Patch

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 7e16d49bfd..101ef4aba5 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2304,7 +2304,9 @@  int domain_relinquish_resources(struct domain *d)
 
     PROGRESS(iommu_pagetables):
 
-        ret = iommu_free_pgtables(d);
+        iommu_free_pgtables(d);
+
+        ret = iommu_set_allocation(d, 0);
         if ( ret )
             return ret;
 
diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c
index b168073f10..de0cc52489 100644
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -134,21 +134,106 @@  void __hwdom_init arch_iommu_check_autotranslated_hwdom(struct domain *d)
         panic("PVH hardware domain iommu must be set in 'strict' mode\n");
 }
 
-int iommu_set_allocation(struct domain *d, unsigned nr_pages)
+static int destroy_pgtable(struct domain *d)
 {
+    struct domain_iommu *hd = dom_iommu(d);
+    struct page_info *pg;
+
+    if ( !hd->arch.pgtables.nr )
+    {
+        ASSERT_UNREACHABLE();
+        return -ENOENT;
+    }
+
+    pg = page_list_remove_head(&hd->arch.pgtables.free_list);
+    if ( !pg )
+        return -EBUSY;
+
+    hd->arch.pgtables.nr--;
+    free_domheap_page(pg);
+
     return 0;
 }
 
+static int create_pgtable(struct domain *d)
+{
+    struct domain_iommu *hd = dom_iommu(d);
+    unsigned int memflags = 0;
+    struct page_info *pg;
+
+#ifdef CONFIG_NUMA
+    if ( hd->node != NUMA_NO_NODE )
+        memflags = MEMF_node(hd->node);
+#endif
+
+    pg = alloc_domheap_page(NULL, memflags);
+    if ( !pg )
+        return -ENOMEM;
+
+    page_list_add(pg, &hd->arch.pgtables.free_list);
+    hd->arch.pgtables.nr++;
+
+    return 0;
+}
+
+static int set_allocation(struct domain *d, unsigned int nr_pages,
+                          bool allow_preempt)
+{
+    struct domain_iommu *hd = dom_iommu(d);
+    unsigned int done = 0;
+    int rc = 0;
+
+    spin_lock(&hd->arch.pgtables.lock);
+
+    while ( !rc )
+    {
+        if ( hd->arch.pgtables.nr < nr_pages )
+            rc = create_pgtable(d);
+        else if ( hd->arch.pgtables.nr > nr_pages )
+            rc = destroy_pgtable(d);
+        else
+            break;
+
+        if ( allow_preempt && !rc && !(++done & 0xff) &&
+             general_preempt_check() )
+            rc = -ERESTART;
+    }
+
+    spin_unlock(&hd->arch.pgtables.lock);
+
+    return rc;
+}
+
+int iommu_set_allocation(struct domain *d, unsigned int nr_pages)
+{
+    return set_allocation(d, nr_pages, true);
+}
+
+/*
+ * Some IOMMU mappings are set up during domain_create() before the tool-
+ * stack has a chance to calculate and set the appropriate page-table
+ * allocation. A hard-coded initial allocation covers this gap.
+ */
+#define INITIAL_ALLOCATION 256
+
 int arch_iommu_domain_init(struct domain *d)
 {
     struct domain_iommu *hd = dom_iommu(d);
 
     spin_lock_init(&hd->arch.mapping_lock);
 
+    INIT_PAGE_LIST_HEAD(&hd->arch.pgtables.free_list);
     INIT_PAGE_LIST_HEAD(&hd->arch.pgtables.list);
     spin_lock_init(&hd->arch.pgtables.lock);
 
-    return 0;
+    /*
+     * The hardware and quarantine domains are not subject to a quota
+     * and domains sharing EPT do not require any allocation.
+     */
+    if ( is_hardware_domain(d) || d == dom_io || iommu_use_hap_pt(d) )
+        return 0;
+
+    return set_allocation(d, INITIAL_ALLOCATION, false);
 }
 
 void arch_iommu_domain_destroy(struct domain *d)
@@ -265,38 +350,45 @@  void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
         return;
 }
 
-int iommu_free_pgtables(struct domain *d)
+void iommu_free_pgtables(struct domain *d)
 {
     struct domain_iommu *hd = dom_iommu(d);
-    struct page_info *pg;
-    unsigned int done = 0;
 
-    while ( (pg = page_list_remove_head(&hd->arch.pgtables.list)) )
-    {
-        free_domheap_page(pg);
+    spin_lock(&hd->arch.pgtables.lock);
 
-        if ( !(++done & 0xff) && general_preempt_check() )
-            return -ERESTART;
-    }
+    page_list_splice(&hd->arch.pgtables.list, &hd->arch.pgtables.free_list);
+    INIT_PAGE_LIST_HEAD(&hd->arch.pgtables.list);
 
-    return 0;
+    spin_unlock(&hd->arch.pgtables.lock);
 }
 
 struct page_info *iommu_alloc_pgtable(struct domain *d)
 {
     struct domain_iommu *hd = dom_iommu(d);
-    unsigned int memflags = 0;
     struct page_info *pg;
     void *p;
 
-#ifdef CONFIG_NUMA
-    if ( hd->node != NUMA_NO_NODE )
-        memflags = MEMF_node(hd->node);
-#endif
+    spin_lock(&hd->arch.pgtables.lock);
 
-    pg = alloc_domheap_page(NULL, memflags);
+ again:
+    pg = page_list_remove_head(&hd->arch.pgtables.free_list);
     if ( !pg )
+    {
+        /*
+         * The hardware and quarantine domains are not subject to a quota
+         * so create page-table pages on demand.
+         */
+        if ( is_hardware_domain(d) || d == dom_io )
+        {
+            int rc = create_pgtable(d);
+
+            if ( !rc )
+                goto again;
+        }
+
+        spin_unlock(&hd->arch.pgtables.lock);
         return NULL;
+    }
 
     p = __map_domain_page(pg);
     clear_page(p);
@@ -306,7 +398,6 @@  struct page_info *iommu_alloc_pgtable(struct domain *d)
 
     unmap_domain_page(p);
 
-    spin_lock(&hd->arch.pgtables.lock);
     page_list_add(pg, &hd->arch.pgtables.list);
     spin_unlock(&hd->arch.pgtables.lock);
 
diff --git a/xen/include/asm-x86/iommu.h b/xen/include/asm-x86/iommu.h
index d086f564af..3991b5601b 100644
--- a/xen/include/asm-x86/iommu.h
+++ b/xen/include/asm-x86/iommu.h
@@ -47,7 +47,8 @@  struct arch_iommu
 {
     spinlock_t mapping_lock; /* io page table lock */
     struct {
-        struct page_list_head list;
+        struct page_list_head list, free_list;
+        unsigned int nr;
         spinlock_t lock;
     } pgtables;
 
@@ -135,7 +136,7 @@  int pi_update_irte(const struct pi_desc *pi_desc, const struct pirq *pirq,
         iommu_vcall(ops, sync_cache, addr, size);       \
 })
 
-int __must_check iommu_free_pgtables(struct domain *d);
+void iommu_free_pgtables(struct domain *d);
 struct page_info *__must_check iommu_alloc_pgtable(struct domain *d);
 
 int __must_check iommu_set_allocation(struct domain *d, unsigned int nr_pages);