@@ -136,6 +136,9 @@ int iommu_identity_mapping(struct domain *d, struct iommu_context *ctx,
p2m_access_t p2ma, paddr_t base, paddr_t end,
unsigned int flag);
void iommu_identity_map_teardown(struct domain *d, struct iommu_context *ctx);
+bool iommu_identity_map_check(struct domain *d, struct iommu_context *ctx,
+ mfn_t mfn);
+
extern bool untrusted_msi;
@@ -37,6 +37,7 @@ obj-y += percpu.o
obj-$(CONFIG_PERF_COUNTERS) += perfc.o
obj-bin-$(CONFIG_HAS_PMAP) += pmap.init.o
obj-y += preempt.o
+obj-y += pv-iommu.o
obj-y += random.o
obj-y += rangeset.o
obj-y += radix-tree.o
new file mode 100644
@@ -0,0 +1,536 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * xen/common/pv-iommu.c
+ *
+ * PV-IOMMU hypercall interface.
+ */
+
+#include <xen/errno.h>
+#include <xen/mm.h>
+#include <xen/lib.h>
+#include <xen/iommu.h>
+#include <xen/sched.h>
+#include <xen/iocap.h>
+#include <xen/mm-frame.h>
+#include <xen/pci.h>
+#include <xen/guest_access.h>
+#include <asm/p2m.h>
+#include <asm/event.h>
+#include <asm/mm.h>
+#include <asm/iommu.h>
+#include <public/pv-iommu.h>
+
+#define PVIOMMU_PREFIX "[PV-IOMMU] "
+
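+/*
+ * Translate a guest frame into a host frame for mapping purposes.  For
+ * ordinary RAM a reference is taken on the backing page and returned through
+ * @page; for MMIO frames (or I/O memory the domain is permitted to access)
+ * no reference is taken and *page is left NULL.
+ */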
+static int get_paged_frame(struct domain *d, gfn_t gfn, mfn_t *mfn,
+ struct page_info **page, bool readonly)
+{
+ int ret = 0;
+ p2m_type_t p2mt = p2m_invalid;
+
+ #ifdef CONFIG_X86
+ p2m_query_t query = P2M_ALLOC;
+
+ if ( !readonly )
+ query |= P2M_UNSHARE;
+
+ *mfn = get_gfn_type(d, gfn_x(gfn), &p2mt, query);
+ #else
+ *mfn = p2m_lookup(d, gfn, &p2mt);
+ #endif
+
+ if ( mfn_eq(*mfn, INVALID_MFN) )
+ {
+ /* No mapping ? */
+ printk(XENLOG_G_WARNING PVIOMMU_PREFIX
+ "Trying to map to non-backed page frame (gfn=%"PRI_gfn
+ " p2mt=%d d%d)\n", gfn_x(gfn), p2mt, d->domain_id);
+
+ ret = -ENOENT;
+ }
+ else if ( p2m_is_any_ram(p2mt) && mfn_valid(*mfn) )
+ {
+ *page = get_page_from_mfn(*mfn, d);
+ ret = 0;
+ }
+ else if ( p2m_is_mmio(p2mt) ||
+ iomem_access_permitted(d, mfn_x(*mfn), mfn_x(*mfn)) )
+ {
+ *page = NULL;
+ ret = 0;
+ }
+ else
+ {
+ printk(XENLOG_G_WARNING PVIOMMU_PREFIX
+ "Unexpected p2mt %d (d%d gfn=%"PRI_gfn" mfn=%"PRI_mfn")\n",
+ p2mt, d->domain_id, gfn_x(gfn), mfn_x(*mfn));
+
+ ret = -EPERM;
+ }
+
+ put_gfn(d, gfn_x(gfn));
+ return ret;
+}
+
+static bool can_use_iommu_check(struct domain *d)
+{
+ if ( !is_iommu_enabled(d) )
+ {
+ printk(XENLOG_G_WARNING PVIOMMU_PREFIX
+ "IOMMU disabled for this domain\n");
+ return false;
+ }
+
+ if ( !dom_iommu(d)->allow_pv_iommu )
+ {
+ printk(XENLOG_G_WARNING PVIOMMU_PREFIX
+ "PV-IOMMU disabled for this domain\n");
+ return false;
+ }
+
+ return true;
+}
+
+static long capabilities_op(struct pv_iommu_capabilities *cap, struct domain *d)
+{
+ cap->max_ctx_no = d->iommu.other_contexts.count;
+ cap->max_iova_addr = iommu_get_max_iova(d);
+
+ cap->max_pasid = 0; /* TODO */
+ cap->cap_flags = 0;
+
+ cap->pgsize_mask = PAGE_SIZE_4K;
+
+ return 0;
+}
+
+static long init_op(struct pv_iommu_init *init, struct domain *d)
+{
+ /* iommu_domain_pviommu_init() takes a uint16_t context count. */
+ if ( init->max_ctx_no >= UINT16_MAX )
+ return -E2BIG;
+
+ return iommu_domain_pviommu_init(d, init->max_ctx_no + 1, init->arena_order);
+}
+
+static long alloc_context_op(struct pv_iommu_alloc *alloc, struct domain *d)
+{
+ u16 ctx_no = 0;
+ int status = 0;
+
+ status = iommu_context_alloc(d, &ctx_no, 0);
+
+ if ( status )
+ return status;
+
+ printk(XENLOG_G_INFO PVIOMMU_PREFIX
+ "Created IOMMU context %hu in d%d\n", ctx_no, d->domain_id);
+
+ alloc->ctx_no = ctx_no;
+ return 0;
+}
+
+static long free_context_op(struct pv_iommu_free *free, struct domain *d)
+{
+ int flags = IOMMU_TEARDOWN_PREEMPT;
+
+ if ( !free->ctx_no )
+ return -EINVAL;
+
+ if ( free->free_flags & IOMMU_FREE_reattach_default )
+ flags |= IOMMU_TEARDOWN_REATTACH_DEFAULT;
+
+ return iommu_context_free(d, free->ctx_no, flags);
+}
+
+static long reattach_device_op(struct pv_iommu_reattach_device *reattach,
+ struct domain *d)
+{
+ int ret;
+ device_t *pdev;
+ struct physdev_pci_device dev = reattach->dev;
+
+ pcidevs_lock();
+ pdev = pci_get_pdev(d, PCI_SBDF(dev.seg, dev.bus, dev.devfn));
+
+ if ( !pdev )
+ {
+ pcidevs_unlock();
+ return -ENOENT;
+ }
+
+ ret = iommu_reattach_context(d, d, pdev, reattach->ctx_no);
+
+ pcidevs_unlock();
+ return ret;
+}
+
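+/*
+ * Map a range of guest frames into an IOMMU context, one 4k page at a time.
+ * Each backing page is referenced, existing mappings and reserved identity
+ * ranges are refused, and the loop periodically checks for preemption,
+ * returning -ERESTART with gfn/dfn/nr_pages advanced so the hypercall can be
+ * continued.
+ */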
+static long map_pages_op(struct pv_iommu_map_pages *map, struct domain *d)
+{
+ struct iommu_context *ctx;
+ int ret = 0, flush_ret;
+ struct page_info *page = NULL;
+ mfn_t mfn, mfn_lookup;
+ unsigned int flags = 0, lookup_flags, flush_flags = 0;
+ size_t i = 0;
+ dfn_t dfn0 = _dfn(map->dfn); /* original map->dfn */
+
+ if ( !map->ctx_no || !(ctx = iommu_get_context(d, map->ctx_no)) )
+ return -EINVAL;
+
+ if ( map->map_flags & IOMMU_MAP_readable )
+ flags |= IOMMUF_readable;
+
+ if ( map->map_flags & IOMMU_MAP_writeable )
+ flags |= IOMMUF_writable;
+
+ for ( i = 0; i < map->nr_pages; i++ )
+ {
+ gfn_t gfn = _gfn(map->gfn + i);
+ dfn_t dfn = _dfn(map->dfn + i);
+
+#ifdef CONFIG_X86
+ if ( iommu_identity_map_check(d, ctx, _mfn(map->dfn + i)) )
+ {
+ ret = -EADDRNOTAVAIL;
+ break;
+ }
+#endif
+
+ ret = get_paged_frame(d, gfn, &mfn, &page, 0);
+
+ if ( ret )
+ break;
+
+ /* Check for conflict with existing mappings */
+ if ( !iommu_lookup_page(d, dfn, &mfn_lookup, &lookup_flags, map->ctx_no) )
+ {
+ if ( page )
+ put_page(page);
+
+ ret = -EADDRINUSE;
+ break;
+ }
+
+ ret = iommu_map(d, dfn, mfn, 1, flags, &flush_flags, map->ctx_no);
+
+ if ( ret )
+ {
+ if ( page )
+ put_page(page);
+
+ break;
+ }
+
+ map->mapped++;
+
+ if ( (i & 0xff) && hypercall_preempt_check() )
+ {
+ i++;
+
+ map->gfn += i;
+ map->dfn += i;
+ map->nr_pages -= i;
+
+ ret = -ERESTART;
+ break;
+ }
+ }
+
+ flush_ret = iommu_iotlb_flush(d, dfn0, i, flush_flags, map->ctx_no);
+
+ iommu_put_context(ctx);
+
+ if ( flush_ret )
+ printk(XENLOG_G_WARNING PVIOMMU_PREFIX
+ "Flush operation failed for d%dc%d (%d)\n", d->domain_id,
+ ctx->id, flush_ret);
+
+ return ret;
+}
+
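+/*
+ * Unmap a range of device frames from an IOMMU context, dropping the page
+ * references taken at map time.  Preemption is handled as in map_pages_op.
+ */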
+static long unmap_pages_op(struct pv_iommu_unmap_pages *unmap, struct domain *d)
+{
+ struct iommu_context *ctx;
+ mfn_t mfn;
+ int ret = 0, flush_ret;
+ unsigned int flags, flush_flags = 0;
+ size_t i = 0;
+ dfn_t dfn0 = _dfn(unmap->dfn); /* original unmap->dfn */
+
+ if ( !unmap->ctx_no || !(ctx = iommu_get_context(d, unmap->ctx_no)) )
+ return -EINVAL;
+
+ for ( i = 0; i < unmap->nr_pages; i++ )
+ {
+ dfn_t dfn = _dfn(unmap->dfn + i);
+
+#ifdef CONFIG_X86
+ if ( iommu_identity_map_check(d, ctx, _mfn(unmap->dfn + i)) )
+ {
+ ret = -EADDRNOTAVAIL;
+ break;
+ }
+#endif
+
+ /* Check if there is a valid mapping for this domain */
+ if ( iommu_lookup_page(d, dfn, &mfn, &flags, unmap->ctx_no) )
+ {
+ ret = -ENOENT;
+ break;
+ }
+
+ ret = iommu_unmap(d, dfn, 1, 0, &flush_flags, unmap->ctx_no);
+
+ if ( ret )
+ break;
+
+ unmap->unmapped++;
+
+ /* Decrement reference counter (if needed) */
+ if ( mfn_valid(mfn) )
+ put_page(mfn_to_page(mfn));
+
+ if ( (i & 0xff) && hypercall_preempt_check() )
+ {
+ i++;
+
+ unmap->dfn += i;
+ unmap->nr_pages -= i;
+
+ ret = -ERESTART;
+ break;
+ }
+ }
+
+ flush_ret = iommu_iotlb_flush(d, dfn0, i, flush_flags, unmap->ctx_no);
+
+ iommu_put_context(ctx);
+
+ if ( flush_ret )
+ printk(XENLOG_G_WARNING PVIOMMU_PREFIX
+ "Flush operation failed for d%dc%d (%d)\n", d->domain_id,
+ ctx->id, flush_ret);
+
+ return ret;
+}
+
+static long do_iommu_subop(int subop, XEN_GUEST_HANDLE_PARAM(void) arg,
+ struct domain *d, bool remote);
+
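+/*
+ * Run a PV-IOMMU subop on behalf of another domain.  Restricted to the
+ * hardware domain for now; the 'remote' flag passed to do_iommu_subop()
+ * keeps IOMMU_remote_cmd from nesting.
+ */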
+static long remote_cmd_op(struct pv_iommu_remote_cmd *remote_cmd,
+ struct domain *current_domain)
+{
+ long ret = 0;
+ struct domain *d;
+
+ /* TODO: use a better permission logic */
+ if ( !is_hardware_domain(current_domain) )
+ return -EPERM;
+
+ d = get_domain_by_id(remote_cmd->domid);
+
+ if ( !d )
+ return -ENOENT;
+
+ ret = do_iommu_subop(remote_cmd->subop, remote_cmd->arg, d, true);
+
+ put_domain(d);
+
+ return ret;
+}
+
+static long do_iommu_subop(int subop, XEN_GUEST_HANDLE_PARAM(void) arg,
+ struct domain *d, bool remote)
+{
+ long ret = 0;
+
+ switch ( subop )
+ {
+ case IOMMU_noop:
+ break;
+
+ case IOMMU_query_capabilities:
+ {
+ struct pv_iommu_capabilities cap;
+
+ ret = capabilities_op(&cap, d);
+
+ if ( unlikely(copy_to_guest(arg, &cap, 1)) )
+ ret = -EFAULT;
+
+ break;
+ }
+
+ case IOMMU_init:
+ {
+ struct pv_iommu_init init;
+
+ if ( unlikely(copy_from_guest(&init, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = init_op(&init, d);
+ break;
+ }
+
+ case IOMMU_alloc_context:
+ {
+ struct pv_iommu_alloc alloc;
+
+ if ( unlikely(copy_from_guest(&alloc, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = alloc_context_op(&alloc, d);
+
+ if ( unlikely(copy_to_guest(arg, &alloc, 1)) )
+ ret = -EFAULT;
+
+ break;
+ }
+
+ case IOMMU_free_context:
+ {
+ struct pv_iommu_free free;
+
+ if ( unlikely(copy_from_guest(&free, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = free_context_op(&free, d);
+ break;
+ }
+
+ case IOMMU_reattach_device:
+ {
+ struct pv_iommu_reattach_device reattach;
+
+ if ( unlikely(copy_from_guest(&reattach, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = reattach_device_op(&reattach, d);
+ break;
+ }
+
+ case IOMMU_map_pages:
+ {
+ struct pv_iommu_map_pages map;
+
+ if ( unlikely(copy_from_guest(&map, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = map_pages_op(&map, d);
+
+ if ( unlikely(copy_to_guest(arg, &map, 1)) )
+ ret = -EFAULT;
+
+ break;
+ }
+
+ case IOMMU_unmap_pages:
+ {
+ struct pv_iommu_unmap_pages unmap;
+
+ if ( unlikely(copy_from_guest(&unmap, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = unmap_pages_op(&unmap, d);
+
+ if ( unlikely(copy_to_guest(arg, &unmap, 1)) )
+ ret = -EFAULT;
+
+ break;
+ }
+
+ case IOMMU_remote_cmd:
+ {
+ struct pv_iommu_remote_cmd remote_cmd;
+
+ if ( remote )
+ {
+ /* Prevent remote_cmd from being called recursively */
+ ret = -EINVAL;
+ break;
+ }
+
+ if ( unlikely(copy_from_guest(&remote_cmd, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = remote_cmd_op(&remote_cmd, d);
+ break;
+ }
+
+ /*
+ * TODO
+ */
+ case IOMMU_alloc_nested:
+ {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ case IOMMU_flush_nested:
+ {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ case IOMMU_attach_pasid:
+ {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ case IOMMU_detach_pasid:
+ {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return ret;
+}
+
+long do_iommu_op(unsigned int subop, XEN_GUEST_HANDLE_PARAM(void) arg)
+{
+ long ret = 0;
+
+ if ( !can_use_iommu_check(current->domain) )
+ return -ENODEV;
+
+ ret = do_iommu_subop(subop, arg, current->domain, false);
+
+ if ( ret == -ERESTART )
+ return hypercall_create_continuation(__HYPERVISOR_iommu_op, "ih", subop, arg);
+
+ return ret;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
@@ -714,6 +714,20 @@ static void cf_check amd_dump_page_tables(struct domain *d)
hd->arch.amd.paging_mode, 0, 0);
}
+static uint64_t cf_check amd_get_max_iova(struct domain *d)
+{
+ struct domain_iommu *hd = dom_iommu(d);
+ unsigned int bits = 12 + hd->arch.amd.paging_mode * 9;
+
+ /*
+ * If paging_mode == 6 (i.e. 6-level page tables), bits == 66 while the
+ * GPA space is still only 64 bits wide.
+ */
+ if ( bits >= 64 )
+ return ~0LLU;
+
+ return (1LLU << bits) - 1;
+}
+
static const struct iommu_ops __initconst_cf_clobber _iommu_ops = {
.page_sizes = PAGE_SIZE_4K | PAGE_SIZE_2M | PAGE_SIZE_1G,
.init = amd_iommu_domain_init,
@@ -742,6 +756,7 @@ static const struct iommu_ops __initconst_cf_clobber _iommu_ops = {
.crash_shutdown = amd_iommu_crash_shutdown,
.get_reserved_device_memory = amd_iommu_get_reserved_device_memory,
.dump_page_tables = amd_dump_page_tables,
+ .get_max_iova = amd_get_max_iova,
};
static const struct iommu_init_ops __initconstrel _iommu_init_ops = {
@@ -193,6 +193,99 @@ static void __hwdom_init check_hwdom_reqs(struct domain *d)
arch_iommu_check_autotranslated_hwdom(d);
}
+
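+/*
+ * Set up per-domain PV-IOMMU state: the bitmap and array tracking the
+ * nb_ctx - 1 non-default contexts (context 0 is the always-present default
+ * context) and the page-table arena of 2^arena_order pages.  May only be
+ * done once per domain.
+ */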
+int iommu_domain_pviommu_init(struct domain *d, uint16_t nb_ctx, uint32_t arena_order)
+{
+ struct domain_iommu *hd = dom_iommu(d);
+ int rc;
+
+ BUG_ON(nb_ctx == 0); /* sanity check (prevent underflow) */
+
+ /*
+ * hd->other_contexts.count is always reported as 0 during initialization
+ * preventing misuse of partially initialized IOMMU contexts.
+ */
+
+ if ( atomic_cmpxchg(&hd->other_contexts.initialized, 0, 1) == 1 )
+ return -EACCES;
+
+ if ( nb_ctx > 1 )
+ {
+ /* Initialize context bitmap */
+ size_t i;
+
+ hd->other_contexts.bitmap = xzalloc_array(unsigned long,
+ BITS_TO_LONGS(nb_ctx - 1));
+
+ if ( !hd->other_contexts.bitmap )
+ {
+ rc = -ENOMEM;
+ goto cleanup;
+ }
+
+ hd->other_contexts.map = xzalloc_array(struct iommu_context, nb_ctx - 1);
+
+ if ( !hd->other_contexts.map )
+ {
+ rc = -ENOMEM;
+ goto cleanup;
+ }
+
+ for ( i = 0; i < nb_ctx - 1; i++ )
+ rspin_lock_init(&hd->other_contexts.map[i].lock);
+ }
+
+ rc = arch_iommu_pviommu_init(d, nb_ctx, arena_order);
+
+ if ( rc )
+ goto cleanup;
+
+ /* Make sure initialization is complete before making it visible to other CPUs. */
+ smp_wmb();
+
+ hd->other_contexts.count = nb_ctx - 1;
+
+ printk(XENLOG_INFO "Dom%d uses %lu IOMMU contexts (%llu pages arena)\n",
+ d->domain_id, (unsigned long)nb_ctx, 1llu << arena_order);
+
+ return 0;
+
+cleanup:
+ /* TODO: Reset hd->other_contexts.initialized */
+ if ( hd->other_contexts.bitmap )
+ {
+ xfree(hd->other_contexts.bitmap);
+ hd->other_contexts.bitmap = NULL;
+ }
+
+ if ( hd->other_contexts.map )
+ {
+ xfree(hd->other_contexts.map);
+ hd->other_contexts.map = NULL;
+ }
+
+ return rc;
+}
+
+int iommu_domain_pviommu_teardown(struct domain *d)
+{
+ struct domain_iommu *hd = dom_iommu(d);
+ unsigned int i;
+ /* FIXME: Potential race condition with remote_op ? */
+
+ for ( i = 0; i < hd->other_contexts.count; i++ )
+ {
+ int rc = iommu_context_free(d, i, IOMMU_TEARDOWN_REATTACH_DEFAULT);
+
+ /* Freeing a context that was never allocated yields -ENOENT; ignore it. */
+ WARN_ON(rc && rc != -ENOENT);
+ }
+
+ hd->other_contexts.count = 0;
+
+ if ( hd->other_contexts.bitmap )
+ xfree(hd->other_contexts.bitmap);
+
+ if ( hd->other_contexts.map )
+ xfree(hd->other_contexts.map);
+
+ return 0;
+}
+
int iommu_domain_init(struct domain *d, unsigned int opts)
{
struct domain_iommu *hd = dom_iommu(d);
@@ -238,6 +331,8 @@ int iommu_domain_init(struct domain *d, unsigned int opts)
ASSERT(!(hd->need_sync && hd->hap_pt_share));
+ hd->allow_pv_iommu = true;
+
rspin_lock(&hd->default_ctx.lock);
ret = iommu_context_init(d, &hd->default_ctx, 0, IOMMU_CONTEXT_INIT_default);
rspin_unlock(&hd->default_ctx.lock);
@@ -1204,6 +1299,16 @@ bool iommu_has_feature(struct domain *d, enum iommu_feature feature)
return is_iommu_enabled(d) && test_bit(feature, dom_iommu(d)->features);
}
+uint64_t iommu_get_max_iova(struct domain *d)
+{
+ struct domain_iommu *hd = dom_iommu(d);
+
+ if ( !hd->platform_ops->get_max_iova )
+ return 0;
+
+ return iommu_call(hd->platform_ops, get_max_iova, d);
+}
+
#define MAX_EXTRA_RESERVED_RANGES 20
struct extra_reserved_range {
unsigned long start;
@@ -2605,6 +2605,13 @@ static int cf_check intel_iommu_remove_devfn(struct domain *d, struct pci_dev *p
return unapply_context_single(d, drhd->iommu, NULL, pdev->bus, devfn);
}
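+
+/*
+ * Highest IOVA the domain can use, derived from the adjusted guest address
+ * width (AGAW) of the domain's VT-d page tables.
+ */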
+static uint64_t cf_check intel_iommu_get_max_iova(struct domain *d)
+{
+ struct domain_iommu *hd = dom_iommu(d);
+
+ return (1LLU << agaw_to_width(hd->arch.vtd.agaw)) - 1;
+}
+
static const struct iommu_ops __initconst_cf_clobber vtd_ops = {
.page_sizes = PAGE_SIZE_4K,
.init = intel_iommu_domain_init,
@@ -2636,6 +2643,7 @@ static const struct iommu_ops __initconst_cf_clobber vtd_ops = {
.iotlb_flush = iommu_flush_iotlb,
.get_reserved_device_memory = intel_iommu_get_reserved_device_memory,
.dump_page_tables = vtd_dump_page_tables,
+ .get_max_iova = intel_iommu_get_max_iova,
};
const struct iommu_init_ops __initconstrel intel_iommu_init_ops = {
@@ -215,6 +215,32 @@ int arch_iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32
return 0;
}
+int arch_iommu_pviommu_init(struct domain *d, uint16_t nb_ctx, uint32_t arena_order)
+{
+ struct domain_iommu *hd = dom_iommu(d);
+
+ if ( arena_order == 0 )
+ return 0;
+
+ return iommu_arena_initialize(&hd->arch.pt_arena, NULL, arena_order, 0);
+}
+
+int arch_iommu_pviommu_teardown(struct domain *d)
+{
+ struct domain_iommu *hd = dom_iommu(d);
+
+ if ( iommu_arena_teardown(&hd->arch.pt_arena, true) )
+ {
+ printk(XENLOG_WARNING "IOMMU Arena used while being destroyed\n");
+ WARN();
+
+ /* Teardown anyway */
+ iommu_arena_teardown(&hd->arch.pt_arena, false);
+ }
+
+ return 0;
+}
+
void arch_iommu_domain_destroy(struct domain *d)
{
}
@@ -394,6 +420,19 @@ void iommu_identity_map_teardown(struct domain *d, struct iommu_context *ctx)
}
}
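+
+/*
+ * Return true if @mfn falls within one of the reserved identity-mapped
+ * ranges (e.g. RMRR/IVMD regions) of @ctx; such frames must not be remapped
+ * through PV-IOMMU.
+ */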
+bool iommu_identity_map_check(struct domain *d, struct iommu_context *ctx,
+ mfn_t mfn)
+{
+ struct identity_map *map;
+ uint64_t addr = pfn_to_paddr(mfn_x(mfn));
+
+ list_for_each_entry ( map, &ctx->arch.identity_maps, list )
+ if ( addr >= map->base && addr < map->end )
+ return true;
+
+ return false;
+}
+
static int __hwdom_init cf_check map_subtract(unsigned long s, unsigned long e,
void *data)
{
@@ -669,7 +708,7 @@ void iommu_free_domid(domid_t domid, unsigned long *map)
BUG();
}
-int iommu_free_pgtables(struct domain *d, struct iommu_context *ctx)
+int cf_check iommu_free_pgtables(struct domain *d, struct iommu_context *ctx)
{
struct domain_iommu *hd = dom_iommu(d);
struct page_info *pg;
@@ -686,7 +725,10 @@ int iommu_free_pgtables(struct domain *d, struct iommu_context *ctx)
while ( (pg = page_list_remove_head(&ctx->arch.pgtables)) )
{
- free_domheap_page(pg);
+ if ( ctx->id == 0 )
+ free_domheap_page(pg);
+ else
+ iommu_arena_free_page(&hd->arch.pt_arena, pg);
if ( !(++done & 0xff) && general_preempt_check() )
return -ERESTART;
@@ -708,7 +750,11 @@ struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd,
memflags = MEMF_node(hd->node);
#endif
- pg = alloc_domheap_page(NULL, memflags);
+ if ( ctx->id == 0 )
+ pg = alloc_domheap_page(NULL, memflags);
+ else
+ pg = iommu_arena_allocate_page(&hd->arch.pt_arena);
+
if ( !pg )
return NULL;
@@ -787,9 +833,14 @@ void iommu_queue_free_pgtable(struct domain *d, struct iommu_context *ctx,
page_list_del(pg, &ctx->arch.pgtables);
- page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu));
+ if ( !ctx->id )
+ {
+ page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu));
- tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu));
+ tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu));
+ }
+ else
+ iommu_arena_free_page(&dom_iommu(d)->arch.pt_arena, pg);
}
static int cf_check cpu_callback(
@@ -209,6 +209,9 @@ hypfs_op(unsigned int cmd, const char *arg1, unsigned long arg2, void *arg3, uns
#ifdef CONFIG_X86
xenpmu_op(unsigned int op, xen_pmu_params_t *arg)
#endif
+#ifdef CONFIG_HAS_PASSTHROUGH
+iommu_op(unsigned int subop, void *arg)
+#endif
#ifdef CONFIG_PV
caller: pv64
@@ -295,5 +298,8 @@ mca do do - - -
#ifndef CONFIG_PV_SHIM_EXCLUSIVE
paging_domctl_cont do do do do -
#endif
+#ifdef CONFIG_HAS_PASSTHROUGH
+iommu_op do do do do -
+#endif
#endif /* !CPPCHECK */
new file mode 100644
@@ -0,0 +1,343 @@
+/* SPDX-License-Identifier: MIT */
+/**
+ * pv-iommu.h
+ *
+ * Paravirtualized IOMMU driver interface.
+ *
+ * Copyright (c) 2024 Teddy Astie <teddy.astie@vates.tech>
+ */
+
+#ifndef __XEN_PUBLIC_PV_IOMMU_H__
+#define __XEN_PUBLIC_PV_IOMMU_H__
+
+#include "xen.h"
+#include "physdev.h"
+
+#ifndef uint64_aligned_t
+#define uint64_aligned_t uint64_t
+#endif
+
+#define IOMMU_DEFAULT_CONTEXT (0)
+
+enum pv_iommu_cmd {
+ /* Basic cmd */
+ IOMMU_noop = 0,
+ IOMMU_query_capabilities = 1,
+ IOMMU_init = 2,
+ IOMMU_alloc_context = 3,
+ IOMMU_free_context = 4,
+ IOMMU_reattach_device = 5,
+ IOMMU_map_pages = 6,
+ IOMMU_unmap_pages = 7,
+ IOMMU_remote_cmd = 8,
+
+ /* Extended cmd */
+ IOMMU_alloc_nested = 9, /* if IOMMUCAP_nested */
+ IOMMU_flush_nested = 10, /* if IOMMUCAP_nested */
+ IOMMU_attach_pasid = 11, /* if IOMMUCAP_pasid */
+ IOMMU_detach_pasid = 12, /* if IOMMUCAP_pasid */
+};
+
+/**
+ * If set, the default context allows DMA to domain memory.
+ * If cleared, the default context blocks all DMA to domain memory.
+ */
+#define IOMMUCAP_default_identity (1U << 0)
+
+/**
+ * IOMMU_MAP_cache support.
+ */
+#define IOMMUCAP_cache (1U << 1)
+
+/**
+ * If set, IOMMU_alloc_nested and IOMMU_flush_nested are supported.
+ */
+#define IOMMUCAP_nested (1U << 2)
+
+/**
+ * If set, IOMMU_attach_pasid and IOMMU_detach_pasid are supported and
+ * a device PASID can be specified in reattach_context.
+ */
+#define IOMMUCAP_pasid (1U << 3)
+
+/**
+ * If set, IOMMU_ALLOC_identity is supported in pv_iommu_alloc.
+ */
+#define IOMMUCAP_identity (1U << 4)
+
+/**
+ * IOMMU_query_capabilities
+ * Query PV-IOMMU capabilities for this domain.
+ */
+struct pv_iommu_capabilities {
+ /*
+ * OUT: Maximum device address (iova) that the guest can use for mappings.
+ */
+ uint64_aligned_t max_iova_addr;
+
+ /* OUT: IOMMU capabilities flags */
+ uint32_t cap_flags;
+
+ /* OUT: Mask of all supported page sizes. */
+ uint32_t pgsize_mask;
+
+ /* OUT: Maximum pasid (if IOMMUCAP_pasid) */
+ uint32_t max_pasid;
+
+ /* OUT: Maximum number of IOMMU contexts this domain can use. */
+ uint16_t max_ctx_no;
+
+ uint16_t pad0;
+};
+typedef struct pv_iommu_capabilities pv_iommu_capabilities_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_capabilities_t);
+
+/**
+ * IOMMU_init
+ * Initialize PV-IOMMU for this domain.
+ *
+ * Fails with -EACCES if PV-IOMMU is already initialized.
+ */
+struct pv_iommu_init {
+ /* IN: Maximum number of IOMMU contexts this domain can use. */
+ uint32_t max_ctx_no;
+
+ /* IN: Arena size, as an order (log2 of the number of pages) */
+ uint32_t arena_order;
+};
+typedef struct pv_iommu_init pv_iommu_init_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_init_t);
+
+/**
+ * Create a context with a 1:1 identity mapping to domain memory
+ * (needs IOMMUCAP_identity).
+ */
+#define IOMMU_ALLOC_identity (1 << 0)
+
+/**
+ * IOMMU_alloc_context
+ * Allocate an IOMMU context.
+ * Fails with -ENOSPC if no context number is available.
+ */
+struct pv_iommu_alloc {
+ /* OUT: allocated IOMMU context number */
+ uint16_t ctx_no;
+
+ /* IN: allocation flags */
+ uint32_t alloc_flags;
+};
+typedef struct pv_iommu_alloc pv_iommu_alloc_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_alloc_t);
+
+/**
+ * Move all devices to default context before freeing the context.
+ */
+#define IOMMU_FREE_reattach_default (1 << 0)
+
+/**
+ * IOMMU_free_context
+ * Destroy an IOMMU context.
+ *
+ * If IOMMU_FREE_reattach_default is specified, move all context devices to
+ * default context before destroying this context.
+ *
+ * If there are devices in the context and IOMMU_FREE_reattach_default is not
+ * specified, fail with -EBUSY.
+ *
+ * The default context can't be destroyed.
+ */
+struct pv_iommu_free {
+ /* IN: IOMMU context number to free */
+ uint16_t ctx_no;
+
+ /* IN: Free operation specific flags */
+ uint32_t free_flags;
+};
+typedef struct pv_iommu_free pv_iommu_free_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_free_t);
+
+/* Device has read access */
+#define IOMMU_MAP_readable (1 << 0)
+
+/* Device has write access */
+#define IOMMU_MAP_writeable (1 << 1)
+
+/* Enforce DMA coherency */
+#define IOMMU_MAP_cache (1 << 2)
+
+/**
+ * IOMMU_map_pages
+ * Map pages in an IOMMU context.
+ *
+ * pgsize must be supported by pgsize_mask.
+ * Fails with -EADDRINUSE when mapping on top of an existing mapping.
+ * Reports the number of pages actually mapped in the mapped field, even on
+ * failure.
+ */
+struct pv_iommu_map_pages {
+ /* IN: IOMMU context number */
+ uint16_t ctx_no;
+
+ /* IN: Guest frame number */
+ uint64_aligned_t gfn;
+
+ /* IN: Device frame number */
+ uint64_aligned_t dfn;
+
+ /* IN: Map flags */
+ uint32_t map_flags;
+
+ /* IN: Size of pages to map */
+ uint32_t pgsize;
+
+ /* IN: Number of pages to map */
+ uint32_t nr_pages;
+
+ /* OUT: Number of pages actually mapped */
+ uint32_t mapped;
+};
+typedef struct pv_iommu_map_pages pv_iommu_map_pages_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_map_pages_t);
+
+/**
+ * IOMMU_unmap_pages
+ * Unmap pages from an IOMMU context.
+ *
+ * pgsize must be supported by pgsize_mask.
+ * Reports the number of pages actually unmapped in the unmapped field, even
+ * on failure.
+ * Fails with -ENOENT when attempting to unmap a page without any mapping.
+ */
+struct pv_iommu_unmap_pages {
+ /* IN: IOMMU context number */
+ uint16_t ctx_no;
+
+ /* IN: Device frame number */
+ uint64_aligned_t dfn;
+
+ /* IN: Size of pages to unmap */
+ uint32_t pgsize;
+
+ /* IN: Number of pages to unmap */
+ uint32_t nr_pages;
+
+ /* OUT: Number of pages actually unmapped */
+ uint32_t unmapped;
+};
+typedef struct pv_iommu_unmap_pages pv_iommu_unmap_pages_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_unmap_pages_t);
+
+/**
+ * IOMMU_reattach_device
+ * Reattach a device to another IOMMU context.
+ * Fails with -ENOENT if no such device exists.
+ */
+struct pv_iommu_reattach_device {
+ /* IN: Target IOMMU context number */
+ uint16_t ctx_no;
+
+ /* IN: Physical device to move */
+ struct physdev_pci_device dev;
+
+ /* IN: PASID of the device (if IOMMUCAP_pasid) */
+ uint32_t pasid;
+};
+typedef struct pv_iommu_reattach_device pv_iommu_reattach_device_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_reattach_device_t);
+
+
+/**
+ * IOMMU_remote_cmd
+ * Perform a PV-IOMMU operation on another domain.
+ * The current domain must be allowed to act on the target domain; otherwise,
+ * the operation fails with -EPERM.
+ */
+struct pv_iommu_remote_cmd {
+ /* IN: Target domain to do the subop on */
+ uint16_t domid;
+
+ /* IN: Command to do on target domain. */
+ uint16_t subop;
+
+ /* INOUT: Command argument from current domain memory */
+ XEN_GUEST_HANDLE(void) arg;
+};
+typedef struct pv_iommu_remote_cmd pv_iommu_remote_cmd_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_remote_cmd_t);
+
+/**
+ * IOMMU_alloc_nested
+ * Create a nested IOMMU context (needs IOMMUCAP_nested).
+ *
+ * This context uses a platform-specific page table from the domain address
+ * space, specified by pgtable_gfn, for nested translations.
+ *
+ * Explicit flushes need to be submitted with IOMMU_flush_nested after
+ * modifying the nested page table, to ensure coherency between the IOTLB
+ * and the nested page table.
+ *
+ * This context can be destroyed using IOMMU_free_context.
+ * This context cannot be modified using IOMMU_map_pages or IOMMU_unmap_pages.
+ */
+struct pv_iommu_alloc_nested {
+ /* OUT: allocated IOMMU context number */
+ uint16_t ctx_no;
+
+ /* IN: guest frame number of the nested page table */
+ uint64_aligned_t pgtable_gfn;
+
+ /* IN: nested mode flags */
+ uint64_aligned_t nested_flags;
+};
+typedef struct pv_iommu_alloc_nested pv_iommu_alloc_nested_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_alloc_nested_t);
+
+/**
+ * IOMMU_flush_nested (needs IOMMUCAP_nested)
+ * Flush the IOTLB for nested translation.
+ */
+struct pv_iommu_flush_nested {
+ /* TODO */
+};
+typedef struct pv_iommu_flush_nested pv_iommu_flush_nested_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_flush_nested_t);
+
+/**
+ * IOMMU_attach_pasid (needs IOMMUCAP_pasid)
+ * Attach a new device-with-pasid to an IOMMU context.
+ * If a matching device-with-pasid already exists (globally),
+ * fails with -EEXIST.
+ * If pasid is 0, fails with -EINVAL.
+ * If the physical device doesn't exist in the domain, fails with -ENOENT.
+ */
+struct pv_iommu_attach_pasid {
+ /* IN: IOMMU context to add the device-with-pasid in */
+ uint16_t ctx_no;
+
+ /* IN: Physical device */
+ struct physdev_pci_device dev;
+
+ /* IN: pasid of the device to attach */
+ uint32_t pasid;
+};
+typedef struct pv_iommu_attach_pasid pv_iommu_attach_pasid_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_attach_pasid_t);
+
+/**
+ * IOMMU_detach_pasid (needs IOMMUCAP_pasid)
+ * Detach a device-with-pasid.
+ * If the device-with-pasid doesn't exist or doesn't belong to the domain,
+ * fails with -ENOENT.
+ * If pasid is 0, fails with -EINVAL.
+ */
+struct pv_iommu_detach_pasid {
+ /* IN: Physical device */
+ struct physdev_pci_device dev;
+
+ /* IN: pasid of the device to detach */
+ uint32_t pasid;
+};
+typedef struct pv_iommu_detach_pasid pv_iommu_detach_pasid_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_detach_pasid_t);
+
+/* long do_iommu_op(int subop, XEN_GUEST_HANDLE_PARAM(void) arg) */
+
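+/*
+ * Illustrative, non-normative usage sketch.  It assumes a guest-provided
+ * hypercall_iommu_op(subop, arg) wrapper around __HYPERVISOR_iommu_op
+ * (such a wrapper is not part of this interface):
+ *
+ *   struct pv_iommu_capabilities cap = {};
+ *   struct pv_iommu_init init = { .max_ctx_no = 8, .arena_order = 9 };
+ *   struct pv_iommu_alloc alloc = {};
+ *   struct pv_iommu_map_pages map = {
+ *       .gfn = gfn, .dfn = dfn, .pgsize = 4096, .nr_pages = 1,
+ *       .map_flags = IOMMU_MAP_readable | IOMMU_MAP_writeable,
+ *   };
+ *
+ *   hypercall_iommu_op(IOMMU_query_capabilities, &cap);
+ *   hypercall_iommu_op(IOMMU_init, &init);
+ *   hypercall_iommu_op(IOMMU_alloc_context, &alloc);
+ *   map.ctx_no = alloc.ctx_no;
+ *   hypercall_iommu_op(IOMMU_map_pages, &map);
+ */
+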
+#endif /* __XEN_PUBLIC_PV_IOMMU_H__ */
@@ -118,6 +118,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_ulong_t);
#define __HYPERVISOR_xenpmu_op 40
#define __HYPERVISOR_dm_op 41
#define __HYPERVISOR_hypfs_op 42
+#define __HYPERVISOR_iommu_op 43
/* Architecture-specific hypercall definitions. */
#define __HYPERVISOR_arch_0 48
@@ -328,6 +328,8 @@ struct iommu_ops {
*/
int (*dt_xlate)(device_t *dev, const struct dt_phandle_args *args);
#endif
+
+ uint64_t (*get_max_iova)(struct domain *d);
};
/*
@@ -409,6 +411,10 @@ struct domain_iommu {
/* SAF-2-safe enum constant in arithmetic operation */
DECLARE_BITMAP(features, IOMMU_FEAT_count);
+
+ /* Is the domain allowed to use PV-IOMMU? */
+ bool allow_pv_iommu;
+
/* Does the guest share HAP mapping with the IOMMU? */
bool hap_pt_share;
@@ -446,6 +452,8 @@ static inline int iommu_do_domctl(struct xen_domctl *domctl, struct domain *d,
}
#endif
+int iommu_domain_pviommu_init(struct domain *d, uint16_t nb_ctx, uint32_t arena_order);
+
int __must_check iommu_suspend(void);
void iommu_resume(void);
void iommu_crash_shutdown(void);
@@ -461,6 +469,7 @@ int iommu_do_pci_domctl(struct xen_domctl *domctl, struct domain *d,
void iommu_dev_iotlb_flush_timeout(struct domain *d, struct pci_dev *pdev);
+uint64_t iommu_get_max_iova(struct domain *d);
struct iommu_context *iommu_get_context(struct domain *d, u16 ctx_id);
void iommu_put_context(struct iommu_context *ctx);
@@ -496,6 +505,8 @@ DECLARE_PER_CPU(bool, iommu_dont_flush_iotlb);
extern struct spinlock iommu_pt_cleanup_lock;
extern struct page_list_head iommu_pt_cleanup_list;
+int arch_iommu_pviommu_init(struct domain *d, uint16_t nb_ctx, uint32_t arena_order);
+int arch_iommu_pviommu_teardown(struct domain *d);
bool arch_iommu_use_permitted(const struct domain *d);
#ifdef CONFIG_X86
Introduce the PV-IOMMU subsystem as defined in docs/designs/pv-iommu.md.

Signed-off-by: Teddy Astie <teddy.astie@vates.tech>
---
 xen/arch/x86/include/asm/iommu.h            |   3 +
 xen/common/Makefile                         |   1 +
 xen/common/pv-iommu.c                       | 536 ++++++++++++++++++++
 xen/drivers/passthrough/amd/pci_amd_iommu.c |  15 +
 xen/drivers/passthrough/iommu.c             | 105 ++++
 xen/drivers/passthrough/vtd/iommu.c         |   8 +
 xen/drivers/passthrough/x86/iommu.c         |  61 ++-
 xen/include/hypercall-defs.c                |   6 +
 xen/include/public/pv-iommu.h               | 343 +++++++++++++
 xen/include/public/xen.h                    |   1 +
 xen/include/xen/iommu.h                     |  11 +
 11 files changed, 1085 insertions(+), 5 deletions(-)
 create mode 100644 xen/common/pv-iommu.c
 create mode 100644 xen/include/public/pv-iommu.h