diff mbox series

[RFC,XEN,v3,5/5] xen/public: Introduce PV-IOMMU hypercall interface

Message ID f9ae663df38acc0d330b7582dfaa7ac199746aca.1720703078.git.teddy.astie@vates.tech (mailing list archive)
State New
Headers show
Series IOMMU subsystem redesign and PV-IOMMU interface | expand

Commit Message

Teddy Astie July 11, 2024, 2:04 p.m. UTC
Introduce a new pv interface to manage the underlying IOMMU and manage contexts
and devices. This interface allows creation of new contexts from Dom0 and
addition of IOMMU mappings using guest PoV.

This interface doesn't allow creation of mappings to other domains.

Signed-off-by: Teddy Astie <teddy.astie@vates.tech>
---
Changed in V2:
* formatting

Changed in V3:
* prevent IOMMU operations on dying contexts
---
 xen/common/Makefile           |   1 +
 xen/common/pv-iommu.c         | 328 ++++++++++++++++++++++++++++++++++
 xen/include/hypercall-defs.c  |   6 +
 xen/include/public/pv-iommu.h | 114 ++++++++++++
 xen/include/public/xen.h      |   1 +
 5 files changed, 450 insertions(+)
 create mode 100644 xen/common/pv-iommu.c
 create mode 100644 xen/include/public/pv-iommu.h

Comments

Alejandro Vallejo July 11, 2024, 7:20 p.m. UTC | #1
On Thu Jul 11, 2024 at 3:04 PM BST, Teddy Astie wrote:
> Introduce a new pv interface to manage the underlying IOMMU and manage contexts
> and devices. This interface allows creation of new contexts from Dom0 and
> addition of IOMMU mappings using guest PoV.
>
> This interface doesn't allow creation of mapping to other domains.
>
> Signed-off-by Teddy Astie <teddy.astie@vates.tech>
> ---
> Changed in V2:
> * formatting
>
> Changed in V3:
> * prevent IOMMU operations on dying contexts
> ---
>  xen/common/Makefile           |   1 +
>  xen/common/pv-iommu.c         | 328 ++++++++++++++++++++++++++++++++++
>  xen/include/hypercall-defs.c  |   6 +
>  xen/include/public/pv-iommu.h | 114 ++++++++++++
>  xen/include/public/xen.h      |   1 +
>  5 files changed, 450 insertions(+)
>  create mode 100644 xen/common/pv-iommu.c
>  create mode 100644 xen/include/public/pv-iommu.h
>
> diff --git a/xen/common/Makefile b/xen/common/Makefile
> index f12a474d40..52ada89888 100644
> --- a/xen/common/Makefile
> +++ b/xen/common/Makefile
> @@ -58,6 +58,7 @@ obj-y += wait.o
>  obj-bin-y += warning.init.o
>  obj-$(CONFIG_XENOPROF) += xenoprof.o
>  obj-y += xmalloc_tlsf.o
> +obj-y += pv-iommu.o
>  
>  obj-bin-$(CONFIG_X86) += $(foreach n,decompress bunzip2 unxz unlzma lzo unlzo unlz4 unzstd earlycpio,$(n).init.o)
>  
> diff --git a/xen/common/pv-iommu.c b/xen/common/pv-iommu.c
> new file mode 100644
> index 0000000000..a94c0f1e1a
> --- /dev/null
> +++ b/xen/common/pv-iommu.c
> @@ -0,0 +1,328 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * xen/common/pv_iommu.c
> + *
> + * PV-IOMMU hypercall interface.
> + */
> +
> +#include <xen/mm.h>
> +#include <xen/lib.h>
> +#include <xen/iommu.h>
> +#include <xen/sched.h>
> +#include <xen/pci.h>
> +#include <xen/guest_access.h>
> +#include <asm/p2m.h>
> +#include <asm/event.h>
> +#include <public/pv-iommu.h>
> +
> +#define PVIOMMU_PREFIX "[PV-IOMMU] "
> +
> +#define PVIOMMU_MAX_PAGES 256 /* Move to Kconfig ? */

It probably wants to be a cmdline argument, I think.

> +
> +/* Allowed masks for each sub-operation */
> +#define ALLOC_OP_FLAGS_MASK (0)
> +#define FREE_OP_FLAGS_MASK (IOMMU_TEARDOWN_REATTACH_DEFAULT)
> +
> +static int get_paged_frame(struct domain *d, gfn_t gfn, mfn_t *mfn,
> +                           struct page_info **page, int readonly)
> +{
> +    p2m_type_t p2mt;
> +
> +    *page = get_page_from_gfn(d, gfn_x(gfn), &p2mt,
> +                             (readonly) ? P2M_ALLOC : P2M_UNSHARE);
> +
> +    if ( !(*page) )
> +    {
> +        *mfn = INVALID_MFN;
> +        if ( p2m_is_shared(p2mt) )
> +            return -EINVAL;
> +        if ( p2m_is_paging(p2mt) )
> +        {
> +            p2m_mem_paging_populate(d, gfn);
> +            return -EIO;
> +        }
> +
> +        return -EPERM;

This is ambiguous with the other usage of EPERM.

> +    }
> +
> +    *mfn = page_to_mfn(*page);
> +
> +    return 0;
> +}
> +
> +static int can_use_iommu_check(struct domain *d)
> +{
> +    if ( !iommu_enabled )
> +    {
> +        printk(PVIOMMU_PREFIX "IOMMU is not enabled\n");
> +        return 0;
> +    }
> +
> +    if ( !is_hardware_domain(d) )
> +    {
> +        printk(PVIOMMU_PREFIX "Non-hardware domain\n");
> +        return 0;
> +    }
> +
> +    if ( !is_iommu_enabled(d) )
> +    {
> +        printk(PVIOMMU_PREFIX "IOMMU disabled for this domain\n");
> +        return 0;
> +    }
> +
> +    return 1;
> +}
> +
> +static long query_cap_op(struct pv_iommu_op *op, struct domain *d)
> +{
> +    op->cap.max_ctx_no = d->iommu.other_contexts.count;
> +    op->cap.max_nr_pages = PVIOMMU_MAX_PAGES;
> +    op->cap.max_iova_addr = (1LLU << 39) - 1; /* TODO: hardcoded 39-bits */
> +
> +    return 0;
> +}
> +
> +static long alloc_context_op(struct pv_iommu_op *op, struct domain *d)
> +{
> +    u16 ctx_no = 0;
> +    int status = 0;
> +
> +    status = iommu_context_alloc(d, &ctx_no, op->flags & ALLOC_OP_FLAGS_MASK);
> +
> +    if (status < 0)
> +        return status;
> +
> +    printk("Created context %hu\n", ctx_no);
> +
> +    op->ctx_no = ctx_no;
> +    return 0;
> +}
> +
> +static long free_context_op(struct pv_iommu_op *op, struct domain *d)
> +{
> +    return iommu_context_free(d, op->ctx_no,
> +                              IOMMU_TEARDOWN_PREEMPT | (op->flags & FREE_OP_FLAGS_MASK));
> +}
> +
> +static long reattach_device_op(struct pv_iommu_op *op, struct domain *d)
> +{
> +    struct physdev_pci_device dev = op->reattach_device.dev;
> +    device_t *pdev;
> +
> +    pdev = pci_get_pdev(d, PCI_SBDF(dev.seg, dev.bus, dev.devfn));
> +
> +    if ( !pdev )
> +        return -ENOENT;
> +
> +    return iommu_reattach_context(d, d, pdev, op->ctx_no);
> +}
> +
> +static long map_pages_op(struct pv_iommu_op *op, struct domain *d)
> +{
> +    int ret = 0, flush_ret;
> +    struct page_info *page = NULL;
> +    mfn_t mfn;
> +    unsigned int flags;
> +    unsigned int flush_flags = 0;
> +    size_t i = 0;
> +
> +    if ( op->map_pages.nr_pages > PVIOMMU_MAX_PAGES )
> +        return -E2BIG;
> +
> +    if ( !iommu_check_context(d, op->ctx_no) )
> +        return -EINVAL;
> +
> +    //printk("Mapping gfn:%lx-%lx to dfn:%lx-%lx on %hu\n",
> +    //       op->map_pages.gfn, op->map_pages.gfn + op->map_pages.nr_pages - 1,
> +    //       op->map_pages.dfn, op->map_pages.dfn + op->map_pages.nr_pages - 1,
> +    //       op->ctx_no);
> +
> +    flags = 0;
> +
> +    if ( op->flags & IOMMU_OP_readable )
> +        flags |= IOMMUF_readable;
> +
> +    if ( op->flags & IOMMU_OP_writeable )
> +        flags |= IOMMUF_writable;
> +
> +    spin_lock(&dom_iommu(d)->lock);
> +
> +    for (i = 0; i < op->map_pages.nr_pages; i++)

This loop (and the unmap one) want to be bound by MIN(nr_pages, FOO), where  FOO
is some upper bound to the number of hypercalls we're willing to do in a single
continuation. If case of reaching that high number, we should create a new
continuation and allow something else to run.

> +    {
> +        gfn_t gfn = _gfn(op->map_pages.gfn + i);
> +        dfn_t dfn = _dfn(op->map_pages.dfn + i);
> +
> +        /* Lookup pages struct backing gfn */
> +        ret = get_paged_frame(d, gfn, &mfn, &page, 0);
> +
> +        if ( ret )
> +            break;
> +
> +        /* Check for conflict with existing mappings */
> +        if ( !_iommu_lookup_page(d, dfn, &mfn, &flags, op->ctx_no) )
> +        {
> +            put_page(page);
> +            ret = -EADDRINUSE;
> +            break;
> +        }
> +
> +        ret = _iommu_map(d, dfn, mfn, 1, flags, &flush_flags, op->ctx_no);
> +
> +        if ( ret )
> +            break;
> +    }
> +
> +    flush_ret = _iommu_iotlb_flush(d, _dfn(op->map_pages.dfn),
> +                                   op->map_pages.nr_pages, flush_flags,
> +                                   op->ctx_no);
> +
> +    spin_unlock(&dom_iommu(d)->lock);
> +
> +    op->map_pages.mapped = i;
> +
> +    if ( flush_ret )
> +        printk("Flush operation failed (%d)\n", flush_ret);

I haven't looked at _iommu_iotlb_flush(), but a printk isn't good enough here.
We want to set up a continuation to retry later, I think.

What might cause the error?

> +
> +    return ret;
> +}
> +
> +static long unmap_pages_op(struct pv_iommu_op *op, struct domain *d)
> +{
> +    mfn_t mfn;
> +    int ret = 0, flush_ret;
> +    unsigned int flags;
> +    unsigned int flush_flags = 0;
> +    size_t i = 0;
> +
> +    if ( op->unmap_pages.nr_pages > PVIOMMU_MAX_PAGES )
> +        return -E2BIG;
> +
> +    if ( !iommu_check_context(d, op->ctx_no) )
> +        return -EINVAL;
> +
> +    //printk("Unmapping dfn:%lx-%lx on %hu\n",
> +    //       op->unmap_pages.dfn, op->unmap_pages.dfn + op->unmap_pages.nr_pages - 1,
> +    //       op->ctx_no);
> +
> +    spin_lock(&dom_iommu(d)->lock);
> +
> +    for (i = 0; i < op->unmap_pages.nr_pages; i++)
> +    {
> +        dfn_t dfn = _dfn(op->unmap_pages.dfn + i);
> +
> +        /* Check if there is a valid mapping for this domain */
> +        if ( _iommu_lookup_page(d, dfn, &mfn, &flags, op->ctx_no) ) {
> +            ret = -ENOENT;
> +            break;
> +        }
> +
> +        ret = _iommu_unmap(d, dfn, 1, 0, &flush_flags, op->ctx_no);
> +
> +        if (ret)
> +            break;
> +
> +        /* Decrement reference counter */
> +        put_page(mfn_to_page(mfn));
> +    }
> +
> +    flush_ret = _iommu_iotlb_flush(d, _dfn(op->unmap_pages.dfn),
> +                                   op->unmap_pages.nr_pages, flush_flags,
> +                                   op->ctx_no);
> +
> +    spin_unlock(&dom_iommu(d)->lock);
> +
> +    op->unmap_pages.unmapped = i;
> +
> +    if ( flush_ret )
> +        printk("Flush operation failed (%d)\n", flush_ret);

Same as above.

> +
> +    return ret;
> +}
> +
> +static long lookup_page_op(struct pv_iommu_op *op, struct domain *d)
> +{
> +    mfn_t mfn;
> +    gfn_t gfn;
> +    unsigned int flags = 0;
> +
> +    if ( !iommu_check_context(d, op->ctx_no) )
> +        return -EINVAL;
> +
> +    /* Check if there is a valid BFN mapping for this domain */
> +    if ( iommu_lookup_page(d, _dfn(op->lookup_page.dfn), &mfn, &flags, op->ctx_no) )
> +        return -ENOENT;
> +
> +    gfn = mfn_to_gfn(d, mfn);
> +    BUG_ON(gfn_eq(gfn, INVALID_GFN));
> +
> +    op->lookup_page.gfn = gfn_x(gfn);
> +
> +    return 0;
> +}
> +
> +long do_iommu_sub_op(struct pv_iommu_op *op)
> +{
> +    struct domain *d = current->domain;
> +
> +    if ( !can_use_iommu_check(d) )
> +        return -EPERM;

This checks should be split, imo.

```
    if ( !iommu_enabled || !is_iommu_enabled(d) )
        return -ENOTSUP;
    if ( !is_hardware_domain(d) )
        return -EPERM;
```

For the second, we probably want an XSM hook as well (or instead of, not sure).

> +
> +    switch ( op->subop_id )
> +    {
> +        case 0:
> +            return 0;

IOMMUOP_noop?

> +
> +        case IOMMUOP_query_capabilities:
> +            return query_cap_op(op, d);
> +
> +        case IOMMUOP_alloc_context:
> +            return alloc_context_op(op, d);
> +
> +        case IOMMUOP_free_context:
> +            return free_context_op(op, d);
> +
> +        case IOMMUOP_reattach_device:
> +            return reattach_device_op(op, d);
> +
> +        case IOMMUOP_map_pages:
> +            return map_pages_op(op, d);
> +
> +        case IOMMUOP_unmap_pages:
> +            return unmap_pages_op(op, d);
> +
> +        case IOMMUOP_lookup_page:
> +            return lookup_page_op(op, d);
> +
> +        default:
> +            return -EINVAL;
> +    }
> +}
> +
> +long do_iommu_op(XEN_GUEST_HANDLE_PARAM(void) arg)
> +{
> +    long ret = 0;
> +    struct pv_iommu_op op;
> +
> +    if ( unlikely(copy_from_guest(&op, arg, 1)) )
> +        return -EFAULT;
> +
> +    ret = do_iommu_sub_op(&op);
> +
> +    if ( ret == -ERESTART )
> +        return hypercall_create_continuation(__HYPERVISOR_iommu_op, "h", arg);
> +
> +    if ( unlikely(copy_to_guest(arg, &op, 1)) )
> +        return -EFAULT;
> +
> +    return ret;
> +}
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-file-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff --git a/xen/include/hypercall-defs.c b/xen/include/hypercall-defs.c
> index 47c093acc8..4ba4480867 100644
> --- a/xen/include/hypercall-defs.c
> +++ b/xen/include/hypercall-defs.c
> @@ -209,6 +209,9 @@ hypfs_op(unsigned int cmd, const char *arg1, unsigned long arg2, void *arg3, uns
>  #ifdef CONFIG_X86
>  xenpmu_op(unsigned int op, xen_pmu_params_t *arg)
>  #endif
> +#ifdef CONFIG_HAS_PASSTHROUGH
> +iommu_op(void *arg)
> +#endif
>  
>  #ifdef CONFIG_PV
>  caller: pv64
> @@ -295,5 +298,8 @@ mca                                do       do       -        -        -
>  #ifndef CONFIG_PV_SHIM_EXCLUSIVE
>  paging_domctl_cont                 do       do       do       do       -
>  #endif
> +#ifdef CONFIG_HAS_PASSTHROUGH
> +iommu_op                           do       do       do       do       -
> +#endif
>  
>  #endif /* !CPPCHECK */
> diff --git a/xen/include/public/pv-iommu.h b/xen/include/public/pv-iommu.h
> new file mode 100644
> index 0000000000..45f9c44eb1
> --- /dev/null
> +++ b/xen/include/public/pv-iommu.h
> @@ -0,0 +1,114 @@
> +/* SPDX-License-Identifier: MIT */
> +/******************************************************************************
> + * pv-iommu.h
> + *
> + * Paravirtualized IOMMU driver interface.
> + *
> + * Copyright (c) 2024 Teddy Astie <teddy.astie@vates.tech>
> + */
> +
> +#ifndef __XEN_PUBLIC_PV_IOMMU_H__
> +#define __XEN_PUBLIC_PV_IOMMU_H__
> +
> +#include "xen.h"
> +#include "physdev.h"
> +
> +#define IOMMU_DEFAULT_CONTEXT (0)
> +

From the code, seems like "#define IOMMUOP_noop 0" is missing

> +/**
> + * Query PV-IOMMU capabilities for this domain.
> + */
> +#define IOMMUOP_query_capabilities    1
> +
> +/**
> + * Allocate an IOMMU context, the new context handle will be written to ctx_no.
> + */
> +#define IOMMUOP_alloc_context         2

Please, don't write to the header. It's an absolute pain to Rust-ify and highly
unexpected when absolutely everything else will take that as an input parameter.

Provide the output as a dedicated OUT field in the alloc_context struct
instead.

> +
> +/**
> + * Destroy a IOMMU context.
> + * All devices attached to this context are reattached to default context.
> + *
> + * The default context can't be destroyed (0).
> + */
> +#define IOMMUOP_free_context          3
> +
> +/**
> + * Reattach the device to IOMMU context.
> + */
> +#define IOMMUOP_reattach_device       4
> +
> +#define IOMMUOP_map_pages             5
> +#define IOMMUOP_unmap_pages           6
> +
> +/**
> + * Get the GFN associated to a specific DFN.
> + */
> +#define IOMMUOP_lookup_page           7
> +
> +struct pv_iommu_op {
> +    uint16_t subop_id;
> +    uint16_t ctx_no;

Seeing how there's no polymorphism going on, why not put these on each struct?
Then each handler can take a pointer to each substruct and everything is a lot
safer.

> +
> +/**
> + * Create a context that is cloned from default.
> + * The new context will be populated with 1:1 mappings covering the entire guest memory.
> + */
> +#define IOMMU_CREATE_clone (1 << 0)
> +
> +#define IOMMU_OP_readable (1 << 0)
> +#define IOMMU_OP_writeable (1 << 1)
> +    uint32_t flags;

This bitmap is used for different things in different subops, which highly
suggests it should be per-struct instead.

> +
> +    union {
> +        struct {

Can you move these out to be first-class structs rather than anon? In pretty
much the same way of the other major hypercall groups. I'd like to be able to
generate them from IDL like I intend to do with all others.

> +            uint64_t gfn;
> +            uint64_t dfn;

(a) I'd rather see descriptions in the missing gfn/dfn in various fields, and
more specifically IN/OUT tags everywhere (i.e: as in /* IN: Number of pages to
map */).

(b) Any 64bit numbers in the external ABI must be uint64_aligned_t so the
alignment is 8 on 32bit architectures. I know this instance (and the others)
happen to be aligned in both cases, but still.

That said, either these are 32bit frame numbers or otherwise we're better off
using addresses instead.  It'd also be less confusing on systems with several
page sizes, and it'd be specially less confusing if an additional parameter of
"order" or "page size" was added here.

> +            /* Number of pages to map */
> +            uint32_t nr_pages;
> +            /* Number of pages actually mapped after sub-op */
> +            uint32_t mapped;

A more helpful output might some kind of err_addr with the offending page. Then
the caller doesn't have to do arithmetic to print the error somewhere.

Furthermore, seeing how this is something that should not happen I'd be tempted
to amend the spec to somehow roll back whatever we just did, but that could be
tricky with big numbers in `nr_pages`.

> +        } map_pages;
> +
> +        struct {
> +            uint64_t dfn;
> +            /* Number of pages to unmap */
> +            uint32_t nr_pages;
> +            /* Number of pages actually unmapped after sub-op */
> +            uint32_t unmapped;
> +        } unmap_pages;
> +
> +        struct {
> +            struct physdev_pci_device dev;
> +        } reattach_device;
> +
> +        struct {
> +            uint64_t gfn;
> +            uint64_t dfn;
> +        } lookup_page;
> +
> +        struct {
> +            /* Maximum number of IOMMU context this domain can use. */
> +            uint16_t max_ctx_no;
> +            /* Maximum number of pages that can be modified in a single map/unmap operation. */
> +            uint32_t max_nr_pages;
> +            /* Maximum device address (iova) that the guest can use for mappings. */
> +            uint64_t max_iova_addr;
> +        } cap;
> +    };
> +};
> +
> +typedef struct pv_iommu_op pv_iommu_op_t;
> +DEFINE_XEN_GUEST_HANDLE(pv_iommu_op_t);
> +
> +#endif
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-file-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> \ No newline at end of file
> diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
> index b47d48d0e2..28ab815ebc 100644
> --- a/xen/include/public/xen.h
> +++ b/xen/include/public/xen.h
> @@ -118,6 +118,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_ulong_t);
>  #define __HYPERVISOR_xenpmu_op            40
>  #define __HYPERVISOR_dm_op                41
>  #define __HYPERVISOR_hypfs_op             42
> +#define __HYPERVISOR_iommu_op             43
>  
>  /* Architecture-specific hypercall definitions. */
>  #define __HYPERVISOR_arch_0               48
Teddy Astie July 12, 2024, 10:27 a.m. UTC | #2
Hello Alejandro,

>> +#define PVIOMMU_MAX_PAGES 256 /* Move to Kconfig ? */
> 
> It probably wants to be a cmdline argument, I think.
> 

Maybe, but in that sense, I think it should be a domain-specific limit 
rather than a global one. (e.g ability to prevent a DomU from having a 
very permissive limit).

>> +
>> +/* Allowed masks for each sub-operation */
>> +#define ALLOC_OP_FLAGS_MASK (0)
>> +#define FREE_OP_FLAGS_MASK (IOMMU_TEARDOWN_REATTACH_DEFAULT)
>> +
>> +static int get_paged_frame(struct domain *d, gfn_t gfn, mfn_t *mfn,
>> +                           struct page_info **page, int readonly)
>> +{
>> +    p2m_type_t p2mt;
>> +
>> +    *page = get_page_from_gfn(d, gfn_x(gfn), &p2mt,
>> +                             (readonly) ? P2M_ALLOC : P2M_UNSHARE);
>> +
>> +    if ( !(*page) )
>> +    {
>> +        *mfn = INVALID_MFN;
>> +        if ( p2m_is_shared(p2mt) )
>> +            return -EINVAL;
>> +        if ( p2m_is_paging(p2mt) )
>> +        {
>> +            p2m_mem_paging_populate(d, gfn);
>> +            return -EIO;
>> +        }
>> +
>> +        return -EPERM;
> 
> This is ambiguous with the other usage of EPERM.
> 

Yes, I think this part should be reviewed, because it may have 
implications with e.g. DMA to grant mappings (not sure how it should be 
handled though). I am not an expert on Xen memory management, though.

>> +static long map_pages_op(struct pv_iommu_op *op, struct domain *d)
>> +{
>> +    int ret = 0, flush_ret;
>> +    struct page_info *page = NULL;
>> +    mfn_t mfn;
>> +    unsigned int flags;
>> +    unsigned int flush_flags = 0;
>> +    size_t i = 0;
>> +
>> +    if ( op->map_pages.nr_pages > PVIOMMU_MAX_PAGES )
>> +        return -E2BIG;
>> +
>> +    if ( !iommu_check_context(d, op->ctx_no) )
>> +        return -EINVAL;
>> +
>> +    //printk("Mapping gfn:%lx-%lx to dfn:%lx-%lx on %hu\n",
>> +    //       op->map_pages.gfn, op->map_pages.gfn + op->map_pages.nr_pages - 1,
>> +    //       op->map_pages.dfn, op->map_pages.dfn + op->map_pages.nr_pages - 1,
>> +    //       op->ctx_no);
>> +
>> +    flags = 0;
>> +
>> +    if ( op->flags & IOMMU_OP_readable )
>> +        flags |= IOMMUF_readable;
>> +
>> +    if ( op->flags & IOMMU_OP_writeable )
>> +        flags |= IOMMUF_writable;
>> +
>> +    spin_lock(&dom_iommu(d)->lock);
>> +
>> +    for (i = 0; i < op->map_pages.nr_pages; i++)
> 
> This loop (and the unmap one) want to be bound by MIN(nr_pages, FOO), where  FOO
> is some upper bound to the number of hypercalls we're willing to do in a single
> continuation. If case of reaching that high number, we should create a new
> continuation and allow something else to run.
> 

That can be a good idea, although, it needs some care :
- we need to make sure that when we preempt, the state is well defined 
(e.g we need to do a iotlb_flush before preempting into a continuation)
- the hypercall result should account for all pages mapped in the 
hypercall and all continuations, and stop in case of error; reporting 
how much pages got actually mapped at the end

>> +    {
>> +        gfn_t gfn = _gfn(op->map_pages.gfn + i);
>> +        dfn_t dfn = _dfn(op->map_pages.dfn + i);
>> +
>> +        /* Lookup pages struct backing gfn */
>> +        ret = get_paged_frame(d, gfn, &mfn, &page, 0);
>> +
>> +        if ( ret )
>> +            break;
>> +
>> +        /* Check for conflict with existing mappings */
>> +        if ( !_iommu_lookup_page(d, dfn, &mfn, &flags, op->ctx_no) )
>> +        {
>> +            put_page(page);
>> +            ret = -EADDRINUSE;
>> +            break;
>> +        }
>> +
>> +        ret = _iommu_map(d, dfn, mfn, 1, flags, &flush_flags, op->ctx_no);
>> +
>> +        if ( ret )
>> +            break;
>> +    }
>> +
>> +    flush_ret = _iommu_iotlb_flush(d, _dfn(op->map_pages.dfn),
>> +                                   op->map_pages.nr_pages, flush_flags,
>> +                                   op->ctx_no);
>> +
>> +    spin_unlock(&dom_iommu(d)->lock);
>> +
>> +    op->map_pages.mapped = i;
>> +
>> +    if ( flush_ret )
>> +        printk("Flush operation failed (%d)\n", flush_ret);
> 
> I haven't looked at _iommu_iotlb_flush(), but a printk isn't good enough here.
> We want to set up a continuation to retry later, I think.
> 
> What might cause the error?
> 

I would say it's mostly implementation-dependent (with x86 IOMMU 
drivers, iotlb_flush cannot fail though). The only other situation where 
_iommu_iotlb_flush can fail is when trying to do map/unmap on an IOMMU 
context that is being destroyed asynchronously (dying state). It makes 
all operations fail immediately with -EINVAL.
>> +
>> +    return ret;
>> +}
>> +
>> +static long lookup_page_op(struct pv_iommu_op *op, struct domain *d)
>> +{
>> +    mfn_t mfn;
>> +    gfn_t gfn;
>> +    unsigned int flags = 0;
>> +
>> +    if ( !iommu_check_context(d, op->ctx_no) )
>> +        return -EINVAL;
>> +
>> +    /* Check if there is a valid BFN mapping for this domain */
>> +    if ( iommu_lookup_page(d, _dfn(op->lookup_page.dfn), &mfn, &flags, op->ctx_no) )
>> +        return -ENOENT;
>> +
>> +    gfn = mfn_to_gfn(d, mfn);
>> +    BUG_ON(gfn_eq(gfn, INVALID_GFN));
>> +
>> +    op->lookup_page.gfn = gfn_x(gfn);
>> +
>> +    return 0;
>> +}
>> +
>> +long do_iommu_sub_op(struct pv_iommu_op *op)
>> +{
>> +    struct domain *d = current->domain;
>> +
>> +    if ( !can_use_iommu_check(d) )
>> +        return -EPERM;
> 
> This checks should be split, imo.
> 
> ```
>      if ( !iommu_enabled || !is_iommu_enabled(d) )
>          return -ENOTSUP;
>      if ( !is_hardware_domain(d) )
>          return -EPERM;
> ```
> 
> For the second, we probably want an XSM hook as well (or instead of, not sure).
> 

Yes (or at least, a way to know if the domain has ability to use 
PV-IOMMU and has devices for it).

>> +
>> +    switch ( op->subop_id )
>> +    {
>> +        case 0:
>> +            return 0;
> 
> IOMMUOP_noop?
> 

Could be a good fit

>> +/**
>> + * Query PV-IOMMU capabilities for this domain.
>> + */
>> +#define IOMMUOP_query_capabilities    1
>> +
>> +/**
>> + * Allocate an IOMMU context, the new context handle will be written to ctx_no.
>> + */
>> +#define IOMMUOP_alloc_context         2
> 
> Please, don't write to the header. It's an absolute pain to Rust-ify and highly
> unexpected when absolutely everything else will take that as an input parameter.
> 
> Provide the output as a dedicated OUT field in the alloc_context struct
> instead.
> 

Yes, as there may be cases where the current approach gets confusing 
(cloning context).

>> +
>> +/**
>> + * Destroy a IOMMU context.
>> + * All devices attached to this context are reattached to default context.
>> + *
>> + * The default context can't be destroyed (0).
>> + */
>> +#define IOMMUOP_free_context          3
>> +
>> +/**
>> + * Reattach the device to IOMMU context.
>> + */
>> +#define IOMMUOP_reattach_device       4
>> +
>> +#define IOMMUOP_map_pages             5
>> +#define IOMMUOP_unmap_pages           6
>> +
>> +/**
>> + * Get the GFN associated to a specific DFN.
>> + */
>> +#define IOMMUOP_lookup_page           7
>> +
>> +struct pv_iommu_op {
>> +    uint16_t subop_id;
>> +    uint16_t ctx_no;
> 
> Seeing how there's no polymorphism going on, why not put these on each struct?
> Then each handler can take a pointer to each substruct and everything is a lot
> safer.
> 

Do you mean something like

struct pv_iommu_op {
   uint16_t subop_id;
   uint16_t ctx_no; /* IN */
   void *subop_struct;
};

?

>> +
>> +/**
>> + * Create a context that is cloned from default.
>> + * The new context will be populated with 1:1 mappings covering the entire guest memory.
>> + */
>> +#define IOMMU_CREATE_clone (1 << 0)
>> +
>> +#define IOMMU_OP_readable (1 << 0)
>> +#define IOMMU_OP_writeable (1 << 1)
>> +    uint32_t flags;
> 
> This bitmap is used for different things in different subops, which highly
> suggests it should be per-struct instead.
> 

Having a flags field in each subop_struct ?

>> +
>> +    union {
>> +        struct {
> 
> Can you move these out to be first-class structs rather than anon? In pretty
> much the same way of the other major hypercall groups. I'd like to be able to
> generate them from IDL like I intend to do with all others.
> 

Yes

>> +            uint64_t gfn;
>> +            uint64_t dfn;
> 
> (a) I'd rather see descriptions in the missing gfn/dfn in various fields, and
> more specifically IN/OUT tags everywhere (i.e: as in /* IN: Number of pages to
> map */).
> 
> (b) Any 64bit numbers in the external ABI must be uint64_aligned_t so the
> alignment is 8 on 32bit architectures. I know this instance (and the others)
> happen to be aligned in both cases, but still.
> 

Makes sense

> That said, either these are 32bit frame numbers or otherwise we're better off
> using addresses instead.  It'd also be less confusing on systems with several
> page sizes, and it'd be specially less confusing if an additional parameter of
> "order" or "page size" was added here.
> 

It's something pretty similar to what Linux expects, and it is in fact 
the guest's responsibility to ensure that addresses are aligned to the 
page size and that the page size is actually supported.

While guest_addr (replacement for gfn) could be a good fit for phys_addr 
or something similar. I would keep dfn as uint64_t as it's perfectly 
valid to be able to map on >4G areas on a 32-bits platform.
But we want to make sure that all the guest memory can be represented as 
a phys_addr (e.g 32-bits addr may not be enough for a i686 guest due to 
PAE).

>> +            /* Number of pages to map */
>> +            uint32_t nr_pages;
>> +            /* Number of pages actually mapped after sub-op */
>> +            uint32_t mapped;
> 
> A more helpful output might some kind of err_addr with the offending page. Then
> the caller doesn't have to do arithmetic to print the error somewhere.
> 

This value is used as it is the kind of result Linux wants for IOMMU. 
Linux has a return value which is the number of pages actually mapped 
(if it's less than expected, it will retry though and if it is zero, it 
assumes it's a failure).

> Furthermore, seeing how this is something that should not happen I'd be tempted
> to amend the spec to somehow roll back whatever we just did, but that could be
> tricky with big numbers in `nr_pages`.
> 

Being able to roll back may be tricky as it requires keeping the 
previous state. For instance, when doing unmap, you practically need to 
back up parts of the pagetable or do CoW on it, so you can revert in 
case something goes wrong.
Otherwise, all modifications made directly on the pagetable may be 
immediately effective if those entries are not cached by IOTLB.

That's possible, but needs a more sophisticated management logic of page 
tables at first.


---

I will try to do some changes on this interface with some eventual new 
features (even if unimplemented yet) and some of your proposals and 
publish it as an independent RFC.

Teddy


Teddy Astie | Vates XCP-ng Intern

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
Jan Beulich July 12, 2024, 10:46 a.m. UTC | #3
On 11.07.2024 21:20, Alejandro Vallejo wrote:
> On Thu Jul 11, 2024 at 3:04 PM BST, Teddy Astie wrote:
>> --- /dev/null
>> +++ b/xen/common/pv-iommu.c
>> @@ -0,0 +1,328 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * xen/common/pv_iommu.c
>> + *
>> + * PV-IOMMU hypercall interface.
>> + */
>> +
>> +#include <xen/mm.h>
>> +#include <xen/lib.h>
>> +#include <xen/iommu.h>
>> +#include <xen/sched.h>
>> +#include <xen/pci.h>
>> +#include <xen/guest_access.h>
>> +#include <asm/p2m.h>
>> +#include <asm/event.h>
>> +#include <public/pv-iommu.h>
>> +
>> +#define PVIOMMU_PREFIX "[PV-IOMMU] "
>> +
>> +#define PVIOMMU_MAX_PAGES 256 /* Move to Kconfig ? */
> 
> It probably wants to be a cmdline argument, I think.

For Dom0. For DomU-s it wants to be a guest config setting, I suppose. Then
again I wonder if I understand the purpose of this correctly: The number looks
surprisingly small if it was something the guest may use for arranging its
mappings.

Jan
Teddy Astie July 12, 2024, 12:46 p.m. UTC | #4
Hello Jan,

Le 12/07/2024 à 12:46, Jan Beulich a écrit :
> On 11.07.2024 21:20, Alejandro Vallejo wrote:
>> On Thu Jul 11, 2024 at 3:04 PM BST, Teddy Astie wrote:
>>> --- /dev/null
>>> +++ b/xen/common/pv-iommu.c
>>> @@ -0,0 +1,328 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +/*
>>> + * xen/common/pv_iommu.c
>>> + *
>>> + * PV-IOMMU hypercall interface.
>>> + */
>>> +
>>> +#include <xen/mm.h>
>>> +#include <xen/lib.h>
>>> +#include <xen/iommu.h>
>>> +#include <xen/sched.h>
>>> +#include <xen/pci.h>
>>> +#include <xen/guest_access.h>
>>> +#include <asm/p2m.h>
>>> +#include <asm/event.h>
>>> +#include <public/pv-iommu.h>
>>> +
>>> +#define PVIOMMU_PREFIX "[PV-IOMMU] "
>>> +
>>> +#define PVIOMMU_MAX_PAGES 256 /* Move to Kconfig ? */
>>
>> It probably wants to be a cmdline argument, I think.
> 
> For Dom0. For DomU-s it wants to be a guest config setting, I suppose. Then
> again I wonder if I understand the purpose of this correctly: The number looks
> surprisingly small if it was something the guest may use for arranging its
> mappings.
> 
> Jan

Makes sense to be a guest setting for DomUs. I don't think this limit is 
too small: it means that we can map up to 1 MiB of contiguous memory in 
a single hypercall, and in the guest case (e.g. Linux), it very rarely 
goes beyond this limit.

I put this limit to prevent the guest from trying to map millions of 
pages, which is going to take a while (and may cause stability issues). 
And to actually give a chance for Xen to preempt the guest (and keep the 
ability to shut it down between 2 hypercalls).

Teddy


Teddy Astie | Vates XCP-ng Intern

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
Jan Beulich July 12, 2024, 12:55 p.m. UTC | #5
On 12.07.2024 14:46, Teddy Astie wrote:
> Hello Jan,
> 
> Le 12/07/2024 à 12:46, Jan Beulich a écrit :
>> On 11.07.2024 21:20, Alejandro Vallejo wrote:
>>> On Thu Jul 11, 2024 at 3:04 PM BST, Teddy Astie wrote:
>>>> --- /dev/null
>>>> +++ b/xen/common/pv-iommu.c
>>>> @@ -0,0 +1,328 @@
>>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>>> +/*
>>>> + * xen/common/pv_iommu.c
>>>> + *
>>>> + * PV-IOMMU hypercall interface.
>>>> + */
>>>> +
>>>> +#include <xen/mm.h>
>>>> +#include <xen/lib.h>
>>>> +#include <xen/iommu.h>
>>>> +#include <xen/sched.h>
>>>> +#include <xen/pci.h>
>>>> +#include <xen/guest_access.h>
>>>> +#include <asm/p2m.h>
>>>> +#include <asm/event.h>
>>>> +#include <public/pv-iommu.h>
>>>> +
>>>> +#define PVIOMMU_PREFIX "[PV-IOMMU] "
>>>> +
>>>> +#define PVIOMMU_MAX_PAGES 256 /* Move to Kconfig ? */
>>>
>>> It probably wants to be a cmdline argument, I think.
>>
>> For Dom0. For DomU-s it wants to be a guest config setting, I suppose. Then
>> again I wonder if I understand the purpose of this correctly: The number looks
>> surprisingly small if it was something the guest may use for arranging its
>> mappings.
> 
> Makes sense to be a guest setting for DomUs. I don't think this limit is 
> too small, actually it means that we can can map up to 1M of contiguous 
> memory in a single hypercall, in the guest case (e.g Linux), it very 
> rarely goes beyond this limit.
> 
> I put this limit to prevent the guest from trying to map millions of 
> pages, which is going to take a while (and may cause stability issues). 
> And to actually give a chance for Xen to preempt the guest (and keep the 
> ability to shut it down between 2 hypercalls).

Oh, this is a per-hypercall limit. Then the identifier is misleading.
Plus I don't see why bigger batches would need rejecting. They only need
breaking up using hypercall continuations.

Jan
diff mbox series

Patch

diff --git a/xen/common/Makefile b/xen/common/Makefile
index f12a474d40..52ada89888 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -58,6 +58,7 @@  obj-y += wait.o
 obj-bin-y += warning.init.o
 obj-$(CONFIG_XENOPROF) += xenoprof.o
 obj-y += xmalloc_tlsf.o
+obj-y += pv-iommu.o
 
 obj-bin-$(CONFIG_X86) += $(foreach n,decompress bunzip2 unxz unlzma lzo unlzo unlz4 unzstd earlycpio,$(n).init.o)
 
diff --git a/xen/common/pv-iommu.c b/xen/common/pv-iommu.c
new file mode 100644
index 0000000000..a94c0f1e1a
--- /dev/null
+++ b/xen/common/pv-iommu.c
@@ -0,0 +1,328 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * xen/common/pv_iommu.c
+ *
+ * PV-IOMMU hypercall interface.
+ */
+
+#include <xen/mm.h>
+#include <xen/lib.h>
+#include <xen/iommu.h>
+#include <xen/sched.h>
+#include <xen/pci.h>
+#include <xen/guest_access.h>
+#include <asm/p2m.h>
+#include <asm/event.h>
+#include <public/pv-iommu.h>
+
+#define PVIOMMU_PREFIX "[PV-IOMMU] "
+
+#define PVIOMMU_MAX_PAGES 256 /* Move to Kconfig ? */
+
+/* Allowed masks for each sub-operation */
+#define ALLOC_OP_FLAGS_MASK (0)
+#define FREE_OP_FLAGS_MASK (IOMMU_TEARDOWN_REATTACH_DEFAULT)
+
+static int get_paged_frame(struct domain *d, gfn_t gfn, mfn_t *mfn,
+                           struct page_info **page, int readonly)
+{
+    p2m_type_t p2mt;
+
+    *page = get_page_from_gfn(d, gfn_x(gfn), &p2mt,
+                             (readonly) ? P2M_ALLOC : P2M_UNSHARE);
+
+    if ( !(*page) )
+    {
+        *mfn = INVALID_MFN;
+        if ( p2m_is_shared(p2mt) )
+            return -EINVAL;
+        if ( p2m_is_paging(p2mt) )
+        {
+            p2m_mem_paging_populate(d, gfn);
+            return -EIO;
+        }
+
+        return -EPERM;
+    }
+
+    *mfn = page_to_mfn(*page);
+
+    return 0;
+}
+
+static int can_use_iommu_check(struct domain *d)
+{
+    if ( !iommu_enabled )
+    {
+        printk(PVIOMMU_PREFIX "IOMMU is not enabled\n");
+        return 0;
+    }
+
+    if ( !is_hardware_domain(d) )
+    {
+        printk(PVIOMMU_PREFIX "Non-hardware domain\n");
+        return 0;
+    }
+
+    if ( !is_iommu_enabled(d) )
+    {
+        printk(PVIOMMU_PREFIX "IOMMU disabled for this domain\n");
+        return 0;
+    }
+
+    return 1;
+}
+
+static long query_cap_op(struct pv_iommu_op *op, struct domain *d)
+{
+    op->cap.max_ctx_no = d->iommu.other_contexts.count;
+    op->cap.max_nr_pages = PVIOMMU_MAX_PAGES;
+    op->cap.max_iova_addr = (1LLU << 39) - 1; /* TODO: hardcoded 39-bits */
+
+    return 0;
+}
+
+static long alloc_context_op(struct pv_iommu_op *op, struct domain *d)
+{
+    u16 ctx_no = 0;
+    int status = 0;
+
+    status = iommu_context_alloc(d, &ctx_no, op->flags & ALLOC_OP_FLAGS_MASK);
+
+    if (status < 0)
+        return status;
+
+    printk("Created context %hu\n", ctx_no);
+
+    op->ctx_no = ctx_no;
+    return 0;
+}
+
+static long free_context_op(struct pv_iommu_op *op, struct domain *d)
+{
+    return iommu_context_free(d, op->ctx_no,
+                              IOMMU_TEARDOWN_PREEMPT | (op->flags & FREE_OP_FLAGS_MASK));
+}
+
+static long reattach_device_op(struct pv_iommu_op *op, struct domain *d)
+{
+    struct physdev_pci_device dev = op->reattach_device.dev;
+    device_t *pdev;
+
+    pdev = pci_get_pdev(d, PCI_SBDF(dev.seg, dev.bus, dev.devfn));
+
+    if ( !pdev )
+        return -ENOENT;
+
+    return iommu_reattach_context(d, d, pdev, op->ctx_no);
+}
+
+static long map_pages_op(struct pv_iommu_op *op, struct domain *d)
+{
+    int ret = 0, flush_ret;
+    struct page_info *page = NULL;
+    mfn_t mfn;
+    unsigned int flags;
+    unsigned int flush_flags = 0;
+    size_t i = 0;
+
+    if ( op->map_pages.nr_pages > PVIOMMU_MAX_PAGES )
+        return -E2BIG;
+
+    if ( !iommu_check_context(d, op->ctx_no) )
+        return -EINVAL;
+
+    //printk("Mapping gfn:%lx-%lx to dfn:%lx-%lx on %hu\n",
+    //       op->map_pages.gfn, op->map_pages.gfn + op->map_pages.nr_pages - 1,
+    //       op->map_pages.dfn, op->map_pages.dfn + op->map_pages.nr_pages - 1,
+    //       op->ctx_no);
+
+    flags = 0;
+
+    if ( op->flags & IOMMU_OP_readable )
+        flags |= IOMMUF_readable;
+
+    if ( op->flags & IOMMU_OP_writeable )
+        flags |= IOMMUF_writable;
+
+    spin_lock(&dom_iommu(d)->lock);
+
+    for (i = 0; i < op->map_pages.nr_pages; i++)
+    {
+        gfn_t gfn = _gfn(op->map_pages.gfn + i);
+        dfn_t dfn = _dfn(op->map_pages.dfn + i);
+
+        /* Lookup pages struct backing gfn */
+        ret = get_paged_frame(d, gfn, &mfn, &page, 0);
+
+        if ( ret )
+            break;
+
+        /* Check for conflict with existing mappings */
+        if ( !_iommu_lookup_page(d, dfn, &mfn, &flags, op->ctx_no) )
+        {
+            put_page(page);
+            ret = -EADDRINUSE;
+            break;
+        }
+
+        ret = _iommu_map(d, dfn, mfn, 1, flags, &flush_flags, op->ctx_no);
+
+        if ( ret )
+            break;
+    }
+
+    flush_ret = _iommu_iotlb_flush(d, _dfn(op->map_pages.dfn),
+                                   op->map_pages.nr_pages, flush_flags,
+                                   op->ctx_no);
+
+    spin_unlock(&dom_iommu(d)->lock);
+
+    op->map_pages.mapped = i;
+
+    if ( flush_ret )
+        printk("Flush operation failed (%d)\n", flush_ret);
+
+    return ret;
+}
+
+static long unmap_pages_op(struct pv_iommu_op *op, struct domain *d)
+{
+    mfn_t mfn;
+    int ret = 0, flush_ret;
+    unsigned int flags;
+    unsigned int flush_flags = 0;
+    size_t i = 0;
+
+    if ( op->unmap_pages.nr_pages > PVIOMMU_MAX_PAGES )
+        return -E2BIG;
+
+    if ( !iommu_check_context(d, op->ctx_no) )
+        return -EINVAL;
+
+    //printk("Unmapping dfn:%lx-%lx on %hu\n",
+    //       op->unmap_pages.dfn, op->unmap_pages.dfn + op->unmap_pages.nr_pages - 1,
+    //       op->ctx_no);
+
+    spin_lock(&dom_iommu(d)->lock);
+
+    for (i = 0; i < op->unmap_pages.nr_pages; i++)
+    {
+        dfn_t dfn = _dfn(op->unmap_pages.dfn + i);
+
+        /* Check if there is a valid mapping for this domain */
+        if ( _iommu_lookup_page(d, dfn, &mfn, &flags, op->ctx_no) ) {
+            ret = -ENOENT;
+            break;
+        }
+
+        ret = _iommu_unmap(d, dfn, 1, 0, &flush_flags, op->ctx_no);
+
+        if (ret)
+            break;
+
+        /* Decrement reference counter */
+        put_page(mfn_to_page(mfn));
+    }
+
+    flush_ret = _iommu_iotlb_flush(d, _dfn(op->unmap_pages.dfn),
+                                   op->unmap_pages.nr_pages, flush_flags,
+                                   op->ctx_no);
+
+    spin_unlock(&dom_iommu(d)->lock);
+
+    op->unmap_pages.unmapped = i;
+
+    if ( flush_ret )
+        printk("Flush operation failed (%d)\n", flush_ret);
+
+    return ret;
+}
+
+static long lookup_page_op(struct pv_iommu_op *op, struct domain *d)
+{
+    mfn_t mfn;
+    gfn_t gfn;
+    unsigned int flags = 0;
+
+    if ( !iommu_check_context(d, op->ctx_no) )
+        return -EINVAL;
+
+    /* Check if there is a valid BFN mapping for this domain */
+    if ( iommu_lookup_page(d, _dfn(op->lookup_page.dfn), &mfn, &flags, op->ctx_no) )
+        return -ENOENT;
+
+    gfn = mfn_to_gfn(d, mfn);
+    BUG_ON(gfn_eq(gfn, INVALID_GFN));
+
+    op->lookup_page.gfn = gfn_x(gfn);
+
+    return 0;
+}
+
+long do_iommu_sub_op(struct pv_iommu_op *op)
+{
+    struct domain *d = current->domain;
+
+    if ( !can_use_iommu_check(d) )
+        return -EPERM;
+
+    switch ( op->subop_id )
+    {
+        case 0:
+            return 0;
+
+        case IOMMUOP_query_capabilities:
+            return query_cap_op(op, d);
+
+        case IOMMUOP_alloc_context:
+            return alloc_context_op(op, d);
+
+        case IOMMUOP_free_context:
+            return free_context_op(op, d);
+
+        case IOMMUOP_reattach_device:
+            return reattach_device_op(op, d);
+
+        case IOMMUOP_map_pages:
+            return map_pages_op(op, d);
+
+        case IOMMUOP_unmap_pages:
+            return unmap_pages_op(op, d);
+
+        case IOMMUOP_lookup_page:
+            return lookup_page_op(op, d);
+
+        default:
+            return -EINVAL;
+    }
+}
+
+long do_iommu_op(XEN_GUEST_HANDLE_PARAM(void) arg)
+{
+    long ret = 0;
+    struct pv_iommu_op op;
+
+    if ( unlikely(copy_from_guest(&op, arg, 1)) )
+        return -EFAULT;
+
+    ret = do_iommu_sub_op(&op);
+
+    if ( ret == -ERESTART )
+        return hypercall_create_continuation(__HYPERVISOR_iommu_op, "h", arg);
+
+    if ( unlikely(copy_to_guest(arg, &op, 1)) )
+        return -EFAULT;
+
+    return ret;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/hypercall-defs.c b/xen/include/hypercall-defs.c
index 47c093acc8..4ba4480867 100644
--- a/xen/include/hypercall-defs.c
+++ b/xen/include/hypercall-defs.c
@@ -209,6 +209,9 @@  hypfs_op(unsigned int cmd, const char *arg1, unsigned long arg2, void *arg3, uns
 #ifdef CONFIG_X86
 xenpmu_op(unsigned int op, xen_pmu_params_t *arg)
 #endif
+#ifdef CONFIG_HAS_PASSTHROUGH
+iommu_op(void *arg)
+#endif
 
 #ifdef CONFIG_PV
 caller: pv64
@@ -295,5 +298,8 @@  mca                                do       do       -        -        -
 #ifndef CONFIG_PV_SHIM_EXCLUSIVE
 paging_domctl_cont                 do       do       do       do       -
 #endif
+#ifdef CONFIG_HAS_PASSTHROUGH
+iommu_op                           do       do       do       do       -
+#endif
 
 #endif /* !CPPCHECK */
diff --git a/xen/include/public/pv-iommu.h b/xen/include/public/pv-iommu.h
new file mode 100644
index 0000000000..45f9c44eb1
--- /dev/null
+++ b/xen/include/public/pv-iommu.h
@@ -0,0 +1,114 @@ 
+/* SPDX-License-Identifier: MIT */
+/******************************************************************************
+ * pv-iommu.h
+ *
+ * Paravirtualized IOMMU driver interface.
+ *
+ * Copyright (c) 2024 Teddy Astie <teddy.astie@vates.tech>
+ */
+
+#ifndef __XEN_PUBLIC_PV_IOMMU_H__
+#define __XEN_PUBLIC_PV_IOMMU_H__
+
+#include "xen.h"
+#include "physdev.h"
+
+#define IOMMU_DEFAULT_CONTEXT (0)
+
+/**
+ * Query PV-IOMMU capabilities for this domain.
+ */
+#define IOMMUOP_query_capabilities    1
+
+/**
+ * Allocate an IOMMU context, the new context handle will be written to ctx_no.
+ */
+#define IOMMUOP_alloc_context         2
+
+/**
+ * Destroy a IOMMU context.
+ * All devices attached to this context are reattached to default context.
+ *
+ * The default context can't be destroyed (0).
+ */
+#define IOMMUOP_free_context          3
+
+/**
+ * Reattach the device to IOMMU context.
+ */
+#define IOMMUOP_reattach_device       4
+
+#define IOMMUOP_map_pages             5
+#define IOMMUOP_unmap_pages           6
+
+/**
+ * Get the GFN associated to a specific DFN.
+ */
+#define IOMMUOP_lookup_page           7
+
/*
 * Argument structure for __HYPERVISOR_iommu_op.  Copied in from the
 * guest, updated in place by the sub-operation, and copied back out.
 */
struct pv_iommu_op {
    /* Sub-operation to perform (one of the IOMMUOP_* values above). */
    uint16_t subop_id;
    /* IOMMU context to operate on; IOMMU_DEFAULT_CONTEXT (0) is the default. */
    uint16_t ctx_no;

/**
 * Create a context that is cloned from default.
 * The new context will be populated with 1:1 mappings covering the entire guest memory.
 */
#define IOMMU_CREATE_clone (1 << 0)

/*
 * NOTE(review): IOMMU_OP_readable shares bit 0 with IOMMU_CREATE_clone,
 * so the interpretation of `flags` depends on subop_id — confirm this
 * aliasing is intended for a stable public ABI.
 */
#define IOMMU_OP_readable (1 << 0)
#define IOMMU_OP_writeable (1 << 1)
    /* Sub-operation specific flag bits (see masks above). */
    uint32_t flags;

    /*
     * Sub-operation specific payload.
     * NOTE(review): an anonymous union in a public header requires C11;
     * verify acceptability for all supported guest toolchains.
     */
    union {
        struct {
            uint64_t gfn;
            uint64_t dfn;
            /* Number of pages to map */
            uint32_t nr_pages;
            /* Number of pages actually mapped after sub-op */
            uint32_t mapped;
        } map_pages;

        struct {
            uint64_t dfn;
            /* Number of pages to unmap */
            uint32_t nr_pages;
            /* Number of pages actually unmapped after sub-op */
            uint32_t unmapped;
        } unmap_pages;

        struct {
            struct physdev_pci_device dev;
        } reattach_device;

        struct {
            uint64_t gfn;
            uint64_t dfn;
        } lookup_page;

        struct {
            /* Maximum number of IOMMU context this domain can use. */
            uint16_t max_ctx_no;
            /* Maximum number of pages that can be modified in a single map/unmap operation. */
            uint32_t max_nr_pages;
            /* Maximum device address (iova) that the guest can use for mappings. */
            uint64_t max_iova_addr;
        } cap;
    };
};

typedef struct pv_iommu_op pv_iommu_op_t;
DEFINE_XEN_GUEST_HANDLE(pv_iommu_op_t);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
\ No newline at end of file
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
index b47d48d0e2..28ab815ebc 100644
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -118,6 +118,7 @@  DEFINE_XEN_GUEST_HANDLE(xen_ulong_t);
 #define __HYPERVISOR_xenpmu_op            40
 #define __HYPERVISOR_dm_op                41
 #define __HYPERVISOR_hypfs_op             42
+#define __HYPERVISOR_iommu_op             43
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48