diff mbox series

[RFC] iommu/xen: Add Xen PV-IOMMU driver

Message ID fe36b8d36ed3bc01c78901bdf7b87a71cb1adaad.1718286176.git.teddy.astie@vates.tech (mailing list archive)
State Superseded
Headers show
Series [RFC] iommu/xen: Add Xen PV-IOMMU driver | expand

Commit Message

Teddy Astie June 13, 2024, 1:50 p.m. UTC
In the context of Xen, Linux runs as Dom0 and doesn't have access to the
machine IOMMU. Although, a IOMMU is mandatory to use some kernel features
such as VFIO or DMA protection.

In Xen, we added a paravirtualized IOMMU with iommu_op hypercall in order
to allow Dom0 to implement such feature. This commit introduces a new
IOMMU driver that uses this new hypercall interface.

Signed-off-by Teddy Astie <teddy.astie@vates.tech>
---
 arch/x86/include/asm/xen/hypercall.h |   6 +
 drivers/iommu/Kconfig                |   9 +
 drivers/iommu/Makefile               |   1 +
 drivers/iommu/xen-iommu.c            | 508 +++++++++++++++++++++++++++
 include/xen/interface/memory.h       |  33 ++
 include/xen/interface/pv-iommu.h     | 114 ++++++
 include/xen/interface/xen.h          |   1 +
 7 files changed, 672 insertions(+)
 create mode 100644 drivers/iommu/xen-iommu.c
 create mode 100644 include/xen/interface/pv-iommu.h

Comments

Jan Beulich June 13, 2024, 2:32 p.m. UTC | #1
On 13.06.2024 15:50, Teddy Astie wrote:
> @@ -214,6 +215,38 @@ struct xen_add_to_physmap_range {
>  };
>  DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap_range);
>  
> +/*
> + * With some legacy devices, certain guest-physical addresses cannot safely
> + * be used for other purposes, e.g. to map guest RAM.  This hypercall
> + * enumerates those regions so the toolstack can avoid using them.
> + */
> +#define XENMEM_reserved_device_memory_map   27
> +struct xen_reserved_device_memory {
> +    xen_pfn_t start_pfn;
> +    xen_ulong_t nr_pages;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory);
> +
> +struct xen_reserved_device_memory_map {
> +#define XENMEM_RDM_ALL 1 /* Request all regions (ignore dev union). */
> +    /* IN */
> +    uint32_t flags;
> +    /*
> +     * IN/OUT
> +     *
> +     * Gets set to the required number of entries when too low,
> +     * signaled by error code -ERANGE.
> +     */
> +    unsigned int nr_entries;
> +    /* OUT */
> +    GUEST_HANDLE(xen_reserved_device_memory) buffer;
> +    /* IN */
> +    union {
> +        struct physdev_pci_device pci;
> +    } dev;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory_map);

This is a tools-only (i.e. unstable) sub-function in Xen; even the comment
at the top says "toolstack". It is therefore not suitable for use in a
kernel.

> --- /dev/null
> +++ b/include/xen/interface/pv-iommu.h
> @@ -0,0 +1,114 @@
> +/* SPDX-License-Identifier: MIT */
> +/******************************************************************************
> + * pv-iommu.h
> + *
> + * Paravirtualized IOMMU driver interface.
> + *
> + * Copyright (c) 2024 Teddy Astie <teddy.astie@vates.tech>
> + */
> +
> +#ifndef __XEN_PUBLIC_PV_IOMMU_H__
> +#define __XEN_PUBLIC_PV_IOMMU_H__
> +
> +#include "xen.h"
> +#include "physdev.h"
> +
> +#define IOMMU_DEFAULT_CONTEXT (0)
> +
> +/**
> + * Query PV-IOMMU capabilities for this domain.
> + */
> +#define IOMMUOP_query_capabilities    1
> +
> +/**
> + * Allocate an IOMMU context, the new context handle will be written to ctx_no.
> + */
> +#define IOMMUOP_alloc_context         2
> +
> +/**
> + * Destroy a IOMMU context.
> + * All devices attached to this context are reattached to default context.
> + *
> + * The default context can't be destroyed (0).
> + */
> +#define IOMMUOP_free_context          3
> +
> +/**
> + * Reattach the device to IOMMU context.
> + */
> +#define IOMMUOP_reattach_device       4
> +
> +#define IOMMUOP_map_pages             5
> +#define IOMMUOP_unmap_pages           6
> +
> +/**
> + * Get the GFN associated to a specific DFN.
> + */
> +#define IOMMUOP_lookup_page           7
> +
> +struct pv_iommu_op {
> +    uint16_t subop_id;
> +    uint16_t ctx_no;
> +
> +/**
> + * Create a context that is cloned from default.
> + * The new context will be populated with 1:1 mappings covering the entire guest memory.
> + */
> +#define IOMMU_CREATE_clone (1 << 0)
> +
> +#define IOMMU_OP_readable (1 << 0)
> +#define IOMMU_OP_writeable (1 << 1)
> +    uint32_t flags;
> +
> +    union {
> +        struct {
> +            uint64_t gfn;
> +            uint64_t dfn;
> +            /* Number of pages to map */
> +            uint32_t nr_pages;
> +            /* Number of pages actually mapped after sub-op */
> +            uint32_t mapped;
> +        } map_pages;
> +
> +        struct {
> +            uint64_t dfn;
> +            /* Number of pages to unmap */
> +            uint32_t nr_pages;
> +            /* Number of pages actually unmapped after sub-op */
> +            uint32_t unmapped;
> +        } unmap_pages;
> +
> +        struct {
> +            struct physdev_pci_device dev;
> +        } reattach_device;
> +
> +        struct {
> +            uint64_t gfn;
> +            uint64_t dfn;
> +        } lookup_page;
> +
> +        struct {
> +            /* Maximum number of IOMMU context this domain can use. */
> +            uint16_t max_ctx_no;
> +            /* Maximum number of pages that can be modified in a single map/unmap operation. */
> +            uint32_t max_nr_pages;
> +            /* Maximum device address (iova) that the guest can use for mappings. */
> +            uint64_t max_iova_addr;
> +        } cap;
> +    };
> +};
> +
> +typedef struct pv_iommu_op pv_iommu_op_t;
> +DEFINE_GUEST_HANDLE_STRUCT(pv_iommu_op_t);
> +
> +#endif
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-file-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> \ No newline at end of file

Nit: I'm pretty sure you want to avoid this.

Jan
Teddy Astie June 17, 2024, 1:36 p.m. UTC | #2
Le 13/06/2024 à 16:32, Jan Beulich a écrit :
> On 13.06.2024 15:50, Teddy Astie wrote:
>> @@ -214,6 +215,38 @@ struct xen_add_to_physmap_range {
>>   };
>>   DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap_range);
>>   
>> +/*
>> + * With some legacy devices, certain guest-physical addresses cannot safely
>> + * be used for other purposes, e.g. to map guest RAM.  This hypercall
>> + * enumerates those regions so the toolstack can avoid using them.
>> + */
>> +#define XENMEM_reserved_device_memory_map   27
>> +struct xen_reserved_device_memory {
>> +    xen_pfn_t start_pfn;
>> +    xen_ulong_t nr_pages;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory);
>> +
>> +struct xen_reserved_device_memory_map {
>> +#define XENMEM_RDM_ALL 1 /* Request all regions (ignore dev union). */
>> +    /* IN */
>> +    uint32_t flags;
>> +    /*
>> +     * IN/OUT
>> +     *
>> +     * Gets set to the required number of entries when too low,
>> +     * signaled by error code -ERANGE.
>> +     */
>> +    unsigned int nr_entries;
>> +    /* OUT */
>> +    GUEST_HANDLE(xen_reserved_device_memory) buffer;
>> +    /* IN */
>> +    union {
>> +        struct physdev_pci_device pci;
>> +    } dev;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory_map);
> 
> This is a tools-only (i.e. unstable) sub-function in Xen; even the comment
> at the top says "toolstack". It is therefore not suitable for use in a
> kernel.
> 
IMO this comment actually describes how the toolstack uses the 
hypercall, but I don't think it is actually reserved for toolstack use.
Or maybe we should allow the kernel to use this hypercall as well.

>> +
>> +/*
>> + * Local variables:
>> + * mode: C
>> + * c-file-style: "BSD"
>> + * c-basic-offset: 4
>> + * tab-width: 4
>> + * indent-tabs-mode: nil
>> + * End:
>> + */
>> \ No newline at end of file
> 
> Nit: I'm pretty sure you want to avoid this.
> 
Indeed.

Teddy


Teddy Astie | Vates XCP-ng Intern

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
Jan Beulich June 17, 2024, 1:43 p.m. UTC | #3
On 17.06.2024 15:36, Teddy Astie wrote:
> Le 13/06/2024 à 16:32, Jan Beulich a écrit :
>> On 13.06.2024 15:50, Teddy Astie wrote:
>>> @@ -214,6 +215,38 @@ struct xen_add_to_physmap_range {
>>>   };
>>>   DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap_range);
>>>   
>>> +/*
>>> + * With some legacy devices, certain guest-physical addresses cannot safely
>>> + * be used for other purposes, e.g. to map guest RAM.  This hypercall
>>> + * enumerates those regions so the toolstack can avoid using them.
>>> + */
>>> +#define XENMEM_reserved_device_memory_map   27
>>> +struct xen_reserved_device_memory {
>>> +    xen_pfn_t start_pfn;
>>> +    xen_ulong_t nr_pages;
>>> +};
>>> +DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory);
>>> +
>>> +struct xen_reserved_device_memory_map {
>>> +#define XENMEM_RDM_ALL 1 /* Request all regions (ignore dev union). */
>>> +    /* IN */
>>> +    uint32_t flags;
>>> +    /*
>>> +     * IN/OUT
>>> +     *
>>> +     * Gets set to the required number of entries when too low,
>>> +     * signaled by error code -ERANGE.
>>> +     */
>>> +    unsigned int nr_entries;
>>> +    /* OUT */
>>> +    GUEST_HANDLE(xen_reserved_device_memory) buffer;
>>> +    /* IN */
>>> +    union {
>>> +        struct physdev_pci_device pci;
>>> +    } dev;
>>> +};
>>> +DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory_map);
>>
>> This is a tools-only (i.e. unstable) sub-function in Xen; even the comment
>> at the top says "toolstack". It is therefore not suitable for use in a
>> kernel.
>>
> IMO this comment actually describes how the toolstack uses the 
> hypercall, but I don't think it is actually reserved for toolstack use.

Well, the canonical version of the header is quite explicit about this,
by having the definition in a __XEN__ || __XEN_TOOLS__ section.

> Or maybe we should allow the kernel to use this hypercall as well.

That's an option to consider.

Jan
Jason Gunthorpe June 19, 2024, 4:30 p.m. UTC | #4
On Thu, Jun 13, 2024 at 01:50:22PM +0000, Teddy Astie wrote:

> +struct iommu_domain *xen_iommu_domain_alloc(unsigned type)
> +{
> +	struct xen_iommu_domain *domain;
> +	u16 ctx_no;
> +	int ret;
> +
> +	if (type & IOMMU_DOMAIN_IDENTITY) {
> +		/* use default domain */
> +		ctx_no = 0;

Please use the new ops, domain_alloc_paging and the static identity domain.

> +static struct iommu_group *xen_iommu_device_group(struct device *dev)
> +{
> +	if (!dev_is_pci(dev))
> +		return ERR_PTR(-ENODEV);
> +

device_group is only called after probe_device, since you already
exclude !pci during probe there is no need for this wrapper, just set
the op directly to pci_device_group.

> +static void xen_iommu_release_device(struct device *dev)
> +{
> +	int ret;
> +	struct pci_dev *pdev;
> +	struct pv_iommu_op op = {
> +		.subop_id = IOMMUOP_reattach_device,
> +		.flags = 0,
> +		.ctx_no = 0 /* reattach device back to default context */
> +	};

Consider if you can use release_domain for this, I think this is
probably a BLOCKED domain behavior.

> +	if (!dev_is_pci(dev))
> +		return;

No op is ever called on a non-probed device, remove all these checks.

> +static int xen_iommu_map_pages(struct iommu_domain *domain, unsigned long iova,
> +							   phys_addr_t paddr, size_t pgsize, size_t pgcount,
> +							   int prot, gfp_t gfp, size_t *mapped)
> +{
> +	size_t xen_pg_count = (pgsize / XEN_PAGE_SIZE) * pgcount;
> +	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
> +	struct pv_iommu_op op = {
> +		.subop_id = IOMMUOP_map_pages,
> +		.flags = 0,
> +		.ctx_no = dom->ctx_no
> +	};
> +	/* NOTE: paddr is actually bound to pfn, not gfn */
> +	uint64_t pfn = addr_to_pfn(paddr);
> +	uint64_t dfn = addr_to_pfn(iova);
> +	int ret = 0;
> +
> +	if (WARN(!dom->ctx_no, "Tried to map page to default context"))
> +		return -EINVAL;

A paging domain should be the only domain ops that have a populated
map so this should be made impossible by construction.

Jason
Teddy Astie June 21, 2024, 3:09 p.m. UTC | #5
Hello Jason,

Le 19/06/2024 à 18:30, Jason Gunthorpe a écrit :
> On Thu, Jun 13, 2024 at 01:50:22PM +0000, Teddy Astie wrote:
> 
>> +struct iommu_domain *xen_iommu_domain_alloc(unsigned type)
>> +{
>> +	struct xen_iommu_domain *domain;
>> +	u16 ctx_no;
>> +	int ret;
>> +
>> +	if (type & IOMMU_DOMAIN_IDENTITY) {
>> +		/* use default domain */
>> +		ctx_no = 0;
> 
> Please use the new ops, domain_alloc_paging and the static identity domain.

Yes, in the v2, I will use this newer interface.

I have a question on this new interface : is it valid to not have a 
identity domain (and "default domain" being blocking); well in the 
current implementation it doesn't really matter, but at some point, we 
may want to allow not having it (thus making this driver mandatory).

> 
>> +static struct iommu_group *xen_iommu_device_group(struct device *dev)
>> +{
>> +	if (!dev_is_pci(dev))
>> +		return ERR_PTR(-ENODEV);
>> +
> 
> device_group is only called after probe_device, since you already
> exclude !pci during probe there is no need for this wrapper, just set
> the op directly to pci_device_group.
> 
>> +	if (!dev_is_pci(dev))
>> +		return;
> 
> No op is ever called on a non-probed device, remove all these checks.
>
> 
> A paging domain should be the only domain ops that have a populated
> map so this should be made impossible by construction
Makes sense, will remove these redundant checks in v2.

> 
>> +static void xen_iommu_release_device(struct device *dev)
>> +{
>> +	int ret;
>> +	struct pci_dev *pdev;
>> +	struct pv_iommu_op op = {
>> +		.subop_id = IOMMUOP_reattach_device,
>> +		.flags = 0,
>> +		.ctx_no = 0 /* reattach device back to default context */
>> +	};
> 
> Consider if you can use release_domain for this, I think this is
> probably a BLOCKED domain behavior.

The goal is to put back all devices where they were at the beginning 
(the default "context"), which is what release_domain looks like it is 
doing. Will use it for v2.

> 
> Jason

Teddy


Teddy Astie | Vates XCP-ng Intern

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
Baolu Lu June 23, 2024, 3:21 a.m. UTC | #6
On 6/21/24 11:09 PM, Teddy Astie wrote:
> Le 19/06/2024 à 18:30, Jason Gunthorpe a écrit :
>> On Thu, Jun 13, 2024 at 01:50:22PM +0000, Teddy Astie wrote:
>>
>>> +struct iommu_domain *xen_iommu_domain_alloc(unsigned type)
>>> +{
>>> +	struct xen_iommu_domain *domain;
>>> +	u16 ctx_no;
>>> +	int ret;
>>> +
>>> +	if (type & IOMMU_DOMAIN_IDENTITY) {
>>> +		/* use default domain */
>>> +		ctx_no = 0;
>> Please use the new ops, domain_alloc_paging and the static identity domain.
> Yes, in the v2, I will use this newer interface.
> 
> I have a question on this new interface : is it valid to not have a
> identity domain (and "default domain" being blocking); well in the
> current implementation it doesn't really matter, but at some point, we
> may want to allow not having it (thus making this driver mandatory).

It's valid to not have an identity domain if "default domain being
blocking" means a paging domain with no mappings.

In the iommu driver's iommu_ops::def_domain_type callback, just always
return IOMMU_DOMAIN_DMA, which indicates that the iommu driver doesn't
support identity translation.

Best regards,
baolu
Robin Murphy June 24, 2024, 11:09 a.m. UTC | #7
On 2024-06-23 4:21 am, Baolu Lu wrote:
> On 6/21/24 11:09 PM, Teddy Astie wrote:
>> Le 19/06/2024 à 18:30, Jason Gunthorpe a écrit :
>>> On Thu, Jun 13, 2024 at 01:50:22PM +0000, Teddy Astie wrote:
>>>
>>>> +struct iommu_domain *xen_iommu_domain_alloc(unsigned type)
>>>> +{
>>>> +    struct xen_iommu_domain *domain;
>>>> +    u16 ctx_no;
>>>> +    int ret;
>>>> +
>>>> +    if (type & IOMMU_DOMAIN_IDENTITY) {
>>>> +        /* use default domain */
>>>> +        ctx_no = 0;
>>> Please use the new ops, domain_alloc_paging and the static identity 
>>> domain.
>> Yes, in the v2, I will use this newer interface.
>>
>> I have a question on this new interface : is it valid to not have a
>> identity domain (and "default domain" being blocking); well in the
>> current implementation it doesn't really matter, but at some point, we
>> may want to allow not having it (thus making this driver mandatory).
> 
> It's valid to not have an identity domain if "default domain being
> blocking" means a paging domain with no mappings.
> 
> In the iommu driver's iommu_ops::def_domain_type callback, just always
> return IOMMU_DOMAIN_DMA, which indicates that the iommu driver doesn't
> support identity translation.

That's not necessary - if neither ops->identity_domain nor 
ops->domain_alloc(IOMMU_DOMAIN_IDENTITY) gives a valid domain then we 
fall back to IOMMU_DOMAIN_DMA anyway.

Thanks,
Robin.
Baolu Lu June 25, 2024, 1:18 a.m. UTC | #8
On 6/24/24 7:09 PM, Robin Murphy wrote:
> On 2024-06-23 4:21 am, Baolu Lu wrote:
>> On 6/21/24 11:09 PM, Teddy Astie wrote:
>>> Le 19/06/2024 à 18:30, Jason Gunthorpe a écrit :
>>>> On Thu, Jun 13, 2024 at 01:50:22PM +0000, Teddy Astie wrote:
>>>>
>>>>> +struct iommu_domain *xen_iommu_domain_alloc(unsigned type)
>>>>> +{
>>>>> +    struct xen_iommu_domain *domain;
>>>>> +    u16 ctx_no;
>>>>> +    int ret;
>>>>> +
>>>>> +    if (type & IOMMU_DOMAIN_IDENTITY) {
>>>>> +        /* use default domain */
>>>>> +        ctx_no = 0;
>>>> Please use the new ops, domain_alloc_paging and the static identity 
>>>> domain.
>>> Yes, in the v2, I will use this newer interface.
>>>
>>> I have a question on this new interface : is it valid to not have a
>>> identity domain (and "default domain" being blocking); well in the
>>> current implementation it doesn't really matter, but at some point, we
>>> may want to allow not having it (thus making this driver mandatory).
>>
>> It's valid to not have an identity domain if "default domain being
>> blocking" means a paging domain with no mappings.
>>
>> In the iommu driver's iommu_ops::def_domain_type callback, just always
>> return IOMMU_DOMAIN_DMA, which indicates that the iommu driver doesn't
>> support identity translation.
> 
> That's not necessary - if neither ops->identity_domain nor 
> ops->domain_alloc(IOMMU_DOMAIN_IDENTITY) gives a valid domain then we 
> fall back to IOMMU_DOMAIN_DMA anyway.

Yes. That's true.

Best regards,
baolu
diff mbox series

Patch

diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index a2dd24947eb8..6b1857f27c14 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -490,6 +490,12 @@  HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
 	return _hypercall2(int, xenpmu_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_iommu_op(void *arg)
+{
+	return _hypercall1(int, iommu_op, arg);
+}
+
 static inline int
 HYPERVISOR_dm_op(
 	domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs)
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 2b12b583ef4b..8d8a22b91e34 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -482,6 +482,15 @@  config VIRTIO_IOMMU
 
 	  Say Y here if you intend to run this kernel as a guest.
 
+config XEN_IOMMU
+	bool "Xen IOMMU driver"
+	depends on XEN_DOM0
+	select IOMMU_API
+	help
+		Xen PV-IOMMU driver for Dom0.
+
+		Say Y here if you intend to run this guest as Xen Dom0.
+
 config SPRD_IOMMU
 	tristate "Unisoc IOMMU Support"
 	depends on ARCH_SPRD || COMPILE_TEST
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 769e43d780ce..11fa258d3a04 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -30,3 +30,4 @@  obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
 obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o io-pgfault.o
 obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o
 obj-$(CONFIG_APPLE_DART) += apple-dart.o
+obj-$(CONFIG_XEN_IOMMU) += xen-iommu.o
\ No newline at end of file
diff --git a/drivers/iommu/xen-iommu.c b/drivers/iommu/xen-iommu.c
new file mode 100644
index 000000000000..2c8e42240a6b
--- /dev/null
+++ b/drivers/iommu/xen-iommu.c
@@ -0,0 +1,508 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Xen PV-IOMMU driver.
+ *
+ * Copyright (C) 2024 Vates SAS
+ *
+ * Author: Teddy Astie <teddy.astie@vates.tech>
+ *
+ */
+
+#define pr_fmt(fmt)	"xen-iommu: " fmt
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/iommu.h>
+#include <linux/dma-map-ops.h>
+#include <linux/pci.h>
+#include <linux/list.h>
+#include <linux/string.h>
+#include <linux/device/driver.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/printk.h>
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/minmax.h>
+#include <linux/string.h>
+#include <asm/iommu.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/pv-iommu.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+MODULE_DESCRIPTION("Xen IOMMU driver");
+MODULE_AUTHOR("Teddy Astie <teddy.astie@vates.tech>");
+MODULE_LICENSE("GPL");
+
+#define MSI_RANGE_START		(0xfee00000)
+#define MSI_RANGE_END		(0xfeefffff)
+
+#define XEN_IOMMU_PGSIZES       (0x1000)
+
+struct xen_iommu_domain {
+	struct iommu_domain domain;
+
+	u16 ctx_no; /* Xen PV-IOMMU context number */
+};
+
+static struct iommu_device xen_iommu_device;
+
+static uint32_t max_nr_pages;
+static uint64_t max_iova_addr;
+
+static spinlock_t lock;
+
+static inline struct xen_iommu_domain *to_xen_iommu_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct xen_iommu_domain, domain);
+}
+
+static inline u64 addr_to_pfn(u64 addr)
+{
+	return addr >> 12;
+}
+
+static inline u64 pfn_to_addr(u64 pfn)
+{
+	return pfn << 12;
+}
+
+bool xen_iommu_capable(struct device *dev, enum iommu_cap cap)
+{
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+		return true;
+
+	default:
+		return false;
+	}
+}
+
+struct iommu_domain *xen_iommu_domain_alloc(unsigned type)
+{
+	struct xen_iommu_domain *domain;
+	u16 ctx_no;
+	int ret;
+
+	if (type & IOMMU_DOMAIN_IDENTITY) {
+		/* use default domain */
+		ctx_no = 0;
+	} else {
+		struct pv_iommu_op op = {
+			.ctx_no = 0,
+			.flags = 0,
+			.subop_id = IOMMUOP_alloc_context
+		};
+
+		ret = HYPERVISOR_iommu_op(&op);
+
+		if (ret) {
+			pr_err("Unable to create Xen IOMMU context (%d)", ret);
+			return ERR_PTR(ret);
+		}
+
+		ctx_no = op.ctx_no;
+	}
+
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+
+	domain->ctx_no = ctx_no;
+
+	domain->domain.geometry.aperture_start = 0;
+
+	domain->domain.geometry.aperture_end = max_iova_addr;
+	domain->domain.geometry.force_aperture = true;
+
+	return &domain->domain;
+}
+
+static struct iommu_group *xen_iommu_device_group(struct device *dev)
+{
+	if (!dev_is_pci(dev))
+		return ERR_PTR(-ENODEV);
+
+	return pci_device_group(dev);
+}
+
+static struct iommu_device *xen_iommu_probe_device(struct device *dev)
+{
+	if (!dev_is_pci(dev))
+		return ERR_PTR(-ENODEV);
+
+	return &xen_iommu_device;
+}
+
+static void xen_iommu_probe_finalize(struct device *dev)
+{
+	set_dma_ops(dev, NULL);
+	iommu_setup_dma_ops(dev, 0, max_iova_addr);
+}
+
+static void xen_iommu_release_device(struct device *dev)
+{
+	int ret;
+	struct pci_dev *pdev;
+	struct pv_iommu_op op = {
+		.subop_id = IOMMUOP_reattach_device,
+		.flags = 0,
+		.ctx_no = 0 /* reattach device back to default context */
+	};
+
+	if (!dev_is_pci(dev))
+		return;
+
+	pdev = to_pci_dev(dev);
+
+	op.reattach_device.dev.seg = pci_domain_nr(pdev->bus);
+	op.reattach_device.dev.bus = pdev->bus->number;
+	op.reattach_device.dev.devfn = pdev->devfn;
+
+	ret = HYPERVISOR_iommu_op(&op);
+
+	if (ret)
+		pr_warn("Unable to release device %p\n", &op.reattach_device.dev);
+}
+
+static int xen_iommu_map_pages(struct iommu_domain *domain, unsigned long iova,
+							   phys_addr_t paddr, size_t pgsize, size_t pgcount,
+							   int prot, gfp_t gfp, size_t *mapped)
+{
+	size_t xen_pg_count = (pgsize / XEN_PAGE_SIZE) * pgcount;
+	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+	struct pv_iommu_op op = {
+		.subop_id = IOMMUOP_map_pages,
+		.flags = 0,
+		.ctx_no = dom->ctx_no
+	};
+	/* NOTE: paddr is actually bound to pfn, not gfn */
+	uint64_t pfn = addr_to_pfn(paddr);
+	uint64_t dfn = addr_to_pfn(iova);
+	int ret = 0;
+
+	if (WARN(!dom->ctx_no, "Tried to map page to default context"))
+		return -EINVAL;
+
+	//pr_info("Mapping to %lx %zu %zu paddr %x\n", iova, pgsize, pgcount, paddr);
+
+	if (prot & IOMMU_READ)
+		op.flags |= IOMMU_OP_readable;
+
+	if (prot & IOMMU_WRITE)
+		op.flags |= IOMMU_OP_writeable;
+
+	while (xen_pg_count) {
+		size_t to_map = min(xen_pg_count, max_nr_pages);
+		uint64_t gfn = pfn_to_gfn(pfn);
+
+		//pr_info("Mapping %lx-%lx at %lx-%lx\n", gfn, gfn + to_map - 1, dfn, dfn + to_map - 1);
+
+		op.map_pages.gfn = gfn;
+		op.map_pages.dfn = dfn;
+
+		op.map_pages.nr_pages = to_map;
+
+		ret = HYPERVISOR_iommu_op(&op);
+
+		//pr_info("map_pages.mapped = %u\n", op.map_pages.mapped);
+
+		if (mapped)
+			*mapped += XEN_PAGE_SIZE * op.map_pages.mapped;
+
+		if (ret)
+			break;
+
+		xen_pg_count -= to_map;
+
+		pfn += to_map;
+		dfn += to_map;
+	}
+
+	return ret;
+}
+
+static size_t xen_iommu_unmap_pages(struct iommu_domain *domain, unsigned long iova,
+									size_t pgsize, size_t pgcount,
+									struct iommu_iotlb_gather *iotlb_gather)
+{
+	size_t xen_pg_count = (pgsize / XEN_PAGE_SIZE) * pgcount;
+	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+	struct pv_iommu_op op = {
+		.subop_id = IOMMUOP_unmap_pages,
+		.ctx_no = dom->ctx_no,
+		.flags = 0,
+	};
+	uint64_t dfn = addr_to_pfn(iova);
+	int ret = 0;
+
+	if (WARN(!dom->ctx_no, "Tried to unmap page to default context"))
+		return -EINVAL;
+
+	while (xen_pg_count) {
+		size_t to_unmap = min(xen_pg_count, max_nr_pages);
+
+		//pr_info("Unmapping %lx-%lx\n", dfn, dfn + to_unmap - 1);
+
+		op.unmap_pages.dfn = dfn;
+		op.unmap_pages.nr_pages = to_unmap;
+
+		ret = HYPERVISOR_iommu_op(&op);
+
+		if (ret)
+			pr_warn("Unmap failure (%lx-%lx)\n", dfn, dfn + to_unmap - 1);
+
+		xen_pg_count -= to_unmap;
+
+		dfn += to_unmap;
+	}
+
+	return pgcount * pgsize;
+}
+
+int xen_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	struct pci_dev *pdev;
+	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+	struct pv_iommu_op op = {
+		.subop_id = IOMMUOP_reattach_device,
+		.flags = 0,
+		.ctx_no = dom->ctx_no,
+	};
+
+	if (!dev_is_pci(dev))
+		return -EINVAL;
+
+	pdev = to_pci_dev(dev);
+
+	op.reattach_device.dev.seg = pci_domain_nr(pdev->bus);
+	op.reattach_device.dev.bus = pdev->bus->number;
+	op.reattach_device.dev.devfn = pdev->devfn;
+
+	return HYPERVISOR_iommu_op(&op);
+}
+
+static void xen_iommu_free(struct iommu_domain *domain)
+{
+	int ret;
+	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+
+	if (dom->ctx_no != 0) {
+		struct pv_iommu_op op = {
+			.ctx_no = dom->ctx_no,
+			.flags = 0,
+			.subop_id = IOMMUOP_free_context
+		};
+
+		ret = HYPERVISOR_iommu_op(&op);
+
+		if (ret)
+			pr_err("Context %hu destruction failure\n", dom->ctx_no);
+	}
+
+	kfree(domain);
+}
+
+static phys_addr_t xen_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
+{
+	int ret;
+	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+
+	struct pv_iommu_op op = {
+		.ctx_no = dom->ctx_no,
+		.flags = 0,
+		.subop_id = IOMMUOP_lookup_page,
+	};
+
+	op.lookup_page.dfn = addr_to_pfn(iova);
+
+	ret = HYPERVISOR_iommu_op(&op);
+
+	if (ret)
+		return 0;
+
+	phys_addr_t page_addr = pfn_to_addr(gfn_to_pfn(op.lookup_page.gfn));
+
+	/* Consider non-aligned iova */
+	return page_addr + (iova & 0xFFF);
+}
+
+static void xen_iommu_get_resv_regions(struct device *dev, struct list_head *head)
+{
+	struct iommu_resv_region *reg;
+	struct xen_reserved_device_memory *entries;
+	struct xen_reserved_device_memory_map map;
+	struct pci_dev *pdev;
+	int ret, i;
+
+	if (!dev_is_pci(dev))
+		return;
+
+	pdev = to_pci_dev(dev);
+
+	reg = iommu_alloc_resv_region(MSI_RANGE_START,
+		MSI_RANGE_END - MSI_RANGE_START + 1,
+		0, IOMMU_RESV_MSI, GFP_KERNEL);
+
+	if (!reg)
+		return;
+
+	list_add_tail(&reg->list, head);
+
+	/* Map xen-specific entries */
+
+	/* First, get number of entries to map */
+	map.buffer = NULL;
+	map.nr_entries = 0;
+	map.flags = 0;
+
+	map.dev.pci.seg = pci_domain_nr(pdev->bus);
+	map.dev.pci.bus = pdev->bus->number;
+	map.dev.pci.devfn = pdev->devfn;
+
+	ret = HYPERVISOR_memory_op(XENMEM_reserved_device_memory_map, &map);
+
+	if (ret == 0)
+		/* No reserved region, nothing to do */
+		return;
+
+	if (ret != -ENOBUFS) {
+		pr_err("Unable to get reserved region count (%d)\n", ret);
+		return;
+	}
+
+	/* Assume a reasonable number of entries, otherwise, something is probably wrong */
+	if (WARN_ON(map.nr_entries > 256))
+		pr_warn("Xen reporting many reserved regions (%u)\n", map.nr_entries);
+
+	/* And finally get actual mappings */
+	entries = kcalloc(map.nr_entries, sizeof(struct xen_reserved_device_memory),
+					  GFP_KERNEL);
+
+	if (!entries) {
+		pr_err("No memory for map entries\n");
+		return;
+	}
+
+	map.buffer = entries;
+
+	ret = HYPERVISOR_memory_op(XENMEM_reserved_device_memory_map, &map);
+
+	if (ret != 0) {
+		pr_err("Unable to get reserved regions (%d)\n", ret);
+		kfree(entries);
+		return;
+	}
+
+	for (i = 0; i < map.nr_entries; i++) {
+		struct xen_reserved_device_memory entry = entries[i];
+
+		reg = iommu_alloc_resv_region(pfn_to_addr(entry.start_pfn),
+									  pfn_to_addr(entry.nr_pages),
+									  0, IOMMU_RESV_RESERVED, GFP_KERNEL);
+
+		if (!reg)
+			break;
+
+		list_add_tail(&reg->list, head);
+	}
+
+	kfree(entries);
+}
+
+static struct iommu_ops xen_iommu_ops = {
+	.capable = xen_iommu_capable,
+	.domain_alloc = xen_iommu_domain_alloc,
+	.probe_device = xen_iommu_probe_device,
+	.probe_finalize = xen_iommu_probe_finalize,
+	.device_group = xen_iommu_device_group,
+	.release_device = xen_iommu_release_device,
+	.get_resv_regions = xen_iommu_get_resv_regions,
+	.pgsize_bitmap = XEN_IOMMU_PGSIZES,
+	.default_domain_ops = &(const struct iommu_domain_ops) {
+		.map_pages = xen_iommu_map_pages,
+		.unmap_pages = xen_iommu_unmap_pages,
+		.attach_dev = xen_iommu_attach_dev,
+		.iova_to_phys = xen_iommu_iova_to_phys,
+		.free = xen_iommu_free,
+	},
+};
+
+int __init xen_iommu_init(void)
+{
+	int ret;
+	struct pv_iommu_op op = {
+		.subop_id = IOMMUOP_query_capabilities
+	};
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	/* Check if iommu_op is supported */
+	if (HYPERVISOR_iommu_op(&op) == -ENOSYS)
+		return -ENODEV; /* No Xen IOMMU hardware */
+
+	pr_info("Initialising Xen IOMMU driver\n");
+	pr_info("max_nr_pages=%d\n", op.cap.max_nr_pages);
+	pr_info("max_ctx_no=%d\n", op.cap.max_ctx_no);
+	pr_info("max_iova_addr=%llx\n", op.cap.max_iova_addr);
+
+	if (op.cap.max_ctx_no == 0) {
+		pr_err("Unable to use IOMMU PV driver (no context available)\n");
+		return -ENOTSUPP; /* Unable to use IOMMU PV ? */
+	}
+
+	if (xen_domain_type == XEN_PV_DOMAIN)
+		/* TODO: In PV domain, due to the existing pfn-gfn mapping we need to
+		 * consider that under certains circonstances, we have :
+		 *   pfn_to_gfn(x + 1) != pfn_to_gfn(x) + 1
+		 *
+		 * In these cases, we would want to separate the subop into several calls.
+		 * (only doing the grouped operation when the mapping is actually contigous)
+		 * Only map operation would be affected, as unmap actually uses dfn which
+		 * doesn't have this kind of mapping.
+		 *
+		 * Force single-page operations to work arround this issue for now.
+		 */
+		max_nr_pages = 1;
+	else
+		/* With HVM domains, pfn_to_gfn is identity, there is no issue regarding this. */
+		max_nr_pages = op.cap.max_nr_pages;
+
+	max_iova_addr = op.cap.max_iova_addr;
+
+	spin_lock_init(&lock);
+
+	ret = iommu_device_sysfs_add(&xen_iommu_device, NULL, NULL, "xen-iommu");
+	if (ret) {
+		pr_err("Unable to add Xen IOMMU sysfs\n");
+		return ret;
+	}
+
+	ret = iommu_device_register(&xen_iommu_device, &xen_iommu_ops, NULL);
+	if (ret) {
+		pr_err("Unable to register Xen IOMMU device %d\n", ret);
+		iommu_device_sysfs_remove(&xen_iommu_device);
+		return ret;
+	}
+
+	/* swiotlb is redundant when IOMMU is active. */
+	x86_swiotlb_enable = false;
+
+	return 0;
+}
+
+void __exit xen_iommu_fini(void)
+{
+	pr_info("Unregistering Xen IOMMU driver\n");
+
+	iommu_device_unregister(&xen_iommu_device);
+	iommu_device_sysfs_remove(&xen_iommu_device);
+}
+
+module_init(xen_iommu_init);
+module_exit(xen_iommu_fini);
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index 1a371a825c55..08571add426b 100644
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -10,6 +10,7 @@ 
 #ifndef __XEN_PUBLIC_MEMORY_H__
 #define __XEN_PUBLIC_MEMORY_H__
 
+#include "xen/interface/physdev.h"
 #include <linux/spinlock.h>
 
 /*
@@ -214,6 +215,38 @@  struct xen_add_to_physmap_range {
 };
 DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap_range);
 
+/*
+ * With some legacy devices, certain guest-physical addresses cannot safely
+ * be used for other purposes, e.g. to map guest RAM.  This hypercall
+ * enumerates those regions so the toolstack can avoid using them.
+ */
+#define XENMEM_reserved_device_memory_map   27
+struct xen_reserved_device_memory {
+    xen_pfn_t start_pfn;
+    xen_ulong_t nr_pages;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory);
+
+struct xen_reserved_device_memory_map {
+#define XENMEM_RDM_ALL 1 /* Request all regions (ignore dev union). */
+    /* IN */
+    uint32_t flags;
+    /*
+     * IN/OUT
+     *
+     * Gets set to the required number of entries when too low,
+     * signaled by error code -ERANGE.
+     */
+    unsigned int nr_entries;
+    /* OUT */
+    GUEST_HANDLE(xen_reserved_device_memory) buffer;
+    /* IN */
+    union {
+        struct physdev_pci_device pci;
+    } dev;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory_map);
+
 /*
  * Returns the pseudo-physical memory map as it was when the domain
  * was started (specified by XENMEM_set_memory_map).
diff --git a/include/xen/interface/pv-iommu.h b/include/xen/interface/pv-iommu.h
new file mode 100644
index 000000000000..5560609d0e7a
--- /dev/null
+++ b/include/xen/interface/pv-iommu.h
@@ -0,0 +1,114 @@ 
+/* SPDX-License-Identifier: MIT */
+/******************************************************************************
+ * pv-iommu.h
+ *
+ * Paravirtualized IOMMU driver interface.
+ *
+ * Copyright (c) 2024 Teddy Astie <teddy.astie@vates.tech>
+ */
+
+#ifndef __XEN_PUBLIC_PV_IOMMU_H__
+#define __XEN_PUBLIC_PV_IOMMU_H__
+
+#include "xen.h"
+#include "physdev.h"
+
+#define IOMMU_DEFAULT_CONTEXT (0)
+
+/**
+ * Query PV-IOMMU capabilities for this domain.
+ */
+#define IOMMUOP_query_capabilities    1
+
+/**
+ * Allocate an IOMMU context, the new context handle will be written to ctx_no.
+ */
+#define IOMMUOP_alloc_context         2
+
+/**
+ * Destroy a IOMMU context.
+ * All devices attached to this context are reattached to default context.
+ *
+ * The default context can't be destroyed (0).
+ */
+#define IOMMUOP_free_context          3
+
+/**
+ * Reattach the device to IOMMU context.
+ */
+#define IOMMUOP_reattach_device       4
+
+#define IOMMUOP_map_pages             5
+#define IOMMUOP_unmap_pages           6
+
+/**
+ * Get the GFN associated to a specific DFN.
+ */
+#define IOMMUOP_lookup_page           7
+
+struct pv_iommu_op {
+    uint16_t subop_id;
+    uint16_t ctx_no;
+
+/**
+ * Create a context that is cloned from default.
+ * The new context will be populated with 1:1 mappings covering the entire guest memory.
+ */
+#define IOMMU_CREATE_clone (1 << 0)
+
+#define IOMMU_OP_readable (1 << 0)
+#define IOMMU_OP_writeable (1 << 1)
+    uint32_t flags;
+
+    union {
+        struct {
+            uint64_t gfn;
+            uint64_t dfn;
+            /* Number of pages to map */
+            uint32_t nr_pages;
+            /* Number of pages actually mapped after sub-op */
+            uint32_t mapped;
+        } map_pages;
+
+        struct {
+            uint64_t dfn;
+            /* Number of pages to unmap */
+            uint32_t nr_pages;
+            /* Number of pages actually unmapped after sub-op */
+            uint32_t unmapped;
+        } unmap_pages;
+
+        struct {
+            struct physdev_pci_device dev;
+        } reattach_device;
+
+        struct {
+            uint64_t gfn;
+            uint64_t dfn;
+        } lookup_page;
+
+        struct {
+            /* Maximum number of IOMMU context this domain can use. */
+            uint16_t max_ctx_no;
+            /* Maximum number of pages that can be modified in a single map/unmap operation. */
+            uint32_t max_nr_pages;
+            /* Maximum device address (iova) that the guest can use for mappings. */
+            uint64_t max_iova_addr;
+        } cap;
+    };
+};
+
+typedef struct pv_iommu_op pv_iommu_op_t;
+DEFINE_GUEST_HANDLE_STRUCT(pv_iommu_op_t);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
\ No newline at end of file
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 0ca23eca2a9c..8b1daf3fecc6 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -65,6 +65,7 @@ 
 #define __HYPERVISOR_xc_reserved_op       39 /* reserved for XenClient */
 #define __HYPERVISOR_xenpmu_op            40
 #define __HYPERVISOR_dm_op                41
+#define __HYPERVISOR_iommu_op 					  43
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48