diff mbox series

[RFC,v2] iommu/xen: Add Xen PV-IOMMU driver

Message ID 24d7ec005e77e4e0127995ba6f4ad16f33737fa5.1718981216.git.teddy.astie@vates.tech (mailing list archive)
State New, archived
Headers show
Series [RFC,v2] iommu/xen: Add Xen PV-IOMMU driver | expand

Commit Message

Teddy Astie June 21, 2024, 4:08 p.m. UTC
From: Teddy Astie <teddy.astie@vates.tech>

In the context of Xen, Linux runs as Dom0 and doesn't have access to the
machine IOMMU. However, an IOMMU is mandatory to use some kernel features
such as VFIO or DMA protection.

In Xen, we added a paravirtualized IOMMU with an iommu_op hypercall in order
to allow Dom0 to implement such features. This commit introduces a new IOMMU
driver that uses this new hypercall interface.

Signed-off-by: Teddy Astie <teddy.astie@vates.tech>
---
Changes since v1:
* formatting changes
* applied Jan Beulich's proposed changes: removed vim notes at end of pv-iommu.h
* applied Jason Gunthorpe's proposed changes: use new ops and remove redundant
checks
---
 arch/x86/include/asm/xen/hypercall.h |   6 +
 drivers/iommu/Kconfig                |   9 +
 drivers/iommu/Makefile               |   1 +
 drivers/iommu/xen-iommu.c            | 489 +++++++++++++++++++++++++++
 include/xen/interface/memory.h       |  33 ++
 include/xen/interface/pv-iommu.h     | 104 ++++++
 include/xen/interface/xen.h          |   1 +
 7 files changed, 643 insertions(+)
 create mode 100644 drivers/iommu/xen-iommu.c
 create mode 100644 include/xen/interface/pv-iommu.h

Comments

Robin Murphy June 21, 2024, 11:28 p.m. UTC | #1
On 2024-06-21 5:08 pm, TSnake41 wrote:
> From: Teddy Astie <teddy.astie@vates.tech>
> 
> In the context of Xen, Linux runs as Dom0 and doesn't have access to the
> machine IOMMU. Although, a IOMMU is mandatory to use some kernel features
> such as VFIO or DMA protection.
> 
> In Xen, we added a paravirtualized IOMMU with iommu_op hypercall in order to
> allow Dom0 to implement such feature. This commit introduces a new IOMMU driver
> that uses this new hypercall interface.
> 
> Signed-off-by Teddy Astie <teddy.astie@vates.tech>
> ---
> Changes since v1 :
> * formatting changes
> * applied Jan Beulich proposed changes : removed vim notes at end of pv-iommu.h
> * applied Jason Gunthorpe proposed changes : use new ops and remove redundant
> checks
> ---
>   arch/x86/include/asm/xen/hypercall.h |   6 +
>   drivers/iommu/Kconfig                |   9 +
>   drivers/iommu/Makefile               |   1 +
>   drivers/iommu/xen-iommu.c            | 489 +++++++++++++++++++++++++++
>   include/xen/interface/memory.h       |  33 ++
>   include/xen/interface/pv-iommu.h     | 104 ++++++
>   include/xen/interface/xen.h          |   1 +
>   7 files changed, 643 insertions(+)
>   create mode 100644 drivers/iommu/xen-iommu.c
>   create mode 100644 include/xen/interface/pv-iommu.h
> 
> diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
> index a2dd24947eb8..6b1857f27c14 100644
> --- a/arch/x86/include/asm/xen/hypercall.h
> +++ b/arch/x86/include/asm/xen/hypercall.h
> @@ -490,6 +490,12 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
>   	return _hypercall2(int, xenpmu_op, op, arg);
>   }
>   
> +static inline int
> +HYPERVISOR_iommu_op(void *arg)
> +{
> +	return _hypercall1(int, iommu_op, arg);
> +}
> +
>   static inline int
>   HYPERVISOR_dm_op(
>   	domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs)
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 0af39bbbe3a3..242cefac77c9 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -480,6 +480,15 @@ config VIRTIO_IOMMU
>   
>   	  Say Y here if you intend to run this kernel as a guest.
>   
> +config XEN_IOMMU
> +	bool "Xen IOMMU driver"
> +	depends on XEN_DOM0

Clearly this depends on X86 as well.

> +	select IOMMU_API
> +	help
> +		Xen PV-IOMMU driver for Dom0.
> +
> +		Say Y here if you intend to run this guest as Xen Dom0.
> +
>   config SPRD_IOMMU
>   	tristate "Unisoc IOMMU Support"
>   	depends on ARCH_SPRD || COMPILE_TEST
> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
> index 542760d963ec..393afe22c901 100644
> --- a/drivers/iommu/Makefile
> +++ b/drivers/iommu/Makefile
> @@ -30,3 +30,4 @@ obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
>   obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o
>   obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o
>   obj-$(CONFIG_APPLE_DART) += apple-dart.o
> +obj-$(CONFIG_XEN_IOMMU) += xen-iommu.o
> \ No newline at end of file
> diff --git a/drivers/iommu/xen-iommu.c b/drivers/iommu/xen-iommu.c
> new file mode 100644
> index 000000000000..b765445d27cd
> --- /dev/null
> +++ b/drivers/iommu/xen-iommu.c
> @@ -0,0 +1,489 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Xen PV-IOMMU driver.
> + *
> + * Copyright (C) 2024 Vates SAS
> + *
> + * Author: Teddy Astie <teddy.astie@vates.tech>
> + *
> + */
> +
> +#define pr_fmt(fmt)	"xen-iommu: " fmt
> +
> +#include <linux/kernel.h>
> +#include <linux/init.h>
> +#include <linux/types.h>
> +#include <linux/iommu.h>
> +#include <linux/dma-map-ops.h>

Please drop this; it's a driver, not a DMA ops implementation.

> +#include <linux/pci.h>
> +#include <linux/list.h>
> +#include <linux/string.h>
> +#include <linux/device/driver.h>
> +#include <linux/slab.h>
> +#include <linux/err.h>
> +#include <linux/printk.h>
> +#include <linux/stddef.h>
> +#include <linux/spinlock.h>
> +#include <linux/minmax.h>
> +#include <linux/string.h>
> +#include <asm/iommu.h>
> +
> +#include <xen/xen.h>
> +#include <xen/page.h>
> +#include <xen/interface/memory.h>
> +#include <xen/interface/physdev.h>
> +#include <xen/interface/pv-iommu.h>
> +#include <asm/xen/hypercall.h>
> +#include <asm/xen/page.h>
> +
> +MODULE_DESCRIPTION("Xen IOMMU driver");
> +MODULE_AUTHOR("Teddy Astie <teddy.astie@vates.tech>");
> +MODULE_LICENSE("GPL");
> +
> +#define MSI_RANGE_START		(0xfee00000)
> +#define MSI_RANGE_END		(0xfeefffff)
> +
> +#define XEN_IOMMU_PGSIZES       (0x1000)
> +
> +struct xen_iommu_domain {
> +	struct iommu_domain domain;
> +
> +	u16 ctx_no; /* Xen PV-IOMMU context number */
> +};
> +
> +static struct iommu_device xen_iommu_device;
> +
> +static uint32_t max_nr_pages;
> +static uint64_t max_iova_addr;
> +
> +static spinlock_t lock;

Not a great name - usually it's good to name a lock after what it 
protects. Although perhaps it is already, since AFAICS this isn't 
actually used anywhere anyway.

> +static inline struct xen_iommu_domain *to_xen_iommu_domain(struct iommu_domain *dom)
> +{
> +	return container_of(dom, struct xen_iommu_domain, domain);
> +}
> +
> +static inline u64 addr_to_pfn(u64 addr)
> +{
> +	return addr >> 12;
> +}
> +
> +static inline u64 pfn_to_addr(u64 pfn)
> +{
> +	return pfn << 12;
> +}
> +
> +bool xen_iommu_capable(struct device *dev, enum iommu_cap cap)
> +{
> +	switch (cap) {
> +	case IOMMU_CAP_CACHE_COHERENCY:
> +		return true;

Will the PV-IOMMU only ever be exposed on hardware where that really is 
always true?

> +
> +	default:
> +		return false;
> +	}
> +}
> +
> +struct iommu_domain *xen_iommu_domain_alloc_paging(struct device *dev)
> +{
> +	struct xen_iommu_domain *domain;
> +	int ret;
> +
> +	struct pv_iommu_op op = {
> +		.ctx_no = 0,
> +		.flags = 0,
> +		.subop_id = IOMMUOP_alloc_context
> +	};
> +
> +	ret = HYPERVISOR_iommu_op(&op);
> +
> +	if (ret) {
> +		pr_err("Unable to create Xen IOMMU context (%d)", ret);
> +		return ERR_PTR(ret);
> +	}
> +
> +	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
> +
> +	domain->ctx_no = op.ctx_no;
> +
> +	domain->domain.geometry.aperture_start = 0;
> +	domain->domain.geometry.aperture_end = max_iova_addr;
> +	domain->domain.geometry.force_aperture = true;
> +
> +	return &domain->domain;
> +}
> +
> +static struct iommu_device *xen_iommu_probe_device(struct device *dev)
> +{
> +	if (!dev_is_pci(dev))
> +		return ERR_PTR(-ENODEV);
> +
> +	return &xen_iommu_device;

Even emulated PCI devices have to have an (emulated, presumably) IOMMU?

> +}
> +
> +static void xen_iommu_probe_finalize(struct device *dev)
> +{
> +	set_dma_ops(dev, NULL);
> +	iommu_setup_dma_ops(dev, 0, max_iova_addr);

This shouldn't even compile... anyway, core code does this now, please 
drop the whole callback.

> +}
> +
> +static int xen_iommu_map_pages(struct iommu_domain *domain, unsigned long iova,
> +			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
> +			       int prot, gfp_t gfp, size_t *mapped)
> +{
> +	size_t xen_pg_count = (pgsize / XEN_PAGE_SIZE) * pgcount;

You only advertise the one page size, so you'll always get that back, 
and this seems a bit redundant.

> +	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
> +	struct pv_iommu_op op = {
> +		.subop_id = IOMMUOP_map_pages,
> +		.flags = 0,
> +		.ctx_no = dom->ctx_no
> +	};
> +	/* NOTE: paddr is actually bound to pfn, not gfn */
> +	uint64_t pfn = addr_to_pfn(paddr);
> +	uint64_t dfn = addr_to_pfn(iova);
> +	int ret = 0;
> +
> +	//pr_info("Mapping to %lx %zu %zu paddr %x\n", iova, pgsize, pgcount, paddr);

Please try to clean up debugging leftovers before posting the patch (but 
also note that there are already tracepoints and debug messages which 
can be enabled in the core code to give visibility of most of this.)

> +
> +	if (prot & IOMMU_READ)
> +		op.flags |= IOMMU_OP_readable;
> +
> +	if (prot & IOMMU_WRITE)
> +		op.flags |= IOMMU_OP_writeable;
> +
> +	while (xen_pg_count) {

Unless you're super-concerned about performance already, you don't 
really need to worry about looping here - you can happily return short 
as long as you've mapped *something*, and the core code will call you 
back again with the remainder. But it also doesn't complicate things 
*too* much as it is, so feel free to leave it in if you want to.
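
For illustration, the simpler shape could look roughly like this (just a
sketch against the pv_iommu_op layout in this patch, assuming the single
advertised page size; not tested):

static int xen_iommu_map_pages(struct iommu_domain *domain, unsigned long iova,
			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
			       int prot, gfp_t gfp, size_t *mapped)
{
	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
	struct pv_iommu_op op = {
		.subop_id = IOMMUOP_map_pages,
		.ctx_no = dom->ctx_no,
	};
	int ret;

	if (prot & IOMMU_READ)
		op.flags |= IOMMU_OP_readable;
	if (prot & IOMMU_WRITE)
		op.flags |= IOMMU_OP_writeable;

	/* One hypercall per call; the core retries with whatever is left. */
	op.map_pages.gfn = pfn_to_gfn(addr_to_pfn(paddr));
	op.map_pages.dfn = addr_to_pfn(iova);
	op.map_pages.nr_pages = min_t(size_t, pgcount, max_nr_pages);

	ret = HYPERVISOR_iommu_op(&op);

	/* Report what actually got mapped, even on error. */
	*mapped = (size_t)op.map_pages.mapped * XEN_PAGE_SIZE;

	return ret;
}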

> +		size_t to_map = min(xen_pg_count, max_nr_pages);
> +		uint64_t gfn = pfn_to_gfn(pfn);
> +
> +		//pr_info("Mapping %lx-%lx at %lx-%lx\n", gfn, gfn + to_map - 1, dfn, dfn + to_map - 1);
> +
> +		op.map_pages.gfn = gfn;
> +		op.map_pages.dfn = dfn;
> +
> +		op.map_pages.nr_pages = to_map;
> +
> +		ret = HYPERVISOR_iommu_op(&op);
> +
> +		//pr_info("map_pages.mapped = %u\n", op.map_pages.mapped);
> +
> +		if (mapped)
> +			*mapped += XEN_PAGE_SIZE * op.map_pages.mapped;
> +
> +		if (ret)
> +			break;
> +
> +		xen_pg_count -= to_map;
> +
> +		pfn += to_map;
> +		dfn += to_map;
> +	}
> +
> +	return ret;
> +}
> +
> +static size_t xen_iommu_unmap_pages(struct iommu_domain *domain, unsigned long iova,
> +				    size_t pgsize, size_t pgcount,
> +				    struct iommu_iotlb_gather *iotlb_gather)
> +{
> +	size_t xen_pg_count = (pgsize / XEN_PAGE_SIZE) * pgcount;
> +	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
> +	struct pv_iommu_op op = {
> +		.subop_id = IOMMUOP_unmap_pages,
> +		.ctx_no = dom->ctx_no,
> +		.flags = 0,
> +	};
> +	uint64_t dfn = addr_to_pfn(iova);
> +	int ret = 0;
> +
> +	if (WARN(!dom->ctx_no, "Tried to unmap page to default context"))
> +		return -EINVAL;

This would go hilariously wrong... the return value here is bytes 
successfully unmapped, a total failure should return 0. But then how 
would it ever happen anyway? Unmap is a domain op, so a domain which 
doesn't allow unmapping shouldn't offer it in the first place...

> +	while (xen_pg_count) {
> +		size_t to_unmap = min(xen_pg_count, max_nr_pages);
> +
> +		//pr_info("Unmapping %lx-%lx\n", dfn, dfn + to_unmap - 1);
> +
> +		op.unmap_pages.dfn = dfn;
> +		op.unmap_pages.nr_pages = to_unmap;
> +
> +		ret = HYPERVISOR_iommu_op(&op);
> +
> +		if (ret)
> +			pr_warn("Unmap failure (%lx-%lx)\n", dfn, dfn + to_unmap - 1);

In this case I'd argue that you really *do* want to return short, in the 
hope of propagating the error back up and letting the caller know the 
address space is now messed up before things start blowing up even more 
if they keep going and subsequently try to map new pages into 
not-actually-unmapped VAs.
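
Something along these lines, perhaps (sketch only - and if the hypercall can
report a partial count on failure, that should be folded in as well):

static size_t xen_iommu_unmap_pages(struct iommu_domain *domain, unsigned long iova,
				    size_t pgsize, size_t pgcount,
				    struct iommu_iotlb_gather *iotlb_gather)
{
	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
	struct pv_iommu_op op = {
		.subop_id = IOMMUOP_unmap_pages,
		.ctx_no = dom->ctx_no,
	};
	uint64_t dfn = addr_to_pfn(iova);
	size_t unmapped = 0;

	while (unmapped < pgcount) {
		size_t to_unmap = min_t(size_t, pgcount - unmapped, max_nr_pages);

		op.unmap_pages.dfn = dfn + unmapped;
		op.unmap_pages.nr_pages = to_unmap;

		/* Stop and return short so the caller sees the failure. */
		if (HYPERVISOR_iommu_op(&op))
			break;

		unmapped += to_unmap;
	}

	return unmapped * pgsize;
}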

> +
> +		xen_pg_count -= to_unmap;
> +
> +		dfn += to_unmap;
> +	}
> +
> +	return pgcount * pgsize;
> +}
> +
> +int xen_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
> +{
> +	struct pci_dev *pdev;
> +	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
> +	struct pv_iommu_op op = {
> +		.subop_id = IOMMUOP_reattach_device,
> +		.flags = 0,
> +		.ctx_no = dom->ctx_no,
> +	};
> +
> +	pdev = to_pci_dev(dev);
> +
> +	op.reattach_device.dev.seg = pci_domain_nr(pdev->bus);
> +	op.reattach_device.dev.bus = pdev->bus->number;
> +	op.reattach_device.dev.devfn = pdev->devfn;
> +
> +	return HYPERVISOR_iommu_op(&op);
> +}
> +
> +static void xen_iommu_free(struct iommu_domain *domain)
> +{
> +	int ret;
> +	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
> +
> +	if (dom->ctx_no != 0) {

Much like unmap above, this not being true would imply that someone's 
managed to go round the back of the core code to get the .free op from a 
validly-allocated domain and then pass something other than that domain 
to it. Personally I'd consider that a level of brokenness that's not 
even worth trying to consider at all, but if you want to go as far as 
determining that you *have* clearly been given something you couldn't 
have allocated, then trying to kfree() it probably isn't wise either.

> +		struct pv_iommu_op op = {
> +			.ctx_no = dom->ctx_no,
> +			.flags = 0,
> +			.subop_id = IOMMUOP_free_context
> +		};
> +
> +		ret = HYPERVISOR_iommu_op(&op);
> +
> +		if (ret)
> +			pr_err("Context %hu destruction failure\n", dom->ctx_no);
> +	}
> +
> +	kfree(domain);
> +}
> +
> +static phys_addr_t xen_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
> +{
> +	int ret;
> +	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
> +
> +	struct pv_iommu_op op = {
> +		.ctx_no = dom->ctx_no,
> +		.flags = 0,
> +		.subop_id = IOMMUOP_lookup_page,
> +	};
> +
> +	op.lookup_page.dfn = addr_to_pfn(iova);
> +
> +	ret = HYPERVISOR_iommu_op(&op);
> +
> +	if (ret)
> +		return 0;
> +
> +	phys_addr_t page_addr = pfn_to_addr(gfn_to_pfn(op.lookup_page.gfn));
> +
> +	/* Consider non-aligned iova */
> +	return page_addr + (iova & 0xFFF);
> +}
> +
> +static void xen_iommu_get_resv_regions(struct device *dev, struct list_head *head)
> +{
> +	struct iommu_resv_region *reg;
> +	struct xen_reserved_device_memory *entries;
> +	struct xen_reserved_device_memory_map map;
> +	struct pci_dev *pdev;
> +	int ret, i;
> +
> +	pdev = to_pci_dev(dev);
> +
> +	reg = iommu_alloc_resv_region(MSI_RANGE_START,
> +				      MSI_RANGE_END - MSI_RANGE_START + 1,
> +				      0, IOMMU_RESV_MSI, GFP_KERNEL);
> +
> +	if (!reg)
> +		return;
> +
> +	list_add_tail(&reg->list, head);
> +
> +	/* Map xen-specific entries */
> +
> +	/* First, get number of entries to map */
> +	map.buffer = NULL;
> +	map.nr_entries = 0;
> +	map.flags = 0;
> +
> +	map.dev.pci.seg = pci_domain_nr(pdev->bus);
> +	map.dev.pci.bus = pdev->bus->number;
> +	map.dev.pci.devfn = pdev->devfn;
> +
> +	ret = HYPERVISOR_memory_op(XENMEM_reserved_device_memory_map, &map);
> +
> +	if (ret == 0)
> +		/* No reserved region, nothing to do */
> +		return;
> +
> +	if (ret != -ENOBUFS) {
> +		pr_err("Unable to get reserved region count (%d)\n", ret);
> +		return;
> +	}
> +
> +	/* Assume a reasonable number of entries, otherwise, something is probably wrong */
> +	if (WARN_ON(map.nr_entries > 256))
> +		pr_warn("Xen reporting many reserved regions (%u)\n", map.nr_entries);
> +
> +	/* And finally get actual mappings */
> +	entries = kcalloc(map.nr_entries, sizeof(struct xen_reserved_device_memory),
> +					  GFP_KERNEL);
> +
> +	if (!entries) {
> +		pr_err("No memory for map entries\n");
> +		return;
> +	}
> +
> +	map.buffer = entries;
> +
> +	ret = HYPERVISOR_memory_op(XENMEM_reserved_device_memory_map, &map);
> +
> +	if (ret != 0) {
> +		pr_err("Unable to get reserved regions (%d)\n", ret);
> +		kfree(entries);
> +		return;
> +	}
> +
> +	for (i = 0; i < map.nr_entries; i++) {
> +		struct xen_reserved_device_memory entry = entries[i];
> +
> +		reg = iommu_alloc_resv_region(pfn_to_addr(entry.start_pfn),
> +					      pfn_to_addr(entry.nr_pages),
> +					      0, IOMMU_RESV_RESERVED, GFP_KERNEL);
> +
> +		if (!reg)
> +			break;
> +
> +		list_add_tail(&reg->list, head);
> +	}
> +
> +	kfree(entries);
> +}
> +
> +static int default_domain_attach_dev(struct iommu_domain *domain,
> +				     struct device *dev)
> +{
> +	int ret;
> +	struct pci_dev *pdev;
> +	struct pv_iommu_op op = {
> +		.subop_id = IOMMUOP_reattach_device,
> +		.flags = 0,
> +		.ctx_no = 0 /* reattach device back to default context */
> +	};
> +
> +	pdev = to_pci_dev(dev);
> +
> +	op.reattach_device.dev.seg = pci_domain_nr(pdev->bus);
> +	op.reattach_device.dev.bus = pdev->bus->number;
> +	op.reattach_device.dev.devfn = pdev->devfn;
> +
> +	ret = HYPERVISOR_iommu_op(&op);
> +
> +	if (ret)
> +		pr_warn("Unable to release device %p\n", &op.reattach_device.dev);
> +
> +	return ret;
> +}
> +
> +static struct iommu_domain default_domain = {
> +	.ops = &(const struct iommu_domain_ops){
> +		.attach_dev = default_domain_attach_dev
> +	}
> +};

Looks like you could make it a static xen_iommu_domain and just use the 
normal attach callback? Either way please name it something less 
confusing like xen_iommu_identity_domain - "default" is far too 
overloaded round here already...
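
i.e. something like (sketch, untested):

static struct xen_iommu_domain xen_iommu_identity_domain = {
	.domain = {
		.type = IOMMU_DOMAIN_IDENTITY,
		.ops = &(const struct iommu_domain_ops) {
			.attach_dev = xen_iommu_attach_dev,
		},
	},
	.ctx_no = 0,	/* Xen default context */
};

with .identity_domain and .release_domain in xen_iommu_ops then pointing at
&xen_iommu_identity_domain.domain.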

> +static struct iommu_ops xen_iommu_ops = {
> +	.identity_domain = &default_domain,
> +	.release_domain = &default_domain,
> +	.capable = xen_iommu_capable,
> +	.domain_alloc_paging = xen_iommu_domain_alloc_paging,
> +	.probe_device = xen_iommu_probe_device,
> +	.probe_finalize = xen_iommu_probe_finalize,
> +	.device_group = pci_device_group,
> +	.get_resv_regions = xen_iommu_get_resv_regions,
> +	.pgsize_bitmap = XEN_IOMMU_PGSIZES,
> +	.default_domain_ops = &(const struct iommu_domain_ops) {
> +		.map_pages = xen_iommu_map_pages,
> +		.unmap_pages = xen_iommu_unmap_pages,
> +		.attach_dev = xen_iommu_attach_dev,
> +		.iova_to_phys = xen_iommu_iova_to_phys,
> +		.free = xen_iommu_free,
> +	},
> +};
> +
> +int __init xen_iommu_init(void)
> +{
> +	int ret;
> +	struct pv_iommu_op op = {
> +		.subop_id = IOMMUOP_query_capabilities
> +	};
> +
> +	if (!xen_domain())
> +		return -ENODEV;
> +
> +	/* Check if iommu_op is supported */
> +	if (HYPERVISOR_iommu_op(&op) == -ENOSYS)
> +		return -ENODEV; /* No Xen IOMMU hardware */
> +
> +	pr_info("Initialising Xen IOMMU driver\n");
> +	pr_info("max_nr_pages=%d\n", op.cap.max_nr_pages);
> +	pr_info("max_ctx_no=%d\n", op.cap.max_ctx_no);
> +	pr_info("max_iova_addr=%llx\n", op.cap.max_iova_addr);
> +
> +	if (op.cap.max_ctx_no == 0) {
> +		pr_err("Unable to use IOMMU PV driver (no context available)\n");
> +		return -ENOTSUPP; /* Unable to use IOMMU PV ? */
> +	}
> +
> +	if (xen_domain_type == XEN_PV_DOMAIN)
> +		/* TODO: In PV domain, due to the existing pfn-gfn mapping we need to
> +		 * consider that under certains circonstances, we have :
> +		 *   pfn_to_gfn(x + 1) != pfn_to_gfn(x) + 1
> +		 *
> +		 * In these cases, we would want to separate the subop into several calls.
> +		 * (only doing the grouped operation when the mapping is actually contigous)
> +		 * Only map operation would be affected, as unmap actually uses dfn which
> +		 * doesn't have this kind of mapping.
> +		 *
> +		 * Force single-page operations to work arround this issue for now.
> +		 */
> +		max_nr_pages = 1;
> +	else
> +		/* With HVM domains, pfn_to_gfn is identity, there is no issue regarding this. */
> +		max_nr_pages = op.cap.max_nr_pages;
> +
> +	max_iova_addr = op.cap.max_iova_addr;
> +
> +	spin_lock_init(&lock);
> +
> +	ret = iommu_device_sysfs_add(&xen_iommu_device, NULL, NULL, "xen-iommu");
> +	if (ret) {
> +		pr_err("Unable to add Xen IOMMU sysfs\n");
> +		return ret;
> +	}
> +
> +	ret = iommu_device_register(&xen_iommu_device, &xen_iommu_ops, NULL);
> +	if (ret) {
> +		pr_err("Unable to register Xen IOMMU device %d\n", ret);
> +		iommu_device_sysfs_remove(&xen_iommu_device);
> +		return ret;
> +	}
> +
> +	/* swiotlb is redundant when IOMMU is active. */
> +	x86_swiotlb_enable = false;

That's not always true, but either way if this is at 
module_init/device_initcall time then it's too late to make any 
difference anyway.

> +
> +	return 0;
> +}
> +
> +void __exit xen_iommu_fini(void)
> +{
> +	pr_info("Unregistering Xen IOMMU driver\n");
> +
> +	iommu_device_unregister(&xen_iommu_device);
> +	iommu_device_sysfs_remove(&xen_iommu_device);
> +}

This is dead code since the Kconfig is only "bool". Either allow it to 
be an actual module (and make sure that works), or drop the pretence 
altogether.

Thanks,
Robin.
Teddy Astie June 24, 2024, 2:36 p.m. UTC | #2
Hello Robin,
Thanks for the thorough review.

>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>> index 0af39bbbe3a3..242cefac77c9 100644
>> --- a/drivers/iommu/Kconfig
>> +++ b/drivers/iommu/Kconfig
>> @@ -480,6 +480,15 @@ config VIRTIO_IOMMU
>>         Say Y here if you intend to run this kernel as a guest.
>> +config XEN_IOMMU
>> +    bool "Xen IOMMU driver"
>> +    depends on XEN_DOM0
> 
> Clearly this depends on X86 as well.
> 
Well, I don't intend this driver to be X86-only, even though the current 
Xen RFC doesn't support ARM (yet). Unless there is a reason against it?

>> +#include <linux/kernel.h>
>> +#include <linux/init.h>
>> +#include <linux/types.h>
>> +#include <linux/iommu.h>
>> +#include <linux/dma-map-ops.h>
> 
> Please drop this; it's a driver, not a DMA ops implementation.
> 
Sure, along with some other unneeded headers.

>> +#include <linux/pci.h>
>> +#include <linux/list.h>
>> +#include <linux/string.h>
>> +#include <linux/device/driver.h>
>> +#include <linux/slab.h>
>> +#include <linux/err.h>
>> +#include <linux/printk.h>
>> +#include <linux/stddef.h>
>> +#include <linux/spinlock.h>
>> +#include <linux/minmax.h>
>> +#include <linux/string.h>
>> +#include <asm/iommu.h>
>> +
>> +#include <xen/xen.h>
>> +#include <xen/page.h>
>> +#include <xen/interface/memory.h>
>> +#include <xen/interface/physdev.h>
>> +#include <xen/interface/pv-iommu.h>
>> +#include <asm/xen/hypercall.h>
>> +#include <asm/xen/page.h>
>> +
>> +MODULE_DESCRIPTION("Xen IOMMU driver");
>> +MODULE_AUTHOR("Teddy Astie <teddy.astie@vates.tech>");
>> +MODULE_LICENSE("GPL");
>> +
>> +#define MSI_RANGE_START        (0xfee00000)
>> +#define MSI_RANGE_END        (0xfeefffff)
>> +
>> +#define XEN_IOMMU_PGSIZES       (0x1000)
>> +
>> +struct xen_iommu_domain {
>> +    struct iommu_domain domain;
>> +
>> +    u16 ctx_no; /* Xen PV-IOMMU context number */
>> +};
>> +
>> +static struct iommu_device xen_iommu_device;
>> +
>> +static uint32_t max_nr_pages;
>> +static uint64_t max_iova_addr;
>> +
>> +static spinlock_t lock;
> 
> Not a great name - usually it's good to name a lock after what it 
> protects. Although perhaps it is already, since AFAICS this isn't 
> actually used anywhere anyway.
> 

Yes, that shouldn't be there.

>> +static inline struct xen_iommu_domain *to_xen_iommu_domain(struct 
>> iommu_domain *dom)
>> +{
>> +    return container_of(dom, struct xen_iommu_domain, domain);
>> +}
>> +
>> +static inline u64 addr_to_pfn(u64 addr)
>> +{
>> +    return addr >> 12;
>> +}
>> +
>> +static inline u64 pfn_to_addr(u64 pfn)
>> +{
>> +    return pfn << 12;
>> +}
>> +
>> +bool xen_iommu_capable(struct device *dev, enum iommu_cap cap)
>> +{
>> +    switch (cap) {
>> +    case IOMMU_CAP_CACHE_COHERENCY:
>> +        return true;
> 
> Will the PV-IOMMU only ever be exposed on hardware where that really is 
> always true?
> 

On the hypervisor side, the PV-IOMMU interface always implicitly flushes 
the IOMMU hardware on map/unmap operations, so at the end of the hypercall 
the cache should always be coherent IMO.

>> +
>> +static struct iommu_device *xen_iommu_probe_device(struct device *dev)
>> +{
>> +    if (!dev_is_pci(dev))
>> +        return ERR_PTR(-ENODEV);
>> +
>> +    return &xen_iommu_device;
> 
> Even emulated PCI devices have to have an (emulated, presumably) IOMMU?
> 

No and that's indeed one thing to check.

>> +}
>> +
>> +static void xen_iommu_probe_finalize(struct device *dev)
>> +{
>> +    set_dma_ops(dev, NULL);
>> +    iommu_setup_dma_ops(dev, 0, max_iova_addr);
> 

That got changed in 6.10; good to know that this code is not required 
anymore.

> 
>> +}
>> +
>> +static int xen_iommu_map_pages(struct iommu_domain *domain, unsigned 
>> long iova,
>> +                   phys_addr_t paddr, size_t pgsize, size_t pgcount,
>> +                   int prot, gfp_t gfp, size_t *mapped)
>> +{
>> +    size_t xen_pg_count = (pgsize / XEN_PAGE_SIZE) * pgcount;
> 
> You only advertise the one page size, so you'll always get that back, 
> and this seems a bit redundant.
> 

Yes

>> +    struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
>> +    struct pv_iommu_op op = {
>> +        .subop_id = IOMMUOP_map_pages,
>> +        .flags = 0,
>> +        .ctx_no = dom->ctx_no
>> +    };
>> +    /* NOTE: paddr is actually bound to pfn, not gfn */
>> +    uint64_t pfn = addr_to_pfn(paddr);
>> +    uint64_t dfn = addr_to_pfn(iova);
>> +    int ret = 0;
>> +
>> +    //pr_info("Mapping to %lx %zu %zu paddr %x\n", iova, pgsize, 
>> pgcount, paddr);
> 
> Please try to clean up debugging leftovers before posting the patch (but 
> also note that there are already tracepoints and debug messages which 
> can be enabled in the core code to give visibility of most of this.)
> 

Yes

>> +
>> +    if (prot & IOMMU_READ)
>> +        op.flags |= IOMMU_OP_readable;
>> +
>> +    if (prot & IOMMU_WRITE)
>> +        op.flags |= IOMMU_OP_writeable;
>> +
>> +    while (xen_pg_count) {
> 
> Unless you're super-concerned about performance already, you don't 
> really need to worry about looping here - you can happily return short 
> as long as you've mapped *something*, and the core code will call you 
> back again with the remainder. But it also doesn't complicate things 
> *too* much as it is, so feel free to leave it in if you want to.
> 

Ok

>> +        size_t to_map = min(xen_pg_count, max_nr_pages);
>> +        uint64_t gfn = pfn_to_gfn(pfn);
>> +
>> +        //pr_info("Mapping %lx-%lx at %lx-%lx\n", gfn, gfn + to_map - 
>> 1, dfn, dfn + to_map - 1);
>> +
>> +        op.map_pages.gfn = gfn;
>> +        op.map_pages.dfn = dfn;
>> +
>> +        op.map_pages.nr_pages = to_map;
>> +
>> +        ret = HYPERVISOR_iommu_op(&op);
>> +
>> +        //pr_info("map_pages.mapped = %u\n", op.map_pages.mapped);
>> +
>> +        if (mapped)
>> +            *mapped += XEN_PAGE_SIZE * op.map_pages.mapped;
>> +
>> +        if (ret)
>> +            break;
>> +
>> +        xen_pg_count -= to_map;
>> +
>> +        pfn += to_map;
>> +        dfn += to_map;
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static size_t xen_iommu_unmap_pages(struct iommu_domain *domain, 
>> unsigned long iova,
>> +                    size_t pgsize, size_t pgcount,
>> +                    struct iommu_iotlb_gather *iotlb_gather)
>> +{
>> +    size_t xen_pg_count = (pgsize / XEN_PAGE_SIZE) * pgcount;
>> +    struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
>> +    struct pv_iommu_op op = {
>> +        .subop_id = IOMMUOP_unmap_pages,
>> +        .ctx_no = dom->ctx_no,
>> +        .flags = 0,
>> +    };
>> +    uint64_t dfn = addr_to_pfn(iova);
>> +    int ret = 0;
>> +
>> +    if (WARN(!dom->ctx_no, "Tried to unmap page to default context"))
>> +        return -EINVAL;
> 
> This would go hilariously wrong... the return value here is bytes 
> successfully unmapped, a total failure should return 0.

This check is not useful as the core code is never going to call it on 
this domain.

> 
>> +    while (xen_pg_count) {
>> +        size_t to_unmap = min(xen_pg_count, max_nr_pages);
>> +
>> +        //pr_info("Unmapping %lx-%lx\n", dfn, dfn + to_unmap - 1);
>> +
>> +        op.unmap_pages.dfn = dfn;
>> +        op.unmap_pages.nr_pages = to_unmap;
>> +
>> +        ret = HYPERVISOR_iommu_op(&op);
>> +
>> +        if (ret)
>> +            pr_warn("Unmap failure (%lx-%lx)\n", dfn, dfn + to_unmap 
>> - 1);
> 
> But then how would it ever happen anyway? Unmap is a domain op, so a 
> domain which doesn't allow unmapping shouldn't offer it in the first 
> place...

Unmap failing should be exceptional, but it is possible e.g. with 
transparent superpages (as the Xen IOMMU drivers use). The Xen drivers fold 
suitable contiguous mappings into superpage entries to optimize memory 
usage and the IOTLB. However, if you unmap in the middle of a region 
covered by a superpage entry, that entry is no longer valid, and you need 
to allocate and fill the lower levels, which can fail if memory is lacking.

> In this case I'd argue that you really *do* want to return short, in the 
> hope of propagating the error back up and letting the caller know the 
> address space is now messed up before things start blowing up even more 
> if they keep going and subsequently try to map new pages into 
> not-actually-unmapped VAs.

While mapping on top of another mapping is OK for us (it just overrides 
the previous mapping), I definitely agree that having the address space 
messed up is not good.

> 
>> +
>> +        xen_pg_count -= to_unmap;
>> +
>> +        dfn += to_unmap;
>> +    }
>> +
>> +    return pgcount * pgsize;
>> +}
>> +
>> +int xen_iommu_attach_dev(struct iommu_domain *domain, struct device 
>> *dev)
>> +{
>> +    struct pci_dev *pdev;
>> +    struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
>> +    struct pv_iommu_op op = {
>> +        .subop_id = IOMMUOP_reattach_device,
>> +        .flags = 0,
>> +        .ctx_no = dom->ctx_no,
>> +    };
>> +
>> +    pdev = to_pci_dev(dev);
>> +
>> +    op.reattach_device.dev.seg = pci_domain_nr(pdev->bus);
>> +    op.reattach_device.dev.bus = pdev->bus->number;
>> +    op.reattach_device.dev.devfn = pdev->devfn;
>> +
>> +    return HYPERVISOR_iommu_op(&op);
>> +}
>> +
>> +static void xen_iommu_free(struct iommu_domain *domain)
>> +{
>> +    int ret;
>> +    struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
>> +
>> +    if (dom->ctx_no != 0) {
> 
> Much like unmap above, this not being true would imply that someone's 
> managed to go round the back of the core code to get the .free op from a 
> validly-allocated domain and then pass something other than that domain 
> to it. Personally I'd consider that a level of brokenness that's not 
> even worth trying to consider at all, but if you want to go as far as 
> determining that you *have* clearly been given something you couldn't 
> have allocated, then trying to kfree() it probably isn't wise either.
> 

Yes

>> +
>> +static int default_domain_attach_dev(struct iommu_domain *domain,
>> +                     struct device *dev)
>> +{
>> +    int ret;
>> +    struct pci_dev *pdev;
>> +    struct pv_iommu_op op = {
>> +        .subop_id = IOMMUOP_reattach_device,
>> +        .flags = 0,
>> +        .ctx_no = 0 /* reattach device back to default context */
>> +    };
>> +
>> +    pdev = to_pci_dev(dev);
>> +
>> +    op.reattach_device.dev.seg = pci_domain_nr(pdev->bus);
>> +    op.reattach_device.dev.bus = pdev->bus->number;
>> +    op.reattach_device.dev.devfn = pdev->devfn;
>> +
>> +    ret = HYPERVISOR_iommu_op(&op);
>> +
>> +    if (ret)
>> +        pr_warn("Unable to release device %p\n", 
>> &op.reattach_device.dev);
>> +
>> +    return ret;
>> +}
>> +
>> +static struct iommu_domain default_domain = {
>> +    .ops = &(const struct iommu_domain_ops){
>> +        .attach_dev = default_domain_attach_dev
>> +    }
>> +};
> 
> Looks like you could make it a static xen_iommu_domain and just use the 
> normal attach callback? Either way please name it something less 
> confusing like xen_iommu_identity_domain - "default" is far too 
> overloaded round here already...
> 

Yes, although in the future this domain could end up being either identity 
or blocking/paging depending on some upper-level configuration. Should we 
have both identity and blocking domains and only set the relevant one in 
iommu_ops, or keep this naming?

>> +static struct iommu_ops xen_iommu_ops = {
>> +    .identity_domain = &default_domain,
>> +    .release_domain = &default_domain,
>> +    .capable = xen_iommu_capable,
>> +    .domain_alloc_paging = xen_iommu_domain_alloc_paging,
>> +    .probe_device = xen_iommu_probe_device,
>> +    .probe_finalize = xen_iommu_probe_finalize,
>> +    .device_group = pci_device_group,
>> +    .get_resv_regions = xen_iommu_get_resv_regions,
>> +    .pgsize_bitmap = XEN_IOMMU_PGSIZES,
>> +    .default_domain_ops = &(const struct iommu_domain_ops) {
>> +        .map_pages = xen_iommu_map_pages,
>> +        .unmap_pages = xen_iommu_unmap_pages,
>> +        .attach_dev = xen_iommu_attach_dev,
>> +        .iova_to_phys = xen_iommu_iova_to_phys,
>> +        .free = xen_iommu_free,
>> +    },
>> +};
>> +
>> +int __init xen_iommu_init(void)
>> +{
>> +    int ret;
>> +    struct pv_iommu_op op = {
>> +        .subop_id = IOMMUOP_query_capabilities
>> +    };
>> +
>> +    if (!xen_domain())
>> +        return -ENODEV;
>> +
>> +    /* Check if iommu_op is supported */
>> +    if (HYPERVISOR_iommu_op(&op) == -ENOSYS)
>> +        return -ENODEV; /* No Xen IOMMU hardware */
>> +
>> +    pr_info("Initialising Xen IOMMU driver\n");
>> +    pr_info("max_nr_pages=%d\n", op.cap.max_nr_pages);
>> +    pr_info("max_ctx_no=%d\n", op.cap.max_ctx_no);
>> +    pr_info("max_iova_addr=%llx\n", op.cap.max_iova_addr);
>> +
>> +    if (op.cap.max_ctx_no == 0) {
>> +        pr_err("Unable to use IOMMU PV driver (no context 
>> available)\n");
>> +        return -ENOTSUPP; /* Unable to use IOMMU PV ? */
>> +    }
>> +
>> +    if (xen_domain_type == XEN_PV_DOMAIN)
>> +        /* TODO: In PV domain, due to the existing pfn-gfn mapping we 
>> need to
>> +         * consider that under certains circonstances, we have :
>> +         *   pfn_to_gfn(x + 1) != pfn_to_gfn(x) + 1
>> +         *
>> +         * In these cases, we would want to separate the subop into 
>> several calls.
>> +         * (only doing the grouped operation when the mapping is 
>> actually contigous)
>> +         * Only map operation would be affected, as unmap actually 
>> uses dfn which
>> +         * doesn't have this kind of mapping.
>> +         *
>> +         * Force single-page operations to work arround this issue 
>> for now.
>> +         */
>> +        max_nr_pages = 1;
>> +    else
>> +        /* With HVM domains, pfn_to_gfn is identity, there is no 
>> issue regarding this. */
>> +        max_nr_pages = op.cap.max_nr_pages;
>> +
>> +    max_iova_addr = op.cap.max_iova_addr;
>> +
>> +    spin_lock_init(&lock);
>> +
>> +    ret = iommu_device_sysfs_add(&xen_iommu_device, NULL, NULL, 
>> "xen-iommu");
>> +    if (ret) {
>> +        pr_err("Unable to add Xen IOMMU sysfs\n");
>> +        return ret;
>> +    }
>> +
>> +    ret = iommu_device_register(&xen_iommu_device, &xen_iommu_ops, 
>> NULL);
>> +    if (ret) {
>> +        pr_err("Unable to register Xen IOMMU device %d\n", ret);
>> +        iommu_device_sysfs_remove(&xen_iommu_device);
>> +        return ret;
>> +    }
>> +
>> +    /* swiotlb is redundant when IOMMU is active. */
>> +    x86_swiotlb_enable = false;
> 
> That's not always true, but either way if this is at 
> module_init/device_initcall time then it's too late to make any 
> difference anyway.
> 

Ok

>> +
>> +    return 0;
>> +}
>> +
>> +void __exit xen_iommu_fini(void)
>> +{
>> +    pr_info("Unregistering Xen IOMMU driver\n");
>> +
>> +    iommu_device_unregister(&xen_iommu_device);
>> +    iommu_device_sysfs_remove(&xen_iommu_device);
>> +}
> 
> This is dead code since the Kconfig is only "bool". Either allow it to 
> be an actual module (and make sure that works), or drop the pretence 
> altogether.
> 

Ok, I thought this function was actually a requirement even when not built 
as a module.

> Thanks,
> Robin.

Teddy


Teddy Astie | Vates XCP-ng Intern

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
Jason Gunthorpe June 24, 2024, 4:32 p.m. UTC | #3
On Mon, Jun 24, 2024 at 02:36:45PM +0000, Teddy Astie wrote:
> >> +bool xen_iommu_capable(struct device *dev, enum iommu_cap cap)
> >> +{
> >> +    switch (cap) {
> >> +    case IOMMU_CAP_CACHE_COHERENCY:
> >> +        return true;
> >
> > Will the PV-IOMMU only ever be exposed on hardware where that really is
> > always true?
> >
> 
> On the hypervisor side, the PV-IOMMU interface always implicitely flush
> the IOMMU hardware on map/unmap operation, so at the end of the
> hypercall, the cache should be always coherent IMO.

Cache coherency is a property of the underlying IOMMU HW and reflects
the ability to prevent generating transactions that would bypass the
cache.

On AMD and Intel IOMMU HW this maps to a bit in their PTEs that must
always be set to claim this capability.

No ARM SMMU supports it yet.

If you imagine supporting ARM someday then this can't be a fixed true.
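
If/when that matters, the answer would have to come from the hypervisor,
e.g. (sketch with a purely hypothetical capability bit - the PV-IOMMU
interface in this patch does not report one today):

static bool xen_iommu_coherent;	/* hypothetical: filled from IOMMUOP_query_capabilities */

static bool xen_iommu_capable(struct device *dev, enum iommu_cap cap)
{
	switch (cap) {
	case IOMMU_CAP_CACHE_COHERENCY:
		return xen_iommu_coherent;
	default:
		return false;
	}
}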

> Unmap failing should be exceptionnal, but is possible e.g with
> transparent superpages (like Xen IOMMU drivers do). Xen drivers folds
> appropriate contiguous mappings into superpages entries to optimize
> memory usage and iotlb. However, if you unmap in the middle of a region
> covered by a superpage entry, this is no longer a valid superpage entry,
> and you need to allocate and fill the lower levels, which is faillible
> if lacking memory.

This doesn't seem necessary. From an IOMMU perspective the contract is
that whatever gets mapped must be wholly unmapped and the unmap cannot
fail.

Failing to unmap causes big problems for iommufd and vfio as it is
about to free to the memory underlying the maps. Nothing good will
happen after this.

An implementation should rely on the core code to provide the
contiguous ranges and not attempt to combine mappings across two
map_pages() calls. If it does this it can refuse to unmap a slice of a
superpage, and thus it never has to allocate memory during unmap.

> While mapping on top of another mapping is ok for us (it's just going to
> override the previous mapping), I definetely agree that having the
> address space messed up is not good.

Technically map_pages should fail if it is already populated, but
nothing should ever do that.

Jason
Easwar Hariharan June 24, 2024, 5:36 p.m. UTC | #4
Hi Jason,

On 6/24/2024 9:32 AM, Jason Gunthorpe wrote:
> On Mon, Jun 24, 2024 at 02:36:45PM +0000, Teddy Astie wrote:
>>>> +bool xen_iommu_capable(struct device *dev, enum iommu_cap cap)
>>>> +{
>>>> +    switch (cap) {
>>>> +    case IOMMU_CAP_CACHE_COHERENCY:
>>>> +        return true;
>>>
>>> Will the PV-IOMMU only ever be exposed on hardware where that really is
>>> always true?
>>>
>>
>> On the hypervisor side, the PV-IOMMU interface always implicitely flush
>> the IOMMU hardware on map/unmap operation, so at the end of the
>> hypercall, the cache should be always coherent IMO.
> 
> Cache coherency is a property of the underlying IOMMU HW and reflects
> the ability to prevent generating transactions that would bypass the
> cache.
> 
> On AMD and Intel IOMMU HW this maps to a bit in their PTEs that must
> always be set to claim this capability.
> 
> No ARM SMMU supports it yet.
> 

Unrelated to this patch: both the arm-smmu and arm-smmu-v3 drivers claim
this capability if the device tree/IORT table has the corresponding flags.

I read through DEN0049 to determine what the knock-on effects are, or
equivalently what the requirements are for setting those flags in the IORT,
but came up empty. Could you help with what I'm missing to resolve the
apparent contradiction between your statement and the code?

Thanks,
Easwar
Jason Gunthorpe June 24, 2024, 5:58 p.m. UTC | #5
On Mon, Jun 24, 2024 at 10:36:13AM -0700, Easwar Hariharan wrote:
> Hi Jason,
> 
> On 6/24/2024 9:32 AM, Jason Gunthorpe wrote:
> > On Mon, Jun 24, 2024 at 02:36:45PM +0000, Teddy Astie wrote:
> >>>> +bool xen_iommu_capable(struct device *dev, enum iommu_cap cap)
> >>>> +{
> >>>> +    switch (cap) {
> >>>> +    case IOMMU_CAP_CACHE_COHERENCY:
> >>>> +        return true;
> >>>
> >>> Will the PV-IOMMU only ever be exposed on hardware where that really is
> >>> always true?
> >>>
> >>
> >> On the hypervisor side, the PV-IOMMU interface always implicitely flush
> >> the IOMMU hardware on map/unmap operation, so at the end of the
> >> hypercall, the cache should be always coherent IMO.
> > 
> > Cache coherency is a property of the underlying IOMMU HW and reflects
> > the ability to prevent generating transactions that would bypass the
> > cache.
> > 
> > On AMD and Intel IOMMU HW this maps to a bit in their PTEs that must
> > always be set to claim this capability.
> > 
> > No ARM SMMU supports it yet.
> > 
> 
> Unrelated to this patch: Both the arm-smmu and arm-smmu-v3 drivers claim
> this capability if the device tree/IORT table have the corresponding flags.

Oh sorry, I misread this as the ENFORCED version, so that isn't quite
right.

ENFORCED is as above

Plain CACHE_COHERENCY just requires HW support and any necessary
enablement in the IOMMU.

AMD and Intel are always enabled

ARM requires HW support and possibly some specific iommu programming.

> I read through DEN0049 to determine what are the knock-on effects, or
> equivalently the requirements to set those flags in the IORT, but came
> up empty. Could you help with what I'm missing to resolve the apparent
> contradiction between your statement and the code?

The flags are set if the underlying HW can do the coherency work, and
Linux has the IOMMU (STE/ioptes) set to work with that.

Map/unmap cache flushing during the hypercall is not a substitute; Linux
still needs to know if streaming DMA requires the flushes.

Jason
Robin Murphy June 24, 2024, 6 p.m. UTC | #6
On 2024-06-24 6:36 pm, Easwar Hariharan wrote:
> Hi Jason,
> 
> On 6/24/2024 9:32 AM, Jason Gunthorpe wrote:
>> On Mon, Jun 24, 2024 at 02:36:45PM +0000, Teddy Astie wrote:
>>>>> +bool xen_iommu_capable(struct device *dev, enum iommu_cap cap)
>>>>> +{
>>>>> +    switch (cap) {
>>>>> +    case IOMMU_CAP_CACHE_COHERENCY:
>>>>> +        return true;
>>>>
>>>> Will the PV-IOMMU only ever be exposed on hardware where that really is
>>>> always true?
>>>>
>>>
>>> On the hypervisor side, the PV-IOMMU interface always implicitely flush
>>> the IOMMU hardware on map/unmap operation, so at the end of the
>>> hypercall, the cache should be always coherent IMO.
>>
>> Cache coherency is a property of the underlying IOMMU HW and reflects
>> the ability to prevent generating transactions that would bypass the
>> cache.
>>
>> On AMD and Intel IOMMU HW this maps to a bit in their PTEs that must
>> always be set to claim this capability.
>>
>> No ARM SMMU supports it yet.
>>
> 
> Unrelated to this patch: Both the arm-smmu and arm-smmu-v3 drivers claim
> this capability if the device tree/IORT table have the corresponding flags.
> 
> I read through DEN0049 to determine what are the knock-on effects, or
> equivalently the requirements to set those flags in the IORT, but came
> up empty. Could you help with what I'm missing to resolve the apparent
> contradiction between your statement and the code?

We did rejig things slightly a while back. The status quo now is that 
IOMMU_CAP_CACHE_COHERENCY mostly covers whether IOMMU mappings can make 
device accesses coherent at all, tied in with the IOMMU_CACHE prot value 
- this is effectively forced for Intel and AMD, while for SMMU we have 
to take a guess, but as commented it's a pretty reasonable assumption 
that if the SMMU's own output for table walks etc. is coherent then its 
translation outputs are likely to be too. The further property of being 
able to then enforce a coherent mapping regardless of what an endpoint 
might try to get around it (PCIe No Snoop etc.) is now under the 
enforce_cache_coherency op - that's what SMMU can't guarantee for now 
due to the IMP-DEF nature of whether S2FWB overrides No Snoop or not.

Thanks,
Robin.
Robin Murphy June 26, 2024, 12:09 p.m. UTC | #7
On 2024-06-24 3:36 pm, Teddy Astie wrote:
> Hello Robin,
> Thanks for the thourough review.
> 
>>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>>> index 0af39bbbe3a3..242cefac77c9 100644
>>> --- a/drivers/iommu/Kconfig
>>> +++ b/drivers/iommu/Kconfig
>>> @@ -480,6 +480,15 @@ config VIRTIO_IOMMU
>>>          Say Y here if you intend to run this kernel as a guest.
>>> +config XEN_IOMMU
>>> +    bool "Xen IOMMU driver"
>>> +    depends on XEN_DOM0
>>
>> Clearly this depends on X86 as well.
>>
> Well, I don't intend this driver to be X86-only, even though the current
> Xen RFC doesn't support ARM (yet). Unless there is a counter-indication
> for it ?

It's purely practical - even if you drop the asm/iommu.h stuff it would 
still break ARM DOM0 builds due to HYPERVISOR_iommu_op() only being 
defined for x86. And it's better to add a dependency here to make it 
clear what's *currently* supported, than to add dummy code to allow it 
to build for ARM if that's not actually tested or usable yet.

>>> +bool xen_iommu_capable(struct device *dev, enum iommu_cap cap)
>>> +{
>>> +    switch (cap) {
>>> +    case IOMMU_CAP_CACHE_COHERENCY:
>>> +        return true;
>>
>> Will the PV-IOMMU only ever be exposed on hardware where that really is
>> always true?
>>
> 
> On the hypervisor side, the PV-IOMMU interface always implicitely flush
> the IOMMU hardware on map/unmap operation, so at the end of the
> hypercall, the cache should be always coherent IMO.

As Jason already brought up, this is not about TLBs or anything cached 
by the IOMMU itself, it's about the memory type(s) it can create 
mappings with. Returning true here says Xen guarantees it can use a 
cacheable memory type which will let DMA snoop the CPU caches. 
Furthermore, not explicitly handling IOMMU_CACHE in the map_pages op 
then also implies that it will *always* do that, so you couldn't 
actually get an uncached mapping even if you wanted one.

>>> +    while (xen_pg_count) {
>>> +        size_t to_unmap = min(xen_pg_count, max_nr_pages);
>>> +
>>> +        //pr_info("Unmapping %lx-%lx\n", dfn, dfn + to_unmap - 1);
>>> +
>>> +        op.unmap_pages.dfn = dfn;
>>> +        op.unmap_pages.nr_pages = to_unmap;
>>> +
>>> +        ret = HYPERVISOR_iommu_op(&op);
>>> +
>>> +        if (ret)
>>> +            pr_warn("Unmap failure (%lx-%lx)\n", dfn, dfn + to_unmap
>>> - 1);
>>
>> But then how would it ever happen anyway? Unmap is a domain op, so a
>> domain which doesn't allow unmapping shouldn't offer it in the first
>> place...
> 
> Unmap failing should be exceptionnal, but is possible e.g with
> transparent superpages (like Xen IOMMU drivers do). Xen drivers folds
> appropriate contiguous mappings into superpages entries to optimize
> memory usage and iotlb. However, if you unmap in the middle of a region
> covered by a superpage entry, this is no longer a valid superpage entry,
> and you need to allocate and fill the lower levels, which is faillible
> if lacking memory.

OK, so in the worst case you could potentially have a partial unmap 
failure if the range crosses a superpage boundary and the end part 
happens to have been folded, and Xen doesn't detect and prepare that 
allocation until it's already unmapped up to the boundary. If that is 
so, does the hypercall interface give any information about partial 
failure, or can any error only be taken to mean that some or all of the 
given range may or may not have be unmapped now?
>> In this case I'd argue that you really *do* want to return short, in the
>> hope of propagating the error back up and letting the caller know the
>> address space is now messed up before things start blowing up even more
>> if they keep going and subsequently try to map new pages into
>> not-actually-unmapped VAs.
> 
> While mapping on top of another mapping is ok for us (it's just going to
> override the previous mapping), I definetely agree that having the
> address space messed up is not good.

Oh, indeed, quietly replacing existing PTEs might help paper over errors 
in this particular instance, but it does then allow *other* cases to go 
wrong in fun and infuriating ways :)

>>> +static struct iommu_domain default_domain = {
>>> +    .ops = &(const struct iommu_domain_ops){
>>> +        .attach_dev = default_domain_attach_dev
>>> +    }
>>> +};
>>
>> Looks like you could make it a static xen_iommu_domain and just use the
>> normal attach callback? Either way please name it something less
>> confusing like xen_iommu_identity_domain - "default" is far too
>> overloaded round here already...
>>
> 
> Yes, although, if in the future, we can have either this domain as
> identity or blocking/paging depending on some upper level configuration.
> Should we have both identity and blocking domains, and only setting the
> relevant one in iommu_ops, or keep this naming.

That's something that can be considered if and when it does happen. For 
now, if it's going to be pre-mapped as an identity domain, then let's 
just treat it as such and keep things straightforward.

>>> +void __exit xen_iommu_fini(void)
>>> +{
>>> +    pr_info("Unregistering Xen IOMMU driver\n");
>>> +
>>> +    iommu_device_unregister(&xen_iommu_device);
>>> +    iommu_device_sysfs_remove(&xen_iommu_device);
>>> +}
>>
>> This is dead code since the Kconfig is only "bool". Either allow it to
>> be an actual module (and make sure that works), or drop the pretence
>> altogether.
>>
> 
> Ok, I though this function was actually a requirement even if it is not
> a module.

No, quite the opposite - even code which is modular doesn't have to 
support removal if it doesn't want to.

Thanks,
Robin.
Teddy Astie June 26, 2024, 1:40 p.m. UTC | #8
Hello Robin,

Le 26/06/2024 à 14:09, Robin Murphy a écrit :
> On 2024-06-24 3:36 pm, Teddy Astie wrote:
>> Hello Robin,
>> Thanks for the thourough review.
>>
>>>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>>>> index 0af39bbbe3a3..242cefac77c9 100644
>>>> --- a/drivers/iommu/Kconfig
>>>> +++ b/drivers/iommu/Kconfig
>>>> @@ -480,6 +480,15 @@ config VIRTIO_IOMMU
>>>>          Say Y here if you intend to run this kernel as a guest.
>>>> +config XEN_IOMMU
>>>> +    bool "Xen IOMMU driver"
>>>> +    depends on XEN_DOM0
>>>
>>> Clearly this depends on X86 as well.
>>>
>> Well, I don't intend this driver to be X86-only, even though the current
>> Xen RFC doesn't support ARM (yet). Unless there is a counter-indication
>> for it ?
> 
> It's purely practical - even if you drop the asm/iommu.h stuff it would 
> still break ARM DOM0 builds due to HYPERVISOR_iommu_op() only being 
> defined for x86. And it's better to add a dependency here to make it 
> clear what's *currently* supported, than to add dummy code to allow it 
> to build for ARM if that's not actually tested or usable yet.
> 

OK, it does exist on the hypervisor side (even though such a Xen build is 
not possible yet); I suppose I just need to add the relevant hypercall 
interfaces for arm/arm64 in hypercall{.S,.h}.

>>>> +bool xen_iommu_capable(struct device *dev, enum iommu_cap cap)
>>>> +{
>>>> +    switch (cap) {
>>>> +    case IOMMU_CAP_CACHE_COHERENCY:
>>>> +        return true;
>>>
>>> Will the PV-IOMMU only ever be exposed on hardware where that really is
>>> always true?
>>>
>>
>> On the hypervisor side, the PV-IOMMU interface always implicitely flush
>> the IOMMU hardware on map/unmap operation, so at the end of the
>> hypercall, the cache should be always coherent IMO.
> 
> As Jason already brought up, this is not about TLBs or anything cached 
> by the IOMMU itself, it's about the memory type(s) it can create 
> mappings with. Returning true here says Xen guarantees it can use a 
> cacheable memory type which will let DMA snoop the CPU caches. 
> Furthermore, not explicitly handling IOMMU_CACHE in the map_pages op 
> then also implies that it will *always* do that, so you couldn't 
> actually get an uncached mapping even if you wanted one.
> 

Yes, this is a point we are currently discussing on the Xen side.

>>>> +    while (xen_pg_count) {
>>>> +        size_t to_unmap = min(xen_pg_count, max_nr_pages);
>>>> +
>>>> +        //pr_info("Unmapping %lx-%lx\n", dfn, dfn + to_unmap - 1);
>>>> +
>>>> +        op.unmap_pages.dfn = dfn;
>>>> +        op.unmap_pages.nr_pages = to_unmap;
>>>> +
>>>> +        ret = HYPERVISOR_iommu_op(&op);
>>>> +
>>>> +        if (ret)
>>>> +            pr_warn("Unmap failure (%lx-%lx)\n", dfn, dfn + to_unmap
>>>> - 1);
>>>
>>> But then how would it ever happen anyway? Unmap is a domain op, so a
>>> domain which doesn't allow unmapping shouldn't offer it in the first
>>> place...
>>
>> Unmap failing should be exceptionnal, but is possible e.g with
>> transparent superpages (like Xen IOMMU drivers do). Xen drivers folds
>> appropriate contiguous mappings into superpages entries to optimize
>> memory usage and iotlb. However, if you unmap in the middle of a region
>> covered by a superpage entry, this is no longer a valid superpage entry,
>> and you need to allocate and fill the lower levels, which is faillible
>> if lacking memory.
> 
> OK, so in the worst case you could potentially have a partial unmap 
> failure if the range crosses a superpage boundary and the end part 
> happens to have been folded, and Xen doesn't detect and prepare that 
> allocation until it's already unmapped up to the boundary. If that is 
> so, does the hypercall interface give any information about partial 
> failure, or can any error only be taken to mean that some or all of the 
> given range may or may not have be unmapped now?

The hypercall interface has op.unmap_page.unmapped to indicate the number 
of pages actually unmapped, even in case of error. If it is less than 
initially requested, the driver retries with the remaining part, which will 
almost certainly fail again. However, I have the impression that this 
failure ends up being silent.
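
So on the Linux side the loop could propagate that count instead of only 
warning, e.g. (sketch, assuming the count ends up exposed to the driver as 
op.unmap_pages.unmapped):

	size_t unmapped = 0;
	int ret;

	while (unmapped < pgcount) {
		op.unmap_pages.dfn = dfn + unmapped;
		op.unmap_pages.nr_pages = min_t(size_t, pgcount - unmapped,
						max_nr_pages);

		ret = HYPERVISOR_iommu_op(&op);

		/* count whatever Xen actually managed, even on error */
		unmapped += op.unmap_pages.unmapped;

		if (ret)
			break;
	}

	/* A short return tells the caller the range is not fully unmapped. */
	return unmapped * pgsize;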

>>> In this case I'd argue that you really *do* want to return short, in the
>>> hope of propagating the error back up and letting the caller know the
>>> address space is now messed up before things start blowing up even more
>>> if they keep going and subsequently try to map new pages into
>>> not-actually-unmapped VAs.
>>
>> While mapping on top of another mapping is ok for us (it's just going to
>> override the previous mapping), I definetely agree that having the
>> address space messed up is not good.
> 
> Oh, indeed, quietly replacing existing PTEs might help paper over errors 
> in this particular instance, but it does then allow *other* cases to go 
> wrong in fun and infuriating ways :)
> 

Yes, but I was wrong in my earlier stance. Our hypercall interface doesn't 
allow mapping on top of an existing mapping (it fails with -EADDRINUSE), so 
a failed unmap is still going to cause bad issues.

Should iommu_unmap() and related code consider the case where unmapped != 
size and report it appropriately, so that such cases, if they ever happen, 
are reported loudly rather than failing silently?

>>>> +static struct iommu_domain default_domain = {
>>>> +    .ops = &(const struct iommu_domain_ops){
>>>> +        .attach_dev = default_domain_attach_dev
>>>> +    }
>>>> +};
>>>
>>> Looks like you could make it a static xen_iommu_domain and just use the
>>> normal attach callback? Either way please name it something less
>>> confusing like xen_iommu_identity_domain - "default" is far too
>>> overloaded round here already...
>>>
>>
>> Yes, although, if in the future, we can have either this domain as
>> identity or blocking/paging depending on some upper level configuration.
>> Should we have both identity and blocking domains, and only setting the
>> relevant one in iommu_ops, or keep this naming.
> 
> That's something that can be considered if and when it does happen. For 
> now, if it's going to be pre-mapped as an identity domain, then let's 
> just treat it as such and keep things straightforward.
> 

Let's name it xen_iommu_identity_domain.

>>>> +void __exit xen_iommu_fini(void)
>>>> +{
>>>> +    pr_info("Unregistering Xen IOMMU driver\n");
>>>> +
>>>> +    iommu_device_unregister(&xen_iommu_device);
>>>> +    iommu_device_sysfs_remove(&xen_iommu_device);
>>>> +}
>>>
>>> This is dead code since the Kconfig is only "bool". Either allow it to
>>> be an actual module (and make sure that works), or drop the pretence
>>> altogether.
>>>
>>
>> Ok, I though this function was actually a requirement even if it is not
>> a module.
> 
> No, quite the opposite - even code which is modular doesn't have to 
> support removal if it doesn't want to.
> 

Ok

> Thanks,
> Robin.

Teddy


Teddy Astie | Vates XCP-ng Intern

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
diff mbox series

Patch

diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index a2dd24947eb8..6b1857f27c14 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -490,6 +490,12 @@  HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
 	return _hypercall2(int, xenpmu_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_iommu_op(void *arg)
+{
+	return _hypercall1(int, iommu_op, arg);
+}
+
 static inline int
 HYPERVISOR_dm_op(
 	domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs)
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 0af39bbbe3a3..242cefac77c9 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -480,6 +480,15 @@  config VIRTIO_IOMMU
 
 	  Say Y here if you intend to run this kernel as a guest.
 
+config XEN_IOMMU
+	bool "Xen IOMMU driver"
+	depends on XEN_DOM0
+	select IOMMU_API
+	help
+		Xen PV-IOMMU driver for Dom0.
+
+		Say Y here if you intend to run this guest as Xen Dom0.
+
 config SPRD_IOMMU
 	tristate "Unisoc IOMMU Support"
 	depends on ARCH_SPRD || COMPILE_TEST
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 542760d963ec..393afe22c901 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -30,3 +30,4 @@  obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
 obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o
 obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o
 obj-$(CONFIG_APPLE_DART) += apple-dart.o
+obj-$(CONFIG_XEN_IOMMU) += xen-iommu.o
\ No newline at end of file
diff --git a/drivers/iommu/xen-iommu.c b/drivers/iommu/xen-iommu.c
new file mode 100644
index 000000000000..b765445d27cd
--- /dev/null
+++ b/drivers/iommu/xen-iommu.c
@@ -0,0 +1,489 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Xen PV-IOMMU driver.
+ *
+ * Copyright (C) 2024 Vates SAS
+ *
+ * Author: Teddy Astie <teddy.astie@vates.tech>
+ *
+ */
+
+#define pr_fmt(fmt)	"xen-iommu: " fmt
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/iommu.h>
+#include <linux/dma-map-ops.h>
+#include <linux/pci.h>
+#include <linux/list.h>
+#include <linux/string.h>
+#include <linux/device/driver.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/printk.h>
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/minmax.h>
+#include <linux/string.h>
+#include <asm/iommu.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/pv-iommu.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+MODULE_DESCRIPTION("Xen IOMMU driver");
+MODULE_AUTHOR("Teddy Astie <teddy.astie@vates.tech>");
+MODULE_LICENSE("GPL");
+
+#define MSI_RANGE_START		(0xfee00000)
+#define MSI_RANGE_END		(0xfeefffff)
+
+#define XEN_IOMMU_PGSIZES       (0x1000)
+
+struct xen_iommu_domain {
+	struct iommu_domain domain;
+
+	u16 ctx_no; /* Xen PV-IOMMU context number */
+};
+
+static struct iommu_device xen_iommu_device;
+
+static uint32_t max_nr_pages;
+static uint64_t max_iova_addr;
+
+static spinlock_t lock;
+
+static inline struct xen_iommu_domain *to_xen_iommu_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct xen_iommu_domain, domain);
+}
+
+static inline u64 addr_to_pfn(u64 addr)
+{
+	return addr >> 12;
+}
+
+static inline u64 pfn_to_addr(u64 pfn)
+{
+	return pfn << 12;
+}
+
+bool xen_iommu_capable(struct device *dev, enum iommu_cap cap)
+{
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+		return true;
+
+	default:
+		return false;
+	}
+}
+
+struct iommu_domain *xen_iommu_domain_alloc_paging(struct device *dev)
+{
+	struct xen_iommu_domain *domain;
+	int ret;
+
+	struct pv_iommu_op op = {
+		.ctx_no = 0,
+		.flags = 0,
+		.subop_id = IOMMUOP_alloc_context
+	};
+
+	ret = HYPERVISOR_iommu_op(&op);
+
+	if (ret) {
+		pr_err("Unable to create Xen IOMMU context (%d)", ret);
+		return ERR_PTR(ret);
+	}
+
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+
+	domain->ctx_no = op.ctx_no;
+
+	domain->domain.geometry.aperture_start = 0;
+	domain->domain.geometry.aperture_end = max_iova_addr;
+	domain->domain.geometry.force_aperture = true;
+
+	return &domain->domain;
+}
+
+static struct iommu_device *xen_iommu_probe_device(struct device *dev)
+{
+	if (!dev_is_pci(dev))
+		return ERR_PTR(-ENODEV);
+
+	return &xen_iommu_device;
+}
+
+static void xen_iommu_probe_finalize(struct device *dev)
+{
+	set_dma_ops(dev, NULL);
+	iommu_setup_dma_ops(dev, 0, max_iova_addr);
+}
+
+static int xen_iommu_map_pages(struct iommu_domain *domain, unsigned long iova,
+			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
+			       int prot, gfp_t gfp, size_t *mapped)
+{
+	size_t xen_pg_count = (pgsize / XEN_PAGE_SIZE) * pgcount;
+	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+	struct pv_iommu_op op = {
+		.subop_id = IOMMUOP_map_pages,
+		.flags = 0,
+		.ctx_no = dom->ctx_no
+	};
+	/* NOTE: paddr is actually bound to pfn, not gfn */
+	uint64_t pfn = addr_to_pfn(paddr);
+	uint64_t dfn = addr_to_pfn(iova);
+	int ret = 0;
+
+	//pr_info("Mapping to %lx %zu %zu paddr %x\n", iova, pgsize, pgcount, paddr);
+
+	if (prot & IOMMU_READ)
+		op.flags |= IOMMU_OP_readable;
+
+	if (prot & IOMMU_WRITE)
+		op.flags |= IOMMU_OP_writeable;
+
+	while (xen_pg_count) {
+		size_t to_map = min(xen_pg_count, max_nr_pages);
+		uint64_t gfn = pfn_to_gfn(pfn);
+
+		//pr_info("Mapping %lx-%lx at %lx-%lx\n", gfn, gfn + to_map - 1, dfn, dfn + to_map - 1);
+
+		op.map_pages.gfn = gfn;
+		op.map_pages.dfn = dfn;
+
+		op.map_pages.nr_pages = to_map;
+
+		ret = HYPERVISOR_iommu_op(&op);
+
+		//pr_info("map_pages.mapped = %u\n", op.map_pages.mapped);
+
+		if (mapped)
+			*mapped += XEN_PAGE_SIZE * op.map_pages.mapped;
+
+		if (ret)
+			break;
+
+		xen_pg_count -= to_map;
+
+		pfn += to_map;
+		dfn += to_map;
+	}
+
+	return ret;
+}
+
+static size_t xen_iommu_unmap_pages(struct iommu_domain *domain, unsigned long iova,
+				    size_t pgsize, size_t pgcount,
+				    struct iommu_iotlb_gather *iotlb_gather)
+{
+	size_t xen_pg_count = (pgsize / XEN_PAGE_SIZE) * pgcount;
+	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+	struct pv_iommu_op op = {
+		.subop_id = IOMMUOP_unmap_pages,
+		.ctx_no = dom->ctx_no,
+		.flags = 0,
+	};
+	uint64_t dfn = addr_to_pfn(iova);
+	int ret = 0;
+
+	if (WARN(!dom->ctx_no, "Tried to unmap page in default context"))
+		return -EINVAL;
+
+	while (xen_pg_count) {
+		size_t to_unmap = min(xen_pg_count, max_nr_pages);
+
+		//pr_info("Unmapping %lx-%lx\n", dfn, dfn + to_unmap - 1);
+
+		op.unmap_pages.dfn = dfn;
+		op.unmap_pages.nr_pages = to_unmap;
+
+		ret = HYPERVISOR_iommu_op(&op);
+
+		if (ret)
+			pr_warn("Unmap failure (%lx-%lx)\n", dfn, dfn + to_unmap - 1);
+
+		xen_pg_count -= to_unmap;
+
+		dfn += to_unmap;
+	}
+
+	return pgcount * pgsize;
+}
+
+int xen_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+	struct pci_dev *pdev;
+	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+	struct pv_iommu_op op = {
+		.subop_id = IOMMUOP_reattach_device,
+		.flags = 0,
+		.ctx_no = dom->ctx_no,
+	};
+
+	pdev = to_pci_dev(dev);
+
+	op.reattach_device.dev.seg = pci_domain_nr(pdev->bus);
+	op.reattach_device.dev.bus = pdev->bus->number;
+	op.reattach_device.dev.devfn = pdev->devfn;
+
+	return HYPERVISOR_iommu_op(&op);
+}
+
+static void xen_iommu_free(struct iommu_domain *domain)
+{
+	int ret;
+	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+
+	if (dom->ctx_no != 0) {
+		struct pv_iommu_op op = {
+			.ctx_no = dom->ctx_no,
+			.flags = 0,
+			.subop_id = IOMMUOP_free_context
+		};
+
+		ret = HYPERVISOR_iommu_op(&op);
+
+		if (ret)
+			pr_err("Context %hu destruction failure\n", dom->ctx_no);
+	}
+
+	kfree(domain);
+}
+
+static phys_addr_t xen_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
+{
+	int ret;
+	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+
+	struct pv_iommu_op op = {
+		.ctx_no = dom->ctx_no,
+		.flags = 0,
+		.subop_id = IOMMUOP_lookup_page,
+	};
+
+	op.lookup_page.dfn = addr_to_pfn(iova);
+
+	ret = HYPERVISOR_iommu_op(&op);
+
+	if (ret)
+		return 0;
+
+	phys_addr_t page_addr = pfn_to_addr(gfn_to_pfn(op.lookup_page.gfn));
+
+	/* Consider non-aligned iova */
+	return page_addr + (iova & 0xFFF);
+}
+
+static void xen_iommu_get_resv_regions(struct device *dev, struct list_head *head)
+{
+	struct iommu_resv_region *reg;
+	struct xen_reserved_device_memory *entries;
+	struct xen_reserved_device_memory_map map;
+	struct pci_dev *pdev;
+	int ret, i;
+
+	pdev = to_pci_dev(dev);
+
+	reg = iommu_alloc_resv_region(MSI_RANGE_START,
+				      MSI_RANGE_END - MSI_RANGE_START + 1,
+				      0, IOMMU_RESV_MSI, GFP_KERNEL);
+
+	if (!reg)
+		return;
+
+	list_add_tail(&reg->list, head);
+
+	/* Map xen-specific entries */
+
+	/* First, get number of entries to map */
+	map.buffer = NULL;
+	map.nr_entries = 0;
+	map.flags = 0;
+
+	map.dev.pci.seg = pci_domain_nr(pdev->bus);
+	map.dev.pci.bus = pdev->bus->number;
+	map.dev.pci.devfn = pdev->devfn;
+
+	ret = HYPERVISOR_memory_op(XENMEM_reserved_device_memory_map, &map);
+
+	if (ret == 0)
+		/* No reserved region, nothing to do */
+		return;
+
+	if (ret != -ENOBUFS) {
+		pr_err("Unable to get reserved region count (%d)\n", ret);
+		return;
+	}
+
+	/* Assume a reasonable number of entries, otherwise, something is probably wrong */
+	if (WARN_ON(map.nr_entries > 256))
+		pr_warn("Xen reporting many reserved regions (%u)\n", map.nr_entries);
+
+	/* And finally get actual mappings */
+	entries = kcalloc(map.nr_entries, sizeof(struct xen_reserved_device_memory),
+					  GFP_KERNEL);
+
+	if (!entries) {
+		pr_err("No memory for map entries\n");
+		return;
+	}
+
+	map.buffer = entries;
+
+	ret = HYPERVISOR_memory_op(XENMEM_reserved_device_memory_map, &map);
+
+	if (ret != 0) {
+		pr_err("Unable to get reserved regions (%d)\n", ret);
+		kfree(entries);
+		return;
+	}
+
+	for (i = 0; i < map.nr_entries; i++) {
+		struct xen_reserved_device_memory entry = entries[i];
+
+		reg = iommu_alloc_resv_region(pfn_to_addr(entry.start_pfn),
+					      pfn_to_addr(entry.nr_pages),
+					      0, IOMMU_RESV_RESERVED, GFP_KERNEL);
+
+		if (!reg)
+			break;
+
+		list_add_tail(&reg->list, head);
+	}
+
+	kfree(entries);
+}
+
+static int default_domain_attach_dev(struct iommu_domain *domain,
+				     struct device *dev)
+{
+	int ret;
+	struct pci_dev *pdev;
+	struct pv_iommu_op op = {
+		.subop_id = IOMMUOP_reattach_device,
+		.flags = 0,
+		.ctx_no = 0 /* reattach device back to default context */
+	};
+
+	pdev = to_pci_dev(dev);
+
+	op.reattach_device.dev.seg = pci_domain_nr(pdev->bus);
+	op.reattach_device.dev.bus = pdev->bus->number;
+	op.reattach_device.dev.devfn = pdev->devfn;
+
+	ret = HYPERVISOR_iommu_op(&op);
+
+	if (ret)
+		pr_warn("Unable to release device %p\n", &op.reattach_device.dev);
+
+	return ret;
+}
+
+static struct iommu_domain default_domain = {
+	.ops = &(const struct iommu_domain_ops){
+		.attach_dev = default_domain_attach_dev
+	}
+};
+
+static struct iommu_ops xen_iommu_ops = {
+	.identity_domain = &default_domain,
+	.release_domain = &default_domain,
+	.capable = xen_iommu_capable,
+	.domain_alloc_paging = xen_iommu_domain_alloc_paging,
+	.probe_device = xen_iommu_probe_device,
+	.probe_finalize = xen_iommu_probe_finalize,
+	.device_group = pci_device_group,
+	.get_resv_regions = xen_iommu_get_resv_regions,
+	.pgsize_bitmap = XEN_IOMMU_PGSIZES,
+	.default_domain_ops = &(const struct iommu_domain_ops) {
+		.map_pages = xen_iommu_map_pages,
+		.unmap_pages = xen_iommu_unmap_pages,
+		.attach_dev = xen_iommu_attach_dev,
+		.iova_to_phys = xen_iommu_iova_to_phys,
+		.free = xen_iommu_free,
+	},
+};
+
+int __init xen_iommu_init(void)
+{
+	int ret;
+	struct pv_iommu_op op = {
+		.subop_id = IOMMUOP_query_capabilities
+	};
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	/* Check if iommu_op is supported */
+	if (HYPERVISOR_iommu_op(&op) == -ENOSYS)
+		return -ENODEV; /* No Xen IOMMU hardware */
+
+	pr_info("Initialising Xen IOMMU driver\n");
+	pr_info("max_nr_pages=%d\n", op.cap.max_nr_pages);
+	pr_info("max_ctx_no=%d\n", op.cap.max_ctx_no);
+	pr_info("max_iova_addr=%llx\n", op.cap.max_iova_addr);
+
+	if (op.cap.max_ctx_no == 0) {
+		pr_err("Unable to use IOMMU PV driver (no context available)\n");
+		return -ENOTSUPP; /* Unable to use IOMMU PV ? */
+	}
+
+	if (xen_domain_type == XEN_PV_DOMAIN)
+		/* TODO: In a PV domain, due to the existing pfn-gfn mapping, we need to
+		 * consider that under certain circumstances we have:
+		 *   pfn_to_gfn(x + 1) != pfn_to_gfn(x) + 1
+		 *
+		 * In these cases, we would want to split the subop into several calls
+		 * (only doing the grouped operation when the mapping is actually contiguous).
+		 * Only the map operation is affected, as unmap uses dfn, which doesn't
+		 * have this kind of mapping.
+		 *
+		 * Force single-page operations to work around this issue for now.
+		 */
+		max_nr_pages = 1;
+	else
+		/* With HVM domains, pfn_to_gfn is identity, there is no issue regarding this. */
+		max_nr_pages = op.cap.max_nr_pages;
+
+	max_iova_addr = op.cap.max_iova_addr;
+
+	spin_lock_init(&lock);
+
+	ret = iommu_device_sysfs_add(&xen_iommu_device, NULL, NULL, "xen-iommu");
+	if (ret) {
+		pr_err("Unable to add Xen IOMMU sysfs\n");
+		return ret;
+	}
+
+	ret = iommu_device_register(&xen_iommu_device, &xen_iommu_ops, NULL);
+	if (ret) {
+		pr_err("Unable to register Xen IOMMU device %d\n", ret);
+		iommu_device_sysfs_remove(&xen_iommu_device);
+		return ret;
+	}
+
+	/* swiotlb is redundant when IOMMU is active. */
+	x86_swiotlb_enable = false;
+
+	return 0;
+}
+
+void __exit xen_iommu_fini(void)
+{
+	pr_info("Unregistering Xen IOMMU driver\n");
+
+	iommu_device_unregister(&xen_iommu_device);
+	iommu_device_sysfs_remove(&xen_iommu_device);
+}
+
+module_init(xen_iommu_init);
+module_exit(xen_iommu_fini);
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index 1a371a825c55..c860acaf4b0e 100644
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -10,6 +10,7 @@ 
 #ifndef __XEN_PUBLIC_MEMORY_H__
 #define __XEN_PUBLIC_MEMORY_H__
 
+#include <xen/interface/physdev.h>
 #include <linux/spinlock.h>
 
 /*
@@ -214,6 +215,38 @@  struct xen_add_to_physmap_range {
 };
 DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap_range);
 
+/*
+ * With some legacy devices, certain guest-physical addresses cannot safely
+ * be used for other purposes, e.g. to map guest RAM.  This hypercall
+ * enumerates those regions so the toolstack can avoid using them.
+ */
+#define XENMEM_reserved_device_memory_map   27
+struct xen_reserved_device_memory {
+    xen_pfn_t start_pfn;
+    xen_ulong_t nr_pages;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory);
+
+struct xen_reserved_device_memory_map {
+#define XENMEM_RDM_ALL 1 /* Request all regions (ignore dev union). */
+    /* IN */
+    uint32_t flags;
+    /*
+     * IN/OUT
+     *
+     * Gets set to the required number of entries when too low,
+     * signaled by error code -ERANGE.
+     */
+    unsigned int nr_entries;
+    /* OUT */
+    GUEST_HANDLE(xen_reserved_device_memory) buffer;
+    /* IN */
+    union {
+        struct physdev_pci_device pci;
+    } dev;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory_map);
+
 /*
  * Returns the pseudo-physical memory map as it was when the domain
  * was started (specified by XENMEM_set_memory_map).
diff --git a/include/xen/interface/pv-iommu.h b/include/xen/interface/pv-iommu.h
new file mode 100644
index 000000000000..8a8d366e5f4c
--- /dev/null
+++ b/include/xen/interface/pv-iommu.h
@@ -0,0 +1,104 @@ 
+/* SPDX-License-Identifier: MIT */
+/******************************************************************************
+ * pv-iommu.h
+ *
+ * Paravirtualized IOMMU driver interface.
+ *
+ * Copyright (c) 2024 Teddy Astie <teddy.astie@vates.tech>
+ */
+
+#ifndef __XEN_PUBLIC_PV_IOMMU_H__
+#define __XEN_PUBLIC_PV_IOMMU_H__
+
+#include "xen.h"
+#include "physdev.h"
+
+#define IOMMU_DEFAULT_CONTEXT (0)
+
+/**
+ * Query PV-IOMMU capabilities for this domain.
+ */
+#define IOMMUOP_query_capabilities    1
+
+/**
+ * Allocate an IOMMU context, the new context handle will be written to ctx_no.
+ */
+#define IOMMUOP_alloc_context         2
+
+/**
+ * Destroy a IOMMU context.
+ * All devices attached to this context are reattached to default context.
+ *
+ * The default context can't be destroyed (0).
+ */
+#define IOMMUOP_free_context          3
+
+/**
+ * Reattach the device to IOMMU context.
+ */
+#define IOMMUOP_reattach_device       4
+
+#define IOMMUOP_map_pages             5
+#define IOMMUOP_unmap_pages           6
+
+/**
+ * Get the GFN associated to a specific DFN.
+ */
+#define IOMMUOP_lookup_page           7
+
+struct pv_iommu_op {
+    uint16_t subop_id;
+    uint16_t ctx_no;
+
+/**
+ * Create a context that is cloned from default.
+ * The new context will be populated with 1:1 mappings covering the entire guest memory.
+ */
+#define IOMMU_CREATE_clone (1 << 0)
+
+#define IOMMU_OP_readable (1 << 0)
+#define IOMMU_OP_writeable (1 << 1)
+    uint32_t flags;
+
+    union {
+        struct {
+            uint64_t gfn;
+            uint64_t dfn;
+            /* Number of pages to map */
+            uint32_t nr_pages;
+            /* Number of pages actually mapped after sub-op */
+            uint32_t mapped;
+        } map_pages;
+
+        struct {
+            uint64_t dfn;
+            /* Number of pages to unmap */
+            uint32_t nr_pages;
+            /* Number of pages actually unmapped after sub-op */
+            uint32_t unmapped;
+        } unmap_pages;
+
+        struct {
+            struct physdev_pci_device dev;
+        } reattach_device;
+
+        struct {
+            uint64_t gfn;
+            uint64_t dfn;
+        } lookup_page;
+
+        struct {
+            /* Maximum number of IOMMU context this domain can use. */
+            uint16_t max_ctx_no;
+            /* Maximum number of pages that can be modified in a single map/unmap operation. */
+            uint32_t max_nr_pages;
+            /* Maximum device address (iova) that the guest can use for mappings. */
+            uint64_t max_iova_addr;
+        } cap;
+    };
+};
+
+typedef struct pv_iommu_op pv_iommu_op_t;
+DEFINE_GUEST_HANDLE_STRUCT(pv_iommu_op_t);
+
+#endif
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 0ca23eca2a9c..8b1daf3fecc6 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -65,6 +65,7 @@ 
 #define __HYPERVISOR_xc_reserved_op       39 /* reserved for XenClient */
 #define __HYPERVISOR_xenpmu_op            40
 #define __HYPERVISOR_dm_op                41
+#define __HYPERVISOR_iommu_op             43
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48