diff mbox

[RFC,v1,2/7] iommu/arm: ipmmu-vmsa: Add Xen changes for main driver

Message ID 1501081804-4882-3-git-send-email-olekstysh@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Oleksandr Tyshchenko July 26, 2017, 3:09 p.m. UTC
From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>

Modify the Linux IPMMU driver to be functional inside Xen.
All devices within a single Xen domain must use the same
IOMMU context no matter what IOMMU domains they are attached to.
This is the main difference between drivers in Linux
and Xen. Having 8 separate contexts allows us to pass through
devices to 8 guest domains at the same time.

Also wrap the following code in #if 0:
- All DMA related stuff
- Linux PM callbacks
- Driver remove callback
- iommu_group management

Maybe, it would be more correct to move different Linux2Xen wrappers,
define-s, helpers from IPMMU-VMSA and SMMU to some common file
before introducing IPMMU-VMSA patch series. And this common file
might be reused by possible future IOMMUs on ARM.

Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
CC: Julien Grall <julien.grall@arm.com>
CC: Stefano Stabellini <sstabellini@kernel.org>
---
 xen/drivers/passthrough/arm/ipmmu-vmsa.c | 984 +++++++++++++++++++++++++++++--
 1 file changed, 948 insertions(+), 36 deletions(-)

Comments

Julien Grall Aug. 8, 2017, 11:34 a.m. UTC | #1
Hi,

On 26/07/17 16:09, Oleksandr Tyshchenko wrote:
> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>
> Modify the Linux IPMMU driver to be functional inside Xen.
> All devices within a single Xen domain must use the same
> IOMMU context no matter what IOMMU domains they are attached to.
> This is the main difference between drivers in Linux
> and Xen. Having 8 separate contexts allow us to passthrough
> devices to 8 guest domain at the same time.
>
> Also wrap following code in #if 0:
> - All DMA related stuff
> - Linux PM callbacks
> - Driver remove callback
> - iommu_group management
>
> Maybe, it would be more correct to move different Linux2Xen wrappers,
> define-s, helpers from IPMMU-VMSA and SMMU to some common file
> before introducing IPMMU-VMSA patch series. And this common file
> might be reused by possible future IOMMUs on ARM.

Yes please if we go forward with the Linux way.

>
> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> CC: Julien Grall <julien.grall@arm.com>
> CC: Stefano Stabellini <sstabellini@kernel.org>
> ---
>  xen/drivers/passthrough/arm/ipmmu-vmsa.c | 984 +++++++++++++++++++++++++++++--
>  1 file changed, 948 insertions(+), 36 deletions(-)
>
> diff --git a/xen/drivers/passthrough/arm/ipmmu-vmsa.c b/xen/drivers/passthrough/arm/ipmmu-vmsa.c
> index 2b380ff..e54b507 100644
> --- a/xen/drivers/passthrough/arm/ipmmu-vmsa.c
> +++ b/xen/drivers/passthrough/arm/ipmmu-vmsa.c
> @@ -6,31 +6,212 @@
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
>   * the Free Software Foundation; version 2 of the License.
> + *
> + * Based on Linux drivers/iommu/ipmmu-vmsa.c
> + * => commit f4747eba89c9b5d90fdf0a5458866283c47395d8
> + * (iommu/ipmmu-vmsa: Restrict IOMMU Domain Geometry to 32-bit address space)
> + *
> + * Xen modification:
> + * Oleksandr Tyshchenko <Oleksandr_Tyshchenko@epam.com>
> + * Copyright (C) 2016-2017 EPAM Systems Inc.
>   */
>
> -#include <linux/bitmap.h>
> -#include <linux/delay.h>
> -#include <linux/dma-iommu.h>
> -#include <linux/dma-mapping.h>
> -#include <linux/err.h>
> -#include <linux/export.h>
> -#include <linux/interrupt.h>
> -#include <linux/io.h>
> -#include <linux/iommu.h>
> -#include <linux/module.h>
> -#include <linux/of.h>
> -#include <linux/of_iommu.h>
> -#include <linux/platform_device.h>
> -#include <linux/sizes.h>
> -#include <linux/slab.h>
> -
> -#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA)
> -#include <asm/dma-iommu.h>
> -#include <asm/pgalloc.h>
> -#endif
> +#include <xen/config.h>
> +#include <xen/delay.h>
> +#include <xen/errno.h>
> +#include <xen/err.h>
> +#include <xen/irq.h>
> +#include <xen/lib.h>
> +#include <xen/list.h>
> +#include <xen/mm.h>
> +#include <xen/vmap.h>
> +#include <xen/rbtree.h>
> +#include <xen/sched.h>
> +#include <xen/sizes.h>
> +#include <asm/atomic.h>
> +#include <asm/device.h>
> +#include <asm/io.h>
> +#include <asm/platform.h>
>
>  #include "io-pgtable.h"
>
> +/* TODO:
> + * 1. Optimize xen_domain->lock usage.
> + * 2. Show domain_id in every printk which is per Xen domain.
> + *
> + */
> +
> +/***** Start of Xen specific code *****/
> +
> +#define IOMMU_READ	(1 << 0)
> +#define IOMMU_WRITE	(1 << 1)
> +#define IOMMU_CACHE	(1 << 2) /* DMA cache coherency */
> +#define IOMMU_NOEXEC	(1 << 3)
> +#define IOMMU_MMIO	(1 << 4) /* e.g. things like MSI doorbells */
> +
> +#define __fls(x) (fls(x) - 1)
> +#define __ffs(x) (ffs(x) - 1)
> +
> +#define IO_PGTABLE_QUIRK_ARM_NS		BIT(0)
> +
> +#define ioread32 readl
> +#define iowrite32 writel
> +
> +#define dev_info dev_notice
> +
> +#define devm_request_irq(unused, irq, func, flags, name, dev) \
> +	request_irq(irq, flags, func, name, dev)
> +
> +/* Alias to Xen device tree helpers */
> +#define device_node dt_device_node
> +#define of_phandle_args dt_phandle_args
> +#define of_device_id dt_device_match
> +#define of_match_node dt_match_node
> +#define of_parse_phandle_with_args dt_parse_phandle_with_args
> +#define of_find_property dt_find_property
> +#define of_count_phandle_with_args dt_count_phandle_with_args
> +
> +/* Xen: Helpers to get device MMIO and IRQs */
> +struct resource
> +{
> +	u64 addr;
> +	u64 size;
> +	unsigned int type;
> +};
> +
> +#define resource_size(res) (res)->size;
> +
> +#define platform_device dt_device_node
> +
> +#define IORESOURCE_MEM 0
> +#define IORESOURCE_IRQ 1
> +
> +static struct resource *platform_get_resource(struct platform_device *pdev,
> +					      unsigned int type,
> +					      unsigned int num)
> +{
> +	/*
> +	 * The resource is only used between 2 calls of platform_get_resource.
> +	 * It's quite ugly but it's avoid to add too much code in the part
> +	 * imported from Linux
> +	 */
> +	static struct resource res;
> +	int ret = 0;
> +
> +	res.type = type;
> +
> +	switch (type) {
> +	case IORESOURCE_MEM:
> +		ret = dt_device_get_address(pdev, num, &res.addr, &res.size);
> +
> +		return ((ret) ? NULL : &res);
> +
> +	case IORESOURCE_IRQ:
> +		ret = platform_get_irq(pdev, num);
> +		if (ret < 0)
> +			return NULL;
> +
> +		res.addr = ret;
> +		res.size = 1;
> +
> +		return &res;
> +
> +	default:
> +		return NULL;
> +	}
> +}
> +
> +enum irqreturn {
> +	IRQ_NONE	= (0 << 0),
> +	IRQ_HANDLED	= (1 << 0),
> +};
> +
> +typedef enum irqreturn irqreturn_t;
> +
> +/* Device logger functions */
> +#define dev_print(dev, lvl, fmt, ...)						\
> +	 printk(lvl "ipmmu: %s: " fmt, dt_node_full_name(dev_to_dt(dev)), ## __VA_ARGS__)
> +
> +#define dev_dbg(dev, fmt, ...) dev_print(dev, XENLOG_DEBUG, fmt, ## __VA_ARGS__)
> +#define dev_notice(dev, fmt, ...) dev_print(dev, XENLOG_INFO, fmt, ## __VA_ARGS__)
> +#define dev_warn(dev, fmt, ...) dev_print(dev, XENLOG_WARNING, fmt, ## __VA_ARGS__)
> +#define dev_err(dev, fmt, ...) dev_print(dev, XENLOG_ERR, fmt, ## __VA_ARGS__)
> +
> +#define dev_err_ratelimited(dev, fmt, ...)					\
> +	 dev_print(dev, XENLOG_ERR, fmt, ## __VA_ARGS__)
> +
> +#define dev_name(dev) dt_node_full_name(dev_to_dt(dev))
> +
> +/* Alias to Xen allocation helpers */
> +#define kfree xfree
> +#define kmalloc(size, flags)		_xmalloc(size, sizeof(void *))
> +#define kzalloc(size, flags)		_xzalloc(size, sizeof(void *))
> +#define devm_kzalloc(dev, size, flags)	_xzalloc(size, sizeof(void *))
> +#define kmalloc_array(size, n, flags)	_xmalloc_array(size, sizeof(void *), n)
> +#define kcalloc(size, n, flags)		_xzalloc_array(size, sizeof(void *), n)
> +
> +static void __iomem *devm_ioremap_resource(struct device *dev,
> +					   struct resource *res)
> +{
> +	void __iomem *ptr;
> +
> +	if (!res || res->type != IORESOURCE_MEM) {
> +		dev_err(dev, "Invalid resource\n");
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	ptr = ioremap_nocache(res->addr, res->size);
> +	if (!ptr) {
> +		dev_err(dev,
> +			"ioremap failed (addr 0x%"PRIx64" size 0x%"PRIx64")\n",
> +			res->addr, res->size);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	return ptr;
> +}
> +
> +/* Xen doesn't handle IOMMU fault */
> +#define report_iommu_fault(...)	1
> +
> +#define MODULE_DEVICE_TABLE(type, name)
> +#define module_param_named(name, value, type, perm)
> +#define MODULE_PARM_DESC(_parm, desc)
> +
> +/* Xen: Dummy iommu_domain */
> +struct iommu_domain
> +{
> +	atomic_t ref;
> +	/* Used to link iommu_domain contexts for a same domain.
> +	 * There is at least one per-IPMMU to used by the domain.
> +	 * */
> +	struct list_head		list;
> +};
> +
> +/* Xen: Describes informations required for a Xen domain */
> +struct ipmmu_vmsa_xen_domain {
> +	spinlock_t			lock;
> +	/* List of context (i.e iommu_domain) associated to this domain */
> +	struct list_head		contexts;
> +	struct iommu_domain		*base_context;
> +};
> +
> +/*
> + * Xen: Information about each device stored in dev->archdata.iommu
> + *
> + * On Linux the dev->archdata.iommu only stores the arch specific information,
> + * but, on Xen, we also have to store the iommu domain.
> + */
> +struct ipmmu_vmsa_xen_device {
> +	struct iommu_domain *domain;
> +	struct ipmmu_vmsa_archdata *archdata;
> +};
> +
> +#define dev_iommu(dev) ((struct ipmmu_vmsa_xen_device *)dev->archdata.iommu)
> +#define dev_iommu_domain(dev) (dev_iommu(dev)->domain)
> +
> +/***** Start of Linux IPMMU code *****/
> +
>  #define IPMMU_CTX_MAX 8
>
>  struct ipmmu_features {
> @@ -64,7 +245,9 @@ struct ipmmu_vmsa_device {
>  	struct hw_register *reg_backup[IPMMU_CTX_MAX];
>  #endif
>
> +#if 0 /* Xen: Not needed */
>  	struct dma_iommu_mapping *mapping;
> +#endif
>  };
>
>  struct ipmmu_vmsa_domain {
> @@ -77,6 +260,9 @@ struct ipmmu_vmsa_domain {
>
>  	unsigned int context_id;
>  	spinlock_t lock;			/* Protects mappings */
> +
> +	/* Xen: Domain associated to this configuration */
> +	struct domain *d;
>  };
>
>  struct ipmmu_vmsa_archdata {
> @@ -94,14 +280,20 @@ struct ipmmu_vmsa_archdata {
>  static DEFINE_SPINLOCK(ipmmu_devices_lock);
>  static LIST_HEAD(ipmmu_devices);
>
> +#if 0 /* Xen: Not needed */
>  static DEFINE_SPINLOCK(ipmmu_slave_devices_lock);
>  static LIST_HEAD(ipmmu_slave_devices);
> +#endif
>
>  static struct ipmmu_vmsa_domain *to_vmsa_domain(struct iommu_domain *dom)
>  {
>  	return container_of(dom, struct ipmmu_vmsa_domain, io_domain);
>  }
>
> +/*
> + * Xen: Rewrite Linux helpers to manipulate with archdata on Xen.
> + */
> +#if 0
>  #if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
>  static struct ipmmu_vmsa_archdata *to_archdata(struct device *dev)
>  {
> @@ -120,6 +312,16 @@ static void set_archdata(struct device *dev, struct ipmmu_vmsa_archdata *p)
>  {
>  }
>  #endif
> +#else
> +static struct ipmmu_vmsa_archdata *to_archdata(struct device *dev)
> +{
> +	return dev_iommu(dev)->archdata;
> +}
> +static void set_archdata(struct device *dev, struct ipmmu_vmsa_archdata *p)
> +{
> +	dev_iommu(dev)->archdata = p;
> +}
> +#endif
>
>  #define TLB_LOOP_TIMEOUT		100	/* 100us */
>
> @@ -355,6 +557,10 @@ static struct hw_register *root_pgtable[IPMMU_CTX_MAX] = {
>
>  static bool ipmmu_is_root(struct ipmmu_vmsa_device *mmu)
>  {
> +	/* Xen: Fix */

Hmmm. Can we get a bit more details?

> +	if (!mmu)
> +		return false;
> +
>  	if (mmu->features->has_cache_leaf_nodes)
>  		return mmu->is_leaf ? false : true;
>  	else
> @@ -405,14 +611,28 @@ static void ipmmu_ctx_write(struct ipmmu_vmsa_domain *domain, unsigned int reg,
>  	ipmmu_write(domain->root, domain->context_id * IM_CTX_SIZE + reg, data);
>  }
>
> -static void ipmmu_ctx_write2(struct ipmmu_vmsa_domain *domain, unsigned int reg,
> +/* Xen: Write the context for cache IPMMU only. */

Same here. Why does it need to be different with Xen?

> +static void ipmmu_ctx_write1(struct ipmmu_vmsa_domain *domain, unsigned int reg,
>  			     u32 data)
>  {
>  	if (domain->mmu != domain->root)
> -		ipmmu_write(domain->mmu,
> -			    domain->context_id * IM_CTX_SIZE + reg, data);
> +		ipmmu_write(domain->mmu, domain->context_id * IM_CTX_SIZE + reg, data);
> +}
>
> -	ipmmu_write(domain->root, domain->context_id * IM_CTX_SIZE + reg, data);
> +/*
> + * Xen: Write the context for both root IPMMU and all cache IPMMUs
> + * that assigned to this Xen domain.
> + */
> +static void ipmmu_ctx_write2(struct ipmmu_vmsa_domain *domain, unsigned int reg,
> +			     u32 data)
> +{
> +	struct ipmmu_vmsa_xen_domain *xen_domain = dom_iommu(domain->d)->arch.priv;
> +	struct iommu_domain *io_domain;
> +
> +	list_for_each_entry(io_domain, &xen_domain->contexts, list)
> +		ipmmu_ctx_write1(to_vmsa_domain(io_domain), reg, data);
> +
> +	ipmmu_ctx_write(domain, reg, data);
>  }
>
>  /* -----------------------------------------------------------------------------
> @@ -488,6 +708,10 @@ static void ipmmu_tlb_flush_all(void *cookie)
>  {
>  	struct ipmmu_vmsa_domain *domain = cookie;
>
> +	/* Xen: Just return if context_id has non-existent value */

Same here.

> +	if (domain->context_id >= domain->root->num_ctx)
> +		return;
> +
>  	ipmmu_tlb_invalidate(domain);
>  }
>
> @@ -549,8 +773,10 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
>  	domain->cfg.ias = 32;
>  	domain->cfg.oas = 40;
>  	domain->cfg.tlb = &ipmmu_gather_ops;
> +#if 0 /* Xen: Not needed */
>  	domain->io_domain.geometry.aperture_end = DMA_BIT_MASK(32);
>  	domain->io_domain.geometry.force_aperture = true;
> +#endif
>  	/*
>  	 * TODO: Add support for coherent walk through CCI with DVM and remove
>  	 * cache handling. For now, delegate it to the io-pgtable code.
> @@ -562,6 +788,9 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
>  	if (!domain->iop)
>  		return -EINVAL;
>
> +	/* Xen: Initialize context_id with non-existent value */
> +	domain->context_id = domain->root->num_ctx;

Why do you need to do that for Xen? Overall I think you need a bit more 
explanation of why you need those changes for Xen compare to the Linux 
driver.

> +
>  	/*
>  	 * Find an unused context.
>  	 */
> @@ -578,6 +807,11 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
>
>  	/* TTBR0 */
>  	ttbr = domain->cfg.arm_lpae_s1_cfg.ttbr[0];
> +
> +	/* Xen: */
> +	dev_notice(domain->root->dev, "d%d: Set IPMMU context %u (pgd 0x%"PRIx64")\n",
> +			domain->d->domain_id, domain->context_id, ttbr);

If you want to keep driver close to Linux, then you need to avoid 
unecessary change.

> +
>  	ipmmu_ctx_write(domain, IMTTLBR0, ttbr);
>  	ipmmu_ctx_write(domain, IMTTUBR0, ttbr >> 32);
>
> @@ -616,8 +850,9 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
>  	 * translation table format doesn't use TEX remapping. Don't enable AF
>  	 * software management as we have no use for it. Flush the TLB as
>  	 * required when modifying the context registers.
> +	 * Xen: Enable the context for the root IPMMU only.
>  	 */
> -	ipmmu_ctx_write2(domain, IMCTR,
> +	ipmmu_ctx_write(domain, IMCTR,
>  			 IMCTR_INTEN | IMCTR_FLUSH | IMCTR_MMUEN);
>
>  	return 0;
> @@ -638,13 +873,18 @@ static void ipmmu_domain_free_context(struct ipmmu_vmsa_device *mmu,
>
>  static void ipmmu_domain_destroy_context(struct ipmmu_vmsa_domain *domain)
>  {
> +	/* Xen: Just return if context_id has non-existent value */
> +	if (domain->context_id >= domain->root->num_ctx)
> +		return;
> +
>  	/*
>  	 * Disable the context. Flush the TLB as required when modifying the
>  	 * context registers.
>  	 *
>  	 * TODO: Is TLB flush really needed ?
> +	 * Xen: Disable the context for the root IPMMU only.
>  	 */
> -	ipmmu_ctx_write2(domain, IMCTR, IMCTR_FLUSH);
> +	ipmmu_ctx_write(domain, IMCTR, IMCTR_FLUSH);
>  	ipmmu_tlb_sync(domain);
>
>  #ifdef CONFIG_RCAR_DDR_BACKUP
> @@ -652,12 +892,16 @@ static void ipmmu_domain_destroy_context(struct ipmmu_vmsa_domain *domain)
>  #endif
>
>  	ipmmu_domain_free_context(domain->root, domain->context_id);
> +
> +	/* Xen: Initialize context_id with non-existent value */
> +	domain->context_id = domain->root->num_ctx;
>  }
>
>  /* -----------------------------------------------------------------------------
>   * Fault Handling
>   */
>
> +/* Xen: Show domain_id in every printk */
>  static irqreturn_t ipmmu_domain_irq(struct ipmmu_vmsa_domain *domain)
>  {
>  	const u32 err_mask = IMSTR_MHIT | IMSTR_ABORT | IMSTR_PF | IMSTR_TF;
> @@ -681,11 +925,11 @@ static irqreturn_t ipmmu_domain_irq(struct ipmmu_vmsa_domain *domain)
>
>  	/* Log fatal errors. */
>  	if (status & IMSTR_MHIT)
> -		dev_err_ratelimited(mmu->dev, "Multiple TLB hits @0x%08x\n",
> -				    iova);
> +		dev_err_ratelimited(mmu->dev, "d%d: Multiple TLB hits @0x%08x\n",
> +				domain->d->domain_id, iova);
>  	if (status & IMSTR_ABORT)
> -		dev_err_ratelimited(mmu->dev, "Page Table Walk Abort @0x%08x\n",
> -				    iova);
> +		dev_err_ratelimited(mmu->dev, "d%d: Page Table Walk Abort @0x%08x\n",
> +				domain->d->domain_id, iova);
>
>  	if (!(status & (IMSTR_PF | IMSTR_TF)))
>  		return IRQ_NONE;
> @@ -700,8 +944,8 @@ static irqreturn_t ipmmu_domain_irq(struct ipmmu_vmsa_domain *domain)
>  		return IRQ_HANDLED;
>
>  	dev_err_ratelimited(mmu->dev,
> -			    "Unhandled fault: status 0x%08x iova 0x%08x\n",
> -			    status, iova);
> +			"d%d: Unhandled fault: status 0x%08x iova 0x%08x\n",
> +			domain->d->domain_id, status, iova);
>
>  	return IRQ_HANDLED;
>  }
> @@ -730,6 +974,16 @@ static irqreturn_t ipmmu_irq(int irq, void *dev)
>  	return status;
>  }
>
> +/* Xen: Interrupt handlers wrapper */
> +static void ipmmu_irq_xen(int irq, void *dev,
> +				      struct cpu_user_regs *regs)
> +{
> +	ipmmu_irq(irq, dev);
> +}
> +
> +#define ipmmu_irq ipmmu_irq_xen
> +
> +#if 0 /* Xen: Not needed */
>  /* -----------------------------------------------------------------------------
>   * IOMMU Operations
>   */
> @@ -759,6 +1013,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain)
>  	free_io_pgtable_ops(domain->iop);
>  	kfree(domain);
>  }
> +#endif
>
>  static int ipmmu_attach_device(struct iommu_domain *io_domain,
>  			       struct device *dev)
> @@ -787,7 +1042,20 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain,
>  		/* The domain hasn't been used yet, initialize it. */
>  		domain->mmu = mmu;
>  		domain->root = root;
> +
> +/*
> + * Xen: We have already initialized and enabled context for root IPMMU
> + * for this Xen domain. Enable context for given cache IPMMU only.
> + * Flush the TLB as required when modifying the context registers.

Why?

> + */
> +#if 0
>  		ret = ipmmu_domain_init_context(domain);
> +#endif
> +		ipmmu_ctx_write1(domain, IMCTR,
> +				ipmmu_ctx_read(domain, IMCTR) | IMCTR_FLUSH);
> +
> +		dev_info(dev, "Using IPMMU context %u\n", domain->context_id);
> +#if 0 /* Xen: Not needed */
>  		if (ret < 0) {
>  			dev_err(dev, "Unable to initialize IPMMU context\n");
>  			domain->mmu = NULL;
> @@ -795,6 +1063,7 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain,
>  			dev_info(dev, "Using IPMMU context %u\n",
>  				 domain->context_id);
>  		}
> +#endif
>  	} else if (domain->mmu != mmu) {
>  		/*
>  		 * Something is wrong, we can't attach two devices using
> @@ -834,6 +1103,14 @@ static void ipmmu_detach_device(struct iommu_domain *io_domain,
>  	 */
>  }
>
> +/*
> + * Xen: The current implementation of these callbacks is insufficient for us
> + * since they are intended to be called from Linux IOMMU core that
> + * has already done all required actions such as doing various checks,
> + * splitting into memory block the hardware supports and so on.

Can you expand it here? Why can't our IOMMU framework could do that?

IHMO, if we want to get driver from Linux, we need to get an interface 
very close to it. Otherwise it is not worth it because you would have to 
implement for each IOMMU.

My overall feeling at the moment is Xen is not ready to welcome this 
driver directly from Linux. This is also a BSP driver, so no thorough 
review done by the community.

I have been told the BSP driver was in pretty bad state, so I think we 
really need to weight pros and cons of using it.

Cheers,
Oleksandr Tyshchenko Aug. 10, 2017, 2:27 p.m. UTC | #2
Hi, Julien

On Tue, Aug 8, 2017 at 2:34 PM, Julien Grall <julien.grall@arm.com> wrote:
> Hi,
>
> On 26/07/17 16:09, Oleksandr Tyshchenko wrote:
>>
>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>
>> Modify the Linux IPMMU driver to be functional inside Xen.
>> All devices within a single Xen domain must use the same
>> IOMMU context no matter what IOMMU domains they are attached to.
>> This is the main difference between drivers in Linux
>> and Xen. Having 8 separate contexts allow us to passthrough
>> devices to 8 guest domain at the same time.
>>
>> Also wrap following code in #if 0:
>> - All DMA related stuff
>> - Linux PM callbacks
>> - Driver remove callback
>> - iommu_group management
>>
>> Maybe, it would be more correct to move different Linux2Xen wrappers,
>> define-s, helpers from IPMMU-VMSA and SMMU to some common file
>> before introducing IPMMU-VMSA patch series. And this common file
>> might be reused by possible future IOMMUs on ARM.
>
>
> Yes please if we go forward with the Linux way.
OK. I will keep it in mind.

>
>
>>
>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>> CC: Julien Grall <julien.grall@arm.com>
>> CC: Stefano Stabellini <sstabellini@kernel.org>
>> ---
>>  xen/drivers/passthrough/arm/ipmmu-vmsa.c | 984
>> +++++++++++++++++++++++++++++--
>>  1 file changed, 948 insertions(+), 36 deletions(-)
>>
>> diff --git a/xen/drivers/passthrough/arm/ipmmu-vmsa.c
>> b/xen/drivers/passthrough/arm/ipmmu-vmsa.c
>> index 2b380ff..e54b507 100644
>> --- a/xen/drivers/passthrough/arm/ipmmu-vmsa.c
>> +++ b/xen/drivers/passthrough/arm/ipmmu-vmsa.c
>> @@ -6,31 +6,212 @@
>>   * This program is free software; you can redistribute it and/or modify
>>   * it under the terms of the GNU General Public License as published by
>>   * the Free Software Foundation; version 2 of the License.
>> + *
>> + * Based on Linux drivers/iommu/ipmmu-vmsa.c
>> + * => commit f4747eba89c9b5d90fdf0a5458866283c47395d8
>> + * (iommu/ipmmu-vmsa: Restrict IOMMU Domain Geometry to 32-bit address
>> space)
>> + *
>> + * Xen modification:
>> + * Oleksandr Tyshchenko <Oleksandr_Tyshchenko@epam.com>
>> + * Copyright (C) 2016-2017 EPAM Systems Inc.
>>   */
>>
>> -#include <linux/bitmap.h>
>> -#include <linux/delay.h>
>> -#include <linux/dma-iommu.h>
>> -#include <linux/dma-mapping.h>
>> -#include <linux/err.h>
>> -#include <linux/export.h>
>> -#include <linux/interrupt.h>
>> -#include <linux/io.h>
>> -#include <linux/iommu.h>
>> -#include <linux/module.h>
>> -#include <linux/of.h>
>> -#include <linux/of_iommu.h>
>> -#include <linux/platform_device.h>
>> -#include <linux/sizes.h>
>> -#include <linux/slab.h>
>> -
>> -#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA)
>> -#include <asm/dma-iommu.h>
>> -#include <asm/pgalloc.h>
>> -#endif
>> +#include <xen/config.h>
>> +#include <xen/delay.h>
>> +#include <xen/errno.h>
>> +#include <xen/err.h>
>> +#include <xen/irq.h>
>> +#include <xen/lib.h>
>> +#include <xen/list.h>
>> +#include <xen/mm.h>
>> +#include <xen/vmap.h>
>> +#include <xen/rbtree.h>
>> +#include <xen/sched.h>
>> +#include <xen/sizes.h>
>> +#include <asm/atomic.h>
>> +#include <asm/device.h>
>> +#include <asm/io.h>
>> +#include <asm/platform.h>
>>
>>  #include "io-pgtable.h"
>>
>> +/* TODO:
>> + * 1. Optimize xen_domain->lock usage.
>> + * 2. Show domain_id in every printk which is per Xen domain.
>> + *
>> + */
>> +
>> +/***** Start of Xen specific code *****/
>> +
>> +#define IOMMU_READ     (1 << 0)
>> +#define IOMMU_WRITE    (1 << 1)
>> +#define IOMMU_CACHE    (1 << 2) /* DMA cache coherency */
>> +#define IOMMU_NOEXEC   (1 << 3)
>> +#define IOMMU_MMIO     (1 << 4) /* e.g. things like MSI doorbells */
>> +
>> +#define __fls(x) (fls(x) - 1)
>> +#define __ffs(x) (ffs(x) - 1)
>> +
>> +#define IO_PGTABLE_QUIRK_ARM_NS                BIT(0)
>> +
>> +#define ioread32 readl
>> +#define iowrite32 writel
>> +
>> +#define dev_info dev_notice
>> +
>> +#define devm_request_irq(unused, irq, func, flags, name, dev) \
>> +       request_irq(irq, flags, func, name, dev)
>> +
>> +/* Alias to Xen device tree helpers */
>> +#define device_node dt_device_node
>> +#define of_phandle_args dt_phandle_args
>> +#define of_device_id dt_device_match
>> +#define of_match_node dt_match_node
>> +#define of_parse_phandle_with_args dt_parse_phandle_with_args
>> +#define of_find_property dt_find_property
>> +#define of_count_phandle_with_args dt_count_phandle_with_args
>> +
>> +/* Xen: Helpers to get device MMIO and IRQs */
>> +struct resource
>> +{
>> +       u64 addr;
>> +       u64 size;
>> +       unsigned int type;
>> +};
>> +
>> +#define resource_size(res) (res)->size;
>> +
>> +#define platform_device dt_device_node
>> +
>> +#define IORESOURCE_MEM 0
>> +#define IORESOURCE_IRQ 1
>> +
>> +static struct resource *platform_get_resource(struct platform_device
>> *pdev,
>> +                                             unsigned int type,
>> +                                             unsigned int num)
>> +{
>> +       /*
>> +        * The resource is only used between 2 calls of
>> platform_get_resource.
>> +        * It's quite ugly but it's avoid to add too much code in the part
>> +        * imported from Linux
>> +        */
>> +       static struct resource res;
>> +       int ret = 0;
>> +
>> +       res.type = type;
>> +
>> +       switch (type) {
>> +       case IORESOURCE_MEM:
>> +               ret = dt_device_get_address(pdev, num, &res.addr,
>> &res.size);
>> +
>> +               return ((ret) ? NULL : &res);
>> +
>> +       case IORESOURCE_IRQ:
>> +               ret = platform_get_irq(pdev, num);
>> +               if (ret < 0)
>> +                       return NULL;
>> +
>> +               res.addr = ret;
>> +               res.size = 1;
>> +
>> +               return &res;
>> +
>> +       default:
>> +               return NULL;
>> +       }
>> +}
>> +
>> +enum irqreturn {
>> +       IRQ_NONE        = (0 << 0),
>> +       IRQ_HANDLED     = (1 << 0),
>> +};
>> +
>> +typedef enum irqreturn irqreturn_t;
>> +
>> +/* Device logger functions */
>> +#define dev_print(dev, lvl, fmt, ...)
>> \
>> +        printk(lvl "ipmmu: %s: " fmt, dt_node_full_name(dev_to_dt(dev)),
>> ## __VA_ARGS__)
>> +
>> +#define dev_dbg(dev, fmt, ...) dev_print(dev, XENLOG_DEBUG, fmt, ##
>> __VA_ARGS__)
>> +#define dev_notice(dev, fmt, ...) dev_print(dev, XENLOG_INFO, fmt, ##
>> __VA_ARGS__)
>> +#define dev_warn(dev, fmt, ...) dev_print(dev, XENLOG_WARNING, fmt, ##
>> __VA_ARGS__)
>> +#define dev_err(dev, fmt, ...) dev_print(dev, XENLOG_ERR, fmt, ##
>> __VA_ARGS__)
>> +
>> +#define dev_err_ratelimited(dev, fmt, ...)
>> \
>> +        dev_print(dev, XENLOG_ERR, fmt, ## __VA_ARGS__)
>> +
>> +#define dev_name(dev) dt_node_full_name(dev_to_dt(dev))
>> +
>> +/* Alias to Xen allocation helpers */
>> +#define kfree xfree
>> +#define kmalloc(size, flags)           _xmalloc(size, sizeof(void *))
>> +#define kzalloc(size, flags)           _xzalloc(size, sizeof(void *))
>> +#define devm_kzalloc(dev, size, flags) _xzalloc(size, sizeof(void *))
>> +#define kmalloc_array(size, n, flags)  _xmalloc_array(size, sizeof(void
>> *), n)
>> +#define kcalloc(size, n, flags)                _xzalloc_array(size,
>> sizeof(void *), n)
>> +
>> +static void __iomem *devm_ioremap_resource(struct device *dev,
>> +                                          struct resource *res)
>> +{
>> +       void __iomem *ptr;
>> +
>> +       if (!res || res->type != IORESOURCE_MEM) {
>> +               dev_err(dev, "Invalid resource\n");
>> +               return ERR_PTR(-EINVAL);
>> +       }
>> +
>> +       ptr = ioremap_nocache(res->addr, res->size);
>> +       if (!ptr) {
>> +               dev_err(dev,
>> +                       "ioremap failed (addr 0x%"PRIx64" size
>> 0x%"PRIx64")\n",
>> +                       res->addr, res->size);
>> +               return ERR_PTR(-ENOMEM);
>> +       }
>> +
>> +       return ptr;
>> +}
>> +
>> +/* Xen doesn't handle IOMMU fault */
>> +#define report_iommu_fault(...)        1
>> +
>> +#define MODULE_DEVICE_TABLE(type, name)
>> +#define module_param_named(name, value, type, perm)
>> +#define MODULE_PARM_DESC(_parm, desc)
>> +
>> +/* Xen: Dummy iommu_domain */
>> +struct iommu_domain
>> +{
>> +       atomic_t ref;
>> +       /* Used to link iommu_domain contexts for a same domain.
>> +        * There is at least one per-IPMMU to used by the domain.
>> +        * */
>> +       struct list_head                list;
>> +};
>> +
>> +/* Xen: Describes informations required for a Xen domain */
>> +struct ipmmu_vmsa_xen_domain {
>> +       spinlock_t                      lock;
>> +       /* List of context (i.e iommu_domain) associated to this domain */
>> +       struct list_head                contexts;
>> +       struct iommu_domain             *base_context;
>> +};
>> +
>> +/*
>> + * Xen: Information about each device stored in dev->archdata.iommu
>> + *
>> + * On Linux the dev->archdata.iommu only stores the arch specific
>> information,
>> + * but, on Xen, we also have to store the iommu domain.
>> + */
>> +struct ipmmu_vmsa_xen_device {
>> +       struct iommu_domain *domain;
>> +       struct ipmmu_vmsa_archdata *archdata;
>> +};
>> +
>> +#define dev_iommu(dev) ((struct ipmmu_vmsa_xen_device
>> *)dev->archdata.iommu)
>> +#define dev_iommu_domain(dev) (dev_iommu(dev)->domain)
>> +
>> +/***** Start of Linux IPMMU code *****/
>> +
>>  #define IPMMU_CTX_MAX 8
>>
>>  struct ipmmu_features {
>> @@ -64,7 +245,9 @@ struct ipmmu_vmsa_device {
>>         struct hw_register *reg_backup[IPMMU_CTX_MAX];
>>  #endif
>>
>> +#if 0 /* Xen: Not needed */
>>         struct dma_iommu_mapping *mapping;
>> +#endif
>>  };
>>
>>  struct ipmmu_vmsa_domain {
>> @@ -77,6 +260,9 @@ struct ipmmu_vmsa_domain {
>>
>>         unsigned int context_id;
>>         spinlock_t lock;                        /* Protects mappings */
>> +
>> +       /* Xen: Domain associated to this configuration */
>> +       struct domain *d;
>>  };
>>
>>  struct ipmmu_vmsa_archdata {
>> @@ -94,14 +280,20 @@ struct ipmmu_vmsa_archdata {
>>  static DEFINE_SPINLOCK(ipmmu_devices_lock);
>>  static LIST_HEAD(ipmmu_devices);
>>
>> +#if 0 /* Xen: Not needed */
>>  static DEFINE_SPINLOCK(ipmmu_slave_devices_lock);
>>  static LIST_HEAD(ipmmu_slave_devices);
>> +#endif
>>
>>  static struct ipmmu_vmsa_domain *to_vmsa_domain(struct iommu_domain *dom)
>>  {
>>         return container_of(dom, struct ipmmu_vmsa_domain, io_domain);
>>  }
>>
>> +/*
>> + * Xen: Rewrite Linux helpers to manipulate with archdata on Xen.
>> + */
>> +#if 0
>>  #if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
>>  static struct ipmmu_vmsa_archdata *to_archdata(struct device *dev)
>>  {
>> @@ -120,6 +312,16 @@ static void set_archdata(struct device *dev, struct
>> ipmmu_vmsa_archdata *p)
>>  {
>>  }
>>  #endif
>> +#else
>> +static struct ipmmu_vmsa_archdata *to_archdata(struct device *dev)
>> +{
>> +       return dev_iommu(dev)->archdata;
>> +}
>> +static void set_archdata(struct device *dev, struct ipmmu_vmsa_archdata
>> *p)
>> +{
>> +       dev_iommu(dev)->archdata = p;
>> +}
>> +#endif
>>
>>  #define TLB_LOOP_TIMEOUT               100     /* 100us */
>>
>> @@ -355,6 +557,10 @@ static struct hw_register
>> *root_pgtable[IPMMU_CTX_MAX] = {
>>
>>  static bool ipmmu_is_root(struct ipmmu_vmsa_device *mmu)
>>  {
>> +       /* Xen: Fix */
>
>
> Hmmm. Can we get a bit more details?

There is a case where ipmmu_is_root() is called with "mmu" being NULL.
https://github.com/otyshchenko1/xen/blob/fc231a0f2edb3d01d178fb5c27dd6c1065807c81/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L2330

In ipmmu_vmsa_alloc_page_table() we need to find the "root mmu", but we
don't have an argument to pass.
So, I had two options:

1. Add code searching for it.
...
spin_lock(&ipmmu_devices_lock);
list_for_each_entry(mmu, &ipmmu_devices, list) {
   if (ipmmu_is_root(mmu))
      break;
}
spin_unlock(&ipmmu_devices_lock);
...

2. Use the existing ipmmu_find_root(), adding this check for a valid value.
So, if we call ipmmu_find_root() with the argument being NULL, we will
actually search the list.

I decided to use option 2.

>
>> +       if (!mmu)
>> +               return false;
>> +
>>         if (mmu->features->has_cache_leaf_nodes)
>>                 return mmu->is_leaf ? false : true;
>>         else
>> @@ -405,14 +611,28 @@ static void ipmmu_ctx_write(struct ipmmu_vmsa_domain
>> *domain, unsigned int reg,
>>         ipmmu_write(domain->root, domain->context_id * IM_CTX_SIZE + reg,
>> data);
>>  }
>>
>> -static void ipmmu_ctx_write2(struct ipmmu_vmsa_domain *domain, unsigned
>> int reg,
>> +/* Xen: Write the context for cache IPMMU only. */
>
>
> Same here. Why does it need to be different with Xen?

Well, let me elaborate a bit more about this.

I feel that I need to explain in a few words about IPMMU itself:
Generally speaking,
The IPMMU hardware (R-Car Gen3) has 8 context banks and consists of next parts:
- root IPMMU
- a number of cache IPMMUs

Each cache IPMMU is connected to root IPMMU and has uTLB ports the
master devices can be tied to.
Something, like this:

master device1 ---> cache IPMMU1 [8 ctx] ---> root IPMMU [8 ctx] -> memory
                           |                                          |
master device2 --                                          |
                                                                      |
master device3 ---> cache IPMMU2 [8 ctx] --

Each context bank has registers.
Some registers exist for both root IPMMU and cache IPMMUs -> IMCTR
Some registers exist only for root IPMMU -> IMTTLBRx/IMTTUBRx, IMMAIR0, etc

So, original driver has two helpers:
1. ipmmu_ctx_write() - is intended to write a register in context bank
N* for root IPMMU only.
2. ipmmu_ctx_write2() - is intended to write a register in context
bank N for both root IPMMU and cache IPMMU.
*where N=0-7

AFAIU, original Linux driver provides each IOMMU domain with a
separate IPMMU context:
master device1 + master device2 are in IOMMU domain1 and use IPMMU context 0
master device3 is in IOMMU domain2 and uses IPMMU context 1

So, when attaching a device to a new IOMMU domain in Linux, we have to
initialize the context for the root IPMMU and enable the context (IMCTR
register) for both the root and cache IPMMUs.
https://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas-bsp.git/tree/drivers/iommu/ipmmu-vmsa.c?h=v4.9/rcar-3.5.3#n620

In Xen we need additional helper "ipmmu_ctx_write1" for writing a
register in context bank N for cache IPMMU only.
The reason is that we need a way to control a cache IPMMU separately,
since we have a somewhat different model.

All IOMMU domains within a single Xen domain (dom_iommu(d)->arch.priv)
use the same IPMMU context N
which was initialized and enabled at the domain creation time. This
means that all master devices
that are assigned to the guest domain "d" use only this IPMMU context
N which actually contains P2M mapping for domain "d":
master device1 + master device2 are in IOMMU domain1 and use IPMMU context 0
master device3 is in IOMMU domain2 and also uses IPMMU context 0

So, when attaching device to new IOMMU domain in Xen we don't have to
initialize and enable context,
because it has been already done at domain initialization time:
https://github.com/otyshchenko1/xen/blob/ipmmu_v2/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L2380
we just have to enable context for corresponding cache IPMMU only:
https://github.com/otyshchenko1/xen/blob/ipmmu_v2/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L1083

This is the main difference between drivers in Linux and Xen.

So, as you can see there is a need to manipulate context registers for
cache IPMMU without touching root IPMMU,
that's why I added this helper.

Does this make sense?

>
>
>> +static void ipmmu_ctx_write1(struct ipmmu_vmsa_domain *domain, unsigned
>> int reg,
>>                              u32 data)
>>  {
>>         if (domain->mmu != domain->root)
>> -               ipmmu_write(domain->mmu,
>> -                           domain->context_id * IM_CTX_SIZE + reg, data);
>> +               ipmmu_write(domain->mmu, domain->context_id * IM_CTX_SIZE
>> + reg, data);
>> +}
>>
>> -       ipmmu_write(domain->root, domain->context_id * IM_CTX_SIZE + reg,
>> data);
>> +/*
>> + * Xen: Write the context for both root IPMMU and all cache IPMMUs
>> + * that assigned to this Xen domain.
>> + */
>> +static void ipmmu_ctx_write2(struct ipmmu_vmsa_domain *domain, unsigned
>> int reg,
>> +                            u32 data)
>> +{
>> +       struct ipmmu_vmsa_xen_domain *xen_domain =
>> dom_iommu(domain->d)->arch.priv;
>> +       struct iommu_domain *io_domain;
>> +
>> +       list_for_each_entry(io_domain, &xen_domain->contexts, list)
>> +               ipmmu_ctx_write1(to_vmsa_domain(io_domain), reg, data);
>> +
>> +       ipmmu_ctx_write(domain, reg, data);
>>  }
>>
>>  /*
>> -----------------------------------------------------------------------------
>> @@ -488,6 +708,10 @@ static void ipmmu_tlb_flush_all(void *cookie)
>>  {
>>         struct ipmmu_vmsa_domain *domain = cookie;
>>
>> +       /* Xen: Just return if context_id has non-existent value */
>
>
> Same here.

I think that there is a possible race.
In ipmmu_domain_init_context() we are trying to allocate context and
if allocation fails we will call free_io_pgtable_ops(),
but "domain->context_id" hasn't been initialized yet (likely it is 0).
https://github.com/otyshchenko1/xen/blob/fc231a0f2edb3d01d178fb5c27dd6c1065807c81/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L799

And having following call stack:
free_io_pgtable_ops() -> io_pgtable_tlb_flush_all() ->
ipmmu_tlb_flush_all() -> ipmmu_tlb_invalidate()
we will get a mistaken cache flush for the context pointed to by the
uninitialized "domain->context_id".

That's why I initialized context_id with a non-existent value before
allocating the context
https://github.com/otyshchenko1/xen/blob/fc231a0f2edb3d01d178fb5c27dd6c1065807c81/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L792
and checked it for a valid value here
https://github.com/otyshchenko1/xen/blob/fc231a0f2edb3d01d178fb5c27dd6c1065807c81/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L712
and everywhere else it needs to be checked.

>
>> +       if (domain->context_id >= domain->root->num_ctx)
>> +               return;
>> +
>>         ipmmu_tlb_invalidate(domain);
>>  }
>>
>> @@ -549,8 +773,10 @@ static int ipmmu_domain_init_context(struct
>> ipmmu_vmsa_domain *domain)
>>         domain->cfg.ias = 32;
>>         domain->cfg.oas = 40;
>>         domain->cfg.tlb = &ipmmu_gather_ops;
>> +#if 0 /* Xen: Not needed */
>>         domain->io_domain.geometry.aperture_end = DMA_BIT_MASK(32);
>>         domain->io_domain.geometry.force_aperture = true;
>> +#endif
>>         /*
>>          * TODO: Add support for coherent walk through CCI with DVM and
>> remove
>>          * cache handling. For now, delegate it to the io-pgtable code.
>> @@ -562,6 +788,9 @@ static int ipmmu_domain_init_context(struct
>> ipmmu_vmsa_domain *domain)
>>         if (!domain->iop)
>>                 return -EINVAL;
>>
>> +       /* Xen: Initialize context_id with non-existent value */
>> +       domain->context_id = domain->root->num_ctx;
>
>
> Why do you need to do that for Xen? Overall I think you need a bit more
> explanation of why you need those changes for Xen compare to the Linux
> driver.
I have just explained above why this change is needed: to avoid a possible race.
If the explanation I gave above sounds reasonable, I can put a comment in the code.

>
>> +
>>         /*
>>          * Find an unused context.
>>          */
>> @@ -578,6 +807,11 @@ static int ipmmu_domain_init_context(struct
>> ipmmu_vmsa_domain *domain)
>>
>>         /* TTBR0 */
>>         ttbr = domain->cfg.arm_lpae_s1_cfg.ttbr[0];
>> +
>> +       /* Xen: */
>> +       dev_notice(domain->root->dev, "d%d: Set IPMMU context %u (pgd
>> 0x%"PRIx64")\n",
>> +                       domain->d->domain_id, domain->context_id, ttbr);
>
>
> If you want to keep driver close to Linux, then you need to avoid unecessary
> change.
Shall I drop it?

>
>
>> +
>>         ipmmu_ctx_write(domain, IMTTLBR0, ttbr);
>>         ipmmu_ctx_write(domain, IMTTUBR0, ttbr >> 32);
>>
>> @@ -616,8 +850,9 @@ static int ipmmu_domain_init_context(struct
>> ipmmu_vmsa_domain *domain)
>>          * translation table format doesn't use TEX remapping. Don't
>> enable AF
>>          * software management as we have no use for it. Flush the TLB as
>>          * required when modifying the context registers.
>> +        * Xen: Enable the context for the root IPMMU only.
>>          */
>> -       ipmmu_ctx_write2(domain, IMCTR,
>> +       ipmmu_ctx_write(domain, IMCTR,
>>                          IMCTR_INTEN | IMCTR_FLUSH | IMCTR_MMUEN);
>>
>>         return 0;
>> @@ -638,13 +873,18 @@ static void ipmmu_domain_free_context(struct
>> ipmmu_vmsa_device *mmu,
>>
>>  static void ipmmu_domain_destroy_context(struct ipmmu_vmsa_domain
>> *domain)
>>  {
>> +       /* Xen: Just return if context_id has non-existent value */
>> +       if (domain->context_id >= domain->root->num_ctx)
>> +               return;
>> +
>>         /*
>>          * Disable the context. Flush the TLB as required when modifying
>> the
>>          * context registers.
>>          *
>>          * TODO: Is TLB flush really needed ?
>> +        * Xen: Disable the context for the root IPMMU only.
>>          */
>> -       ipmmu_ctx_write2(domain, IMCTR, IMCTR_FLUSH);
>> +       ipmmu_ctx_write(domain, IMCTR, IMCTR_FLUSH);
>>         ipmmu_tlb_sync(domain);
>>
>>  #ifdef CONFIG_RCAR_DDR_BACKUP
>> @@ -652,12 +892,16 @@ static void ipmmu_domain_destroy_context(struct
>> ipmmu_vmsa_domain *domain)
>>  #endif
>>
>>         ipmmu_domain_free_context(domain->root, domain->context_id);
>> +
>> +       /* Xen: Initialize context_id with non-existent value */
>> +       domain->context_id = domain->root->num_ctx;
>>  }
>>
>>  /*
>> -----------------------------------------------------------------------------
>>   * Fault Handling
>>   */
>>
>> +/* Xen: Show domain_id in every printk */
>>  static irqreturn_t ipmmu_domain_irq(struct ipmmu_vmsa_domain *domain)
>>  {
>>         const u32 err_mask = IMSTR_MHIT | IMSTR_ABORT | IMSTR_PF |
>> IMSTR_TF;
>> @@ -681,11 +925,11 @@ static irqreturn_t ipmmu_domain_irq(struct
>> ipmmu_vmsa_domain *domain)
>>
>>         /* Log fatal errors. */
>>         if (status & IMSTR_MHIT)
>> -               dev_err_ratelimited(mmu->dev, "Multiple TLB hits
>> @0x%08x\n",
>> -                                   iova);
>> +               dev_err_ratelimited(mmu->dev, "d%d: Multiple TLB hits
>> @0x%08x\n",
>> +                               domain->d->domain_id, iova);
>>         if (status & IMSTR_ABORT)
>> -               dev_err_ratelimited(mmu->dev, "Page Table Walk Abort
>> @0x%08x\n",
>> -                                   iova);
>> +               dev_err_ratelimited(mmu->dev, "d%d: Page Table Walk Abort
>> @0x%08x\n",
>> +                               domain->d->domain_id, iova);
>>
>>         if (!(status & (IMSTR_PF | IMSTR_TF)))
>>                 return IRQ_NONE;
>> @@ -700,8 +944,8 @@ static irqreturn_t ipmmu_domain_irq(struct
>> ipmmu_vmsa_domain *domain)
>>                 return IRQ_HANDLED;
>>
>>         dev_err_ratelimited(mmu->dev,
>> -                           "Unhandled fault: status 0x%08x iova
>> 0x%08x\n",
>> -                           status, iova);
>> +                       "d%d: Unhandled fault: status 0x%08x iova
>> 0x%08x\n",
>> +                       domain->d->domain_id, status, iova);
>>
>>         return IRQ_HANDLED;
>>  }
>> @@ -730,6 +974,16 @@ static irqreturn_t ipmmu_irq(int irq, void *dev)
>>         return status;
>>  }
>>
>> +/* Xen: Interrupt handlers wrapper */
>> +static void ipmmu_irq_xen(int irq, void *dev,
>> +                                     struct cpu_user_regs *regs)
>> +{
>> +       ipmmu_irq(irq, dev);
>> +}
>> +
>> +#define ipmmu_irq ipmmu_irq_xen
>> +
>> +#if 0 /* Xen: Not needed */
>>  /*
>> -----------------------------------------------------------------------------
>>   * IOMMU Operations
>>   */
>> @@ -759,6 +1013,7 @@ static void ipmmu_domain_free(struct iommu_domain
>> *io_domain)
>>         free_io_pgtable_ops(domain->iop);
>>         kfree(domain);
>>  }
>> +#endif
>>
>>  static int ipmmu_attach_device(struct iommu_domain *io_domain,
>>                                struct device *dev)
>> @@ -787,7 +1042,20 @@ static int ipmmu_attach_device(struct iommu_domain
>> *io_domain,
>>                 /* The domain hasn't been used yet, initialize it. */
>>                 domain->mmu = mmu;
>>                 domain->root = root;
>> +
>> +/*
>> + * Xen: We have already initialized and enabled context for root IPMMU
>> + * for this Xen domain. Enable context for given cache IPMMU only.
>> + * Flush the TLB as required when modifying the context registers.
>
>
> Why?

Original Linux driver provides each IOMMU domain with a separate IPMMU context.
So, when attaching device to IOMMU domain which hasn't been
initialized yet we have to
call ipmmu_domain_init_context() for initializing (root only) and
enabling (root + cache * ) context for this IOMMU domain.

* You can see at the end of the "original" ipmmu_domain_init_context()
implementation, that context is enabled for both cache and root IPMMUs
because of "ipmmu_ctx_write2".
https://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas-bsp.git/tree/drivers/iommu/ipmmu-vmsa.c?h=v4.9/rcar-3.5.3#n620

From my point of view, we don't have to do the same when we are
attaching device in Xen, as we keep only one IPMMU context (P2M
mappings) per Xen domain
for using by all assigned to this guest devices.
What is more, the number of context banks is limited (8), and if we
followed the Linux way here, we would quickly run out of available
contexts.
But having one IPMMU context per Xen domain allows us to pass
devices through to 8 guest domains.

Taking into the account described above, we initialize (root only) and
enable (root only ** ) context at the domain creation time
if IOMMU is expected to be used for this guest.
https://github.com/otyshchenko1/xen/blob/ipmmu_v2/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L2380

** You can see at the end of the "modified"
ipmmu_domain_init_context() implementation, that context is enabled
for root IPMMU only
because of "ipmmu_ctx_write".
https://github.com/otyshchenko1/xen/blob/ipmmu_v2/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L882

That's why, here, in ipmmu_attach_device() we don't have to call
ipmmu_domain_init_context() anymore, because
the context has been already initialized and enabled. All what we need
here is to enable this context for cache IPMMU the device
is physically connected to.

Does this make sense?

>
>
>> + */
>> +#if 0
>>                 ret = ipmmu_domain_init_context(domain);
>> +#endif
>> +               ipmmu_ctx_write1(domain, IMCTR,
>> +                               ipmmu_ctx_read(domain, IMCTR) |
>> IMCTR_FLUSH);
>> +
>> +               dev_info(dev, "Using IPMMU context %u\n",
>> domain->context_id);
>> +#if 0 /* Xen: Not needed */
>>                 if (ret < 0) {
>>                         dev_err(dev, "Unable to initialize IPMMU
>> context\n");
>>                         domain->mmu = NULL;
>> @@ -795,6 +1063,7 @@ static int ipmmu_attach_device(struct iommu_domain
>> *io_domain,
>>                         dev_info(dev, "Using IPMMU context %u\n",
>>                                  domain->context_id);
>>                 }
>> +#endif
>>         } else if (domain->mmu != mmu) {
>>                 /*
>>                  * Something is wrong, we can't attach two devices using
>> @@ -834,6 +1103,14 @@ static void ipmmu_detach_device(struct iommu_domain
>> *io_domain,
>>          */
>>  }
>>
>> +/*
>> + * Xen: The current implementation of these callbacks is insufficient for
>> us
>> + * since they are intended to be called from Linux IOMMU core that
>> + * has already done all required actions such as doing various checks,
>> + * splitting into memory block the hardware supports and so on.
>
>
> Can you expand it here? Why can't our IOMMU framework could do that?

If we add all the required support to the IOMMU framework and modify all
existing IOMMU drivers
to follow this support, then yes, it will keep IOMMU drivers such as
IPMMU-VMSA from having this stuff in.

To be honest, I was trying to touch IOMMU common code and other IOMMU
drivers as little as possible,
but I had to introduce a few changes ("non-shared IOMMU").

>
> IHMO, if we want to get driver from Linux, we need to get an interface very
> close to it. Otherwise it is not worth it because you would have to
> implement for each IOMMU.
You are right.

>
> My overall feeling at the moment is Xen is not ready to welcome this driver
> directly from Linux. This is also a BSP driver, so no thorough review done
> by the community.

As I said in a cover letter the BSP driver had more complete support
than the mainline one.
I would like to clarify what need to be done from my side.
Should I wait for the missing things reach upsteam and then rebase on
the mainline driver?
Or should I rewrite this driver without following Linux?

>
> I have been told the BSP driver was in pretty bad state, so I think we
> really need to weight pros and cons of using it.

I am afraid I didn't get the first part of the sentence.

>
> Cheers,
>
> --
> Julien Grall
Julien Grall Aug. 10, 2017, 3:13 p.m. UTC | #3
Hi,

On 10/08/17 15:27, Oleksandr Tyshchenko wrote:
> On Tue, Aug 8, 2017 at 2:34 PM, Julien Grall <julien.grall@arm.com> wrote:
>> On 26/07/17 16:09, Oleksandr Tyshchenko wrote:
>>> @@ -355,6 +557,10 @@ static struct hw_register
>>> *root_pgtable[IPMMU_CTX_MAX] = {
>>>
>>>  static bool ipmmu_is_root(struct ipmmu_vmsa_device *mmu)
>>>  {
>>> +       /* Xen: Fix */
>>
>>
>> Hmmm. Can we get a bit more details?
>
> These is a case when ipmmu_is_root is called with "mmu" being NULL.
> https://github.com/otyshchenko1/xen/blob/fc231a0f2edb3d01d178fb5c27dd6c1065807c81/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L2330
>
> In ipmmu_vmsa_alloc_page_table() we need to find "root mmu", but we
> doesn't have argument to pass.
> So, I had two options:
>
> 1. Add code searching for it.
> ...
> spin_lock(&ipmmu_devices_lock);
> list_for_each_entry(mmu, &ipmmu_devices, list) {
>    if (ipmmu_is_root(mmu))
>       break;
> }
> spin_unlock(&ipmmu_devices_lock);
> ...
>
> 2. Use existing ipmmu_find_root() with adding this check for a valid value.
> So, if we call ipmmu_find_root() with argument being NULL we will
> actually get searching the list.
>
> I decided to use 2 option.

Can you please expand the comment then?

>
>>
>>> +       if (!mmu)
>>> +               return false;
>>> +
>>>         if (mmu->features->has_cache_leaf_nodes)
>>>                 return mmu->is_leaf ? false : true;
>>>         else
>>> @@ -405,14 +611,28 @@ static void ipmmu_ctx_write(struct ipmmu_vmsa_domain
>>> *domain, unsigned int reg,
>>>         ipmmu_write(domain->root, domain->context_id * IM_CTX_SIZE + reg,
>>> data);
>>>  }
>>>
>>> -static void ipmmu_ctx_write2(struct ipmmu_vmsa_domain *domain, unsigned
>>> int reg,
>>> +/* Xen: Write the context for cache IPMMU only. */
>>
>>
>> Same here. Why does it need to be different with Xen?
>
> Well, let me elaborate a bit more about this.
>
> I feel that I need to explain in a few words about IPMMU itself:
> Generally speaking,
> The IPMMU hardware (R-Car Gen3) has 8 context banks and consists of next parts:
> - root IPMMU
> - a number of cache IPMMUs
>
> Each cache IPMMU is connected to root IPMMU and has uTLB ports the
> master devices can be tied to.
> Something, like this:
>
> master device1 ---> cache IPMMU1 [8 ctx] ---> root IPMMU [8 ctx] -> memory
>                            |                                          |
> master device2 --                                          |
>                                                                       |
> master device3 ---> cache IPMMU2 [8 ctx] --
>
> Each context bank has registers.
> Some registers exist for both root IPMMU and cache IPMMUs -> IMCTR
> Some registers exist only for root IPMMU -> IMTTLBRx/IMTTUBRx, IMMAIR0, etc
>
> So, original driver has two helpers:
> 1. ipmmu_ctx_write() - is intended to write a register in context bank
> N* for root IPMMU only.
> 2. ipmmu_ctx_write2() - is intended to write a register in context
> bank N for both root IPMMU and cache IPMMU.
> *where N=0-7
>
> AFAIU, original Linux driver provides each IOMMU domain with a
> separate IPMMU context:
> master device1 + master device2 are in IOMMU domain1 and use IPMMU context 0
> master device3 is in IOMMU domain2 and uses IPMMU context 1
>
> So, when attaching device to new IOMMU domain in Linux we have to
> initialize context for root IPMMU and enable context (IMCTR register)
> for both root and cache IPMMUs.
> https://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas-bsp.git/tree/drivers/iommu/ipmmu-vmsa.c?h=v4.9/rcar-3.5.3#n620
>
> In Xen we need additional helper "ipmmu_ctx_write1" for writing a
> register in context bank N for cache IPMMU only.
> The reason is that we need a way to control cache IPMMU separately
> since we have a little bit another model.
>
> All IOMMU domains within a single Xen domain (dom_iommu(d)->arch.priv)
> use the same IPMMU context N
> which was initialized and enabled at the domain creation time. This
> means that all master devices
> that are assigned to the guest domain "d" use only this IPMMU context
> N which actually contains P2M mapping for domain "d":
> master device1 + master device2 are in IOMMU domain1 and use IPMMU context 0
> master device3 is in IOMMU domain2 and also uses IPMMU context 0
>
> So, when attaching device to new IOMMU domain in Xen we don't have to
> initialize and enable context,
> because it has been already done at domain initialization time:
> https://github.com/otyshchenko1/xen/blob/ipmmu_v2/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L2380
> we just have to enable context for corresponding cache IPMMU only:
> https://github.com/otyshchenko1/xen/blob/ipmmu_v2/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L1083
>
> This is the main difference between drivers in Linux and Xen.
>
> So, as you can see there is a need to manipulate context registers for
> cache IPMMU without touching root IPMMU,
> that's why I added this helper.
>
> Does this make sense?

I think it does.

>
>>
>>
>>> +static void ipmmu_ctx_write1(struct ipmmu_vmsa_domain *domain, unsigned
>>> int reg,
>>>                              u32 data)
>>>  {
>>>         if (domain->mmu != domain->root)
>>> -               ipmmu_write(domain->mmu,
>>> -                           domain->context_id * IM_CTX_SIZE + reg, data);
>>> +               ipmmu_write(domain->mmu, domain->context_id * IM_CTX_SIZE
>>> + reg, data);
>>> +}
>>>
>>> -       ipmmu_write(domain->root, domain->context_id * IM_CTX_SIZE + reg,
>>> data);
>>> +/*
>>> + * Xen: Write the context for both root IPMMU and all cache IPMMUs
>>> + * that assigned to this Xen domain.
>>> + */
>>> +static void ipmmu_ctx_write2(struct ipmmu_vmsa_domain *domain, unsigned
>>> int reg,
>>> +                            u32 data)
>>> +{
>>> +       struct ipmmu_vmsa_xen_domain *xen_domain =
>>> dom_iommu(domain->d)->arch.priv;
>>> +       struct iommu_domain *io_domain;
>>> +
>>> +       list_for_each_entry(io_domain, &xen_domain->contexts, list)
>>> +               ipmmu_ctx_write1(to_vmsa_domain(io_domain), reg, data);
>>> +
>>> +       ipmmu_ctx_write(domain, reg, data);
>>>  }
>>>
>>>  /*
>>> -----------------------------------------------------------------------------
>>> @@ -488,6 +708,10 @@ static void ipmmu_tlb_flush_all(void *cookie)
>>>  {
>>>         struct ipmmu_vmsa_domain *domain = cookie;
>>>
>>> +       /* Xen: Just return if context_id has non-existent value */
>>
>>
>> Same here.
>
> I think that there is a possible race.
> In ipmmu_domain_init_context() we are trying to allocate context and
> if allocation fails we will call free_io_pgtable_ops(),
> but "domain->context_id" hasn't been initialized yet (likely it is 0).
> https://github.com/otyshchenko1/xen/blob/fc231a0f2edb3d01d178fb5c27dd6c1065807c81/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L799
>
> And having following call stack:
> free_io_pgtable_ops() -> io_pgtable_tlb_flush_all() ->
> ipmmu_tlb_flush_all() -> ipmmu_tlb_invalidate()
> we will get a mistaken cache flush for a context pointed by
> uninitialized "domain->context_id".
>
> That's why I initialized context_id with non-existent value before
> allocating context
> https://github.com/otyshchenko1/xen/blob/fc231a0f2edb3d01d178fb5c27dd6c1065807c81/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L792
> and checked it for a valid value here
> https://github.com/otyshchenko1/xen/blob/fc231a0f2edb3d01d178fb5c27dd6c1065807c81/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L712
> and everywhere it is need to checked.

The race is in the code added or the one from Linux? If the latter, then 
you should have an action to fix it there. If the former, the I'd like 
to understand how come we introduced a race compare to Linux.

[...]

>>
>>> +
>>>         /*
>>>          * Find an unused context.
>>>          */
>>> @@ -578,6 +807,11 @@ static int ipmmu_domain_init_context(struct
>>> ipmmu_vmsa_domain *domain)
>>>
>>>         /* TTBR0 */
>>>         ttbr = domain->cfg.arm_lpae_s1_cfg.ttbr[0];
>>> +
>>> +       /* Xen: */
>>> +       dev_notice(domain->root->dev, "d%d: Set IPMMU context %u (pgd
>>> 0x%"PRIx64")\n",
>>> +                       domain->d->domain_id, domain->context_id, ttbr);
>>
>>
>> If you want to keep driver close to Linux, then you need to avoid unecessary
>> change.
> Shall I drop it?

Depends. How useful is it? If it is, then may you want to upstream that?

[...]

>>>  static int ipmmu_attach_device(struct iommu_domain *io_domain,
>>>                                struct device *dev)
>>> @@ -787,7 +1042,20 @@ static int ipmmu_attach_device(struct iommu_domain
>>> *io_domain,
>>>                 /* The domain hasn't been used yet, initialize it. */
>>>                 domain->mmu = mmu;
>>>                 domain->root = root;
>>> +
>>> +/*
>>> + * Xen: We have already initialized and enabled context for root IPMMU
>>> + * for this Xen domain. Enable context for given cache IPMMU only.
>>> + * Flush the TLB as required when modifying the context registers.
>>
>>
>> Why?
>
> Original Linux driver provides each IOMMU domain with a separate IPMMU context.
> So, when attaching device to IOMMU domain which hasn't been
> initialized yet we have to
> call ipmmu_domain_init_context() for initializing (root only) and
> enabling (root + cache * ) context for this IOMMU domain.
>
> * You can see at the end of the "original" ipmmu_domain_init_context()
> implementation, that context is enabled for both cache and root IPMMUs
> because of "ipmmu_ctx_write2".
> https://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas-bsp.git/tree/drivers/iommu/ipmmu-vmsa.c?h=v4.9/rcar-3.5.3#n620
>
> From my point of view, we don't have to do the same when we are
> attaching device in Xen, as we keep only one IPMMU context (P2M
> mappings) per Xen domain
> for using by all assigned to this guest devices.
> What is more a number of context banks is limited (8), and if we
> followed Linux way here, we would be quickly run out of available
> contexts.
> But having one IPMMU context per Xen domain allow us to passthrough
> devices to 8 guest domain.

The way you describe it give an impression that the driver is 
fundamentally different in Xen compare to Linux. Am I right?

>
> Taking into the account described above, we initialize (root only) and
> enable (root only ** ) context at the domain creation time
> if IOMMU is expected to be used for this guest.
> https://github.com/otyshchenko1/xen/blob/ipmmu_v2/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L2380
>
> ** You can see at the end of the "modified"
> ipmmu_domain_init_context() implementation, that context is enabled
> for root IPMMU only
> because of "ipmmu_ctx_write".
> https://github.com/otyshchenko1/xen/blob/ipmmu_v2/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L882
>
> That's why, here, in ipmmu_attach_device() we don't have to call
> ipmmu_domain_init_context() anymore, because
> the context has been already initialized and enabled. All what we need
> here is to enable this context for cache IPMMU the device
> is physically connected to.
>
> Does this make sense?
>
>>
>>
>>> + */
>>> +#if 0
>>>                 ret = ipmmu_domain_init_context(domain);
>>> +#endif
>>> +               ipmmu_ctx_write1(domain, IMCTR,
>>> +                               ipmmu_ctx_read(domain, IMCTR) |
>>> IMCTR_FLUSH);
>>> +
>>> +               dev_info(dev, "Using IPMMU context %u\n",
>>> domain->context_id);
>>> +#if 0 /* Xen: Not needed */
>>>                 if (ret < 0) {
>>>                         dev_err(dev, "Unable to initialize IPMMU
>>> context\n");
>>>                         domain->mmu = NULL;
>>> @@ -795,6 +1063,7 @@ static int ipmmu_attach_device(struct iommu_domain
>>> *io_domain,
>>>                         dev_info(dev, "Using IPMMU context %u\n",
>>>                                  domain->context_id);
>>>                 }
>>> +#endif
>>>         } else if (domain->mmu != mmu) {
>>>                 /*
>>>                  * Something is wrong, we can't attach two devices using
>>> @@ -834,6 +1103,14 @@ static void ipmmu_detach_device(struct iommu_domain
>>> *io_domain,
>>>          */
>>>  }
>>>
>>> +/*
>>> + * Xen: The current implementation of these callbacks is insufficient for
>>> us
>>> + * since they are intended to be called from Linux IOMMU core that
>>> + * has already done all required actions such as doing various checks,
>>> + * splitting into memory block the hardware supports and so on.
>>
>>
>> Can you expand it here? Why can't our IOMMU framework could do that?
>
> If add all required support to IOMMU framework and modify all existing
> IOMMU drivers
> to follow this support, then yes, it will avoid IOMMU drivers such as
> IPMMU-VMSA from having these stuff in.
>
> To be honest, I was trying to touch IOMMU common code and other IOMMU
> drivers as little as possible,
> but I had to introduce a few changes ("non-shared IOMMU").

What I am looking is something we can easily maintain in the future. If 
it requires change in the common code then we should do it. If it 
happens to be too complex, then maybe we should not take it from Linux.

>
>>
>> IHMO, if we want to get driver from Linux, we need to get an interface very
>> close to it. Otherwise it is not worth it because you would have to
>> implement for each IOMMU.
> You are right.
>
>>
>> My overall feeling at the moment is Xen is not ready to welcome this driver
>> directly from Linux. This is also a BSP driver, so no thorough review done
>> by the community.
>
> As I said in a cover letter the BSP driver had more complete support
> than the mainline one.

I know. But this means we are going to bring code in Xen that was not 
fully reviewed and don't know the quality of the code.

> I would like to clarify what need to be done from my side.
> Should I wait for the missing things reach upsteam and then rebase on
> the mainline driver?
> Or should I rewrite this driver without following Linux?

I don't have a clear answer here. As I said, we need to weight pros and 
cons to use Linux driver over our own.

At the moment, you are using a BSP driver which has more features but 
modified quite a lot. We don't even know when this is going to be merged 
in Linux.

Keeping code close to Linux requires some hacks that are acceptable if 
you can benefits from the community (bug fix, review...). As the driver 
is taken from the BSP, we don't know if the code will stay in the 
current form nor be able to get bug fix.

Cheers,
Oleksandr Tyshchenko Aug. 21, 2017, 3:53 p.m. UTC | #4
Hi, Julien.

Sorry for the late response.

On Thu, Aug 10, 2017 at 6:13 PM, Julien Grall <julien.grall@arm.com> wrote:
> Hi,
>
> On 10/08/17 15:27, Oleksandr Tyshchenko wrote:
>>
>> On Tue, Aug 8, 2017 at 2:34 PM, Julien Grall <julien.grall@arm.com> wrote:
>>>
>>> On 26/07/17 16:09, Oleksandr Tyshchenko wrote:
>>>>
>>>> @@ -355,6 +557,10 @@ static struct hw_register
>>>> *root_pgtable[IPMMU_CTX_MAX] = {
>>>>
>>>>  static bool ipmmu_is_root(struct ipmmu_vmsa_device *mmu)
>>>>  {
>>>> +       /* Xen: Fix */
>>>
>>>
>>>
>>> Hmmm. Can we get a bit more details?
>>
>>
>> These is a case when ipmmu_is_root is called with "mmu" being NULL.
>>
>> https://github.com/otyshchenko1/xen/blob/fc231a0f2edb3d01d178fb5c27dd6c1065807c81/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L2330
>>
>> In ipmmu_vmsa_alloc_page_table() we need to find "root mmu", but we
>> doesn't have argument to pass.
>> So, I had two options:
>>
>> 1. Add code searching for it.
>> ...
>> spin_lock(&ipmmu_devices_lock);
>> list_for_each_entry(mmu, &ipmmu_devices, list) {
>>    if (ipmmu_is_root(mmu))
>>       break;
>> }
>> spin_unlock(&ipmmu_devices_lock);
>> ...
>>
>> 2. Use existing ipmmu_find_root() with adding this check for a valid
>> value.
>> So, if we call ipmmu_find_root() with argument being NULL we will
>> actually get searching the list.
>>
>> I decided to use 2 option.
>
>
> Can you please expand the comment then?
Will do.

>
>
>>
>>>
>>>> +       if (!mmu)
>>>> +               return false;
>>>> +
>>>>         if (mmu->features->has_cache_leaf_nodes)
>>>>                 return mmu->is_leaf ? false : true;
>>>>         else
>>>> @@ -405,14 +611,28 @@ static void ipmmu_ctx_write(struct
>>>> ipmmu_vmsa_domain
>>>> *domain, unsigned int reg,
>>>>         ipmmu_write(domain->root, domain->context_id * IM_CTX_SIZE +
>>>> reg,
>>>> data);
>>>>  }
>>>>
>>>> -static void ipmmu_ctx_write2(struct ipmmu_vmsa_domain *domain, unsigned
>>>> int reg,
>>>> +/* Xen: Write the context for cache IPMMU only. */
>>>
>>>
>>>
>>> Same here. Why does it need to be different with Xen?
>>
>>
>> Well, let me elaborate a bit more about this.
>>
>> I feel that I need to explain in a few words about IPMMU itself:
>> Generally speaking,
>> The IPMMU hardware (R-Car Gen3) has 8 context banks and consists of next
>> parts:
>> - root IPMMU
>> - a number of cache IPMMUs
>>
>> Each cache IPMMU is connected to root IPMMU and has uTLB ports the
>> master devices can be tied to.
>> Something, like this:
>>
>> master device1 ---> cache IPMMU1 [8 ctx] ---> root IPMMU [8 ctx] -> memory
>>                            |                                          |
>> master device2 --                                          |
>>                                                                       |
>> master device3 ---> cache IPMMU2 [8 ctx] --
>>
>> Each context bank has registers.
>> Some registers exist for both root IPMMU and cache IPMMUs -> IMCTR
>> Some registers exist only for root IPMMU -> IMTTLBRx/IMTTUBRx, IMMAIR0,
>> etc
>>
>> So, original driver has two helpers:
>> 1. ipmmu_ctx_write() - is intended to write a register in context bank
>> N* for root IPMMU only.
>> 2. ipmmu_ctx_write2() - is intended to write a register in context
>> bank N for both root IPMMU and cache IPMMU.
>> *where N=0-7
>>
>> AFAIU, original Linux driver provides each IOMMU domain with a
>> separate IPMMU context:
>> master device1 + master device2 are in IOMMU domain1 and use IPMMU context
>> 0
>> master device3 is in IOMMU domain2 and uses IPMMU context 1
>>
>> So, when attaching device to new IOMMU domain in Linux we have to
>> initialize context for root IPMMU and enable context (IMCTR register)
>> for both root and cache IPMMUs.
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas-bsp.git/tree/drivers/iommu/ipmmu-vmsa.c?h=v4.9/rcar-3.5.3#n620
>>
>> In Xen we need additional helper "ipmmu_ctx_write1" for writing a
>> register in context bank N for cache IPMMU only.
>> The reason is that we need a way to control cache IPMMU separately
>> since we have a little bit another model.
>>
>> All IOMMU domains within a single Xen domain (dom_iommu(d)->arch.priv)
>> use the same IPMMU context N
>> which was initialized and enabled at the domain creation time. This
>> means that all master devices
>> that are assigned to the guest domain "d" use only this IPMMU context
>> N which actually contains P2M mapping for domain "d":
>> master device1 + master device2 are in IOMMU domain1 and use IPMMU context
>> 0
>> master device3 is in IOMMU domain2 and also uses IPMMU context 0
>>
>> So, when attaching device to new IOMMU domain in Xen we don't have to
>> initialize and enable context,
>> because it has been already done at domain initialization time:
>>
>> https://github.com/otyshchenko1/xen/blob/ipmmu_v2/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L2380
>> we just have to enable context for corresponding cache IPMMU only:
>>
>> https://github.com/otyshchenko1/xen/blob/ipmmu_v2/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L1083
>>
>> This is the main difference between drivers in Linux and Xen.
>>
>> So, as you can see there is a need to manipulate context registers for
>> cache IPMMU without touching root IPMMU,
>> that's why I added this helper.
>>
>> Does this make sense?
>
>
> I think it does.
good.

>
>
>>
>>>
>>>
>>>> +static void ipmmu_ctx_write1(struct ipmmu_vmsa_domain *domain, unsigned
>>>> int reg,
>>>>                              u32 data)
>>>>  {
>>>>         if (domain->mmu != domain->root)
>>>> -               ipmmu_write(domain->mmu,
>>>> -                           domain->context_id * IM_CTX_SIZE + reg,
>>>> data);
>>>> +               ipmmu_write(domain->mmu, domain->context_id *
>>>> IM_CTX_SIZE
>>>> + reg, data);
>>>> +}
>>>>
>>>> -       ipmmu_write(domain->root, domain->context_id * IM_CTX_SIZE +
>>>> reg,
>>>> data);
>>>> +/*
>>>> + * Xen: Write the context for both root IPMMU and all cache IPMMUs
>>>> + * that are assigned to this Xen domain.
>>>> + */
>>>> +static void ipmmu_ctx_write2(struct ipmmu_vmsa_domain *domain, unsigned
>>>> int reg,
>>>> +                            u32 data)
>>>> +{
>>>> +       struct ipmmu_vmsa_xen_domain *xen_domain =
>>>> dom_iommu(domain->d)->arch.priv;
>>>> +       struct iommu_domain *io_domain;
>>>> +
>>>> +       list_for_each_entry(io_domain, &xen_domain->contexts, list)
>>>> +               ipmmu_ctx_write1(to_vmsa_domain(io_domain), reg, data);
>>>> +
>>>> +       ipmmu_ctx_write(domain, reg, data);
>>>>  }
>>>>
>>>>  /*
>>>>
>>>> -----------------------------------------------------------------------------
>>>> @@ -488,6 +708,10 @@ static void ipmmu_tlb_flush_all(void *cookie)
>>>>  {
>>>>         struct ipmmu_vmsa_domain *domain = cookie;
>>>>
>>>> +       /* Xen: Just return if context_id has non-existent value */
>>>
>>>
>>>
>>> Same here.
>>
>>
>> I think that there is a possible race.
>> In ipmmu_domain_init_context() we are trying to allocate context and
>> if allocation fails we will call free_io_pgtable_ops(),
>> but "domain->context_id" hasn't been initialized yet (likely it is 0).
>>
>> https://github.com/otyshchenko1/xen/blob/fc231a0f2edb3d01d178fb5c27dd6c1065807c81/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L799
>>
>> And having following call stack:
>> free_io_pgtable_ops() -> io_pgtable_tlb_flush_all() ->
>> ipmmu_tlb_flush_all() -> ipmmu_tlb_invalidate()
>> we will get a mistaken cache flush for a context pointed by
>> uninitialized "domain->context_id".
>>
>> That's why I initialized context_id with non-existent value before
>> allocating context
>>
>> https://github.com/otyshchenko1/xen/blob/fc231a0f2edb3d01d178fb5c27dd6c1065807c81/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L792
>> and checked it for a valid value here
>>
>> https://github.com/otyshchenko1/xen/blob/fc231a0f2edb3d01d178fb5c27dd6c1065807c81/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L712
>> and everywhere it is need to checked.
>
>
> The race is in the code added or the one from Linux? If the latter, then you
> should have an action to fix it there. If the former, the I'd like to
> understand how come we introduced a race compare to Linux.
I think the latter. I have just pushed a patch in.
https://lists.linuxfoundation.org/pipermail/iommu/2017-August/023857.html

>
> [...]
>
>>>
>>>> +
>>>>         /*
>>>>          * Find an unused context.
>>>>          */
>>>> @@ -578,6 +807,11 @@ static int ipmmu_domain_init_context(struct
>>>> ipmmu_vmsa_domain *domain)
>>>>
>>>>         /* TTBR0 */
>>>>         ttbr = domain->cfg.arm_lpae_s1_cfg.ttbr[0];
>>>> +
>>>> +       /* Xen: */
>>>> +       dev_notice(domain->root->dev, "d%d: Set IPMMU context %u (pgd
>>>> 0x%"PRIx64")\n",
>>>> +                       domain->d->domain_id, domain->context_id, ttbr);
>>>
>>>
>>>
>>> If you want to keep driver close to Linux, then you need to avoid
>>> unecessary
>>> change.
>>
>> Shall I drop it?
>
>
> Depends. How useful is it? If it is, then may you want to upstream that?
Not so useful, but it is better to keep it while driver is in progress.
However, I can move this print out of the ipmmu_domain_init_context().
"Our" ipmmu_vmsa_alloc_page_table() is a good candidate to have it.

>
> [...]
>
>
>>>>  static int ipmmu_attach_device(struct iommu_domain *io_domain,
>>>>                                struct device *dev)
>>>> @@ -787,7 +1042,20 @@ static int ipmmu_attach_device(struct iommu_domain
>>>> *io_domain,
>>>>                 /* The domain hasn't been used yet, initialize it. */
>>>>                 domain->mmu = mmu;
>>>>                 domain->root = root;
>>>> +
>>>> +/*
>>>> + * Xen: We have already initialized and enabled context for root IPMMU
>>>> + * for this Xen domain. Enable context for given cache IPMMU only.
>>>> + * Flush the TLB as required when modifying the context registers.
>>>
>>>
>>>
>>> Why?
>>
>>
>> Original Linux driver provides each IOMMU domain with a separate IPMMU
>> context.
>> So, when attaching device to IOMMU domain which hasn't been
>> initialized yet we have to
>> call ipmmu_domain_init_context() for initializing (root only) and
>> enabling (root + cache * ) context for this IOMMU domain.
>>
>> * You can see at the end of the "original" ipmmu_domain_init_context()
>> implementation, that context is enabled for both cache and root IPMMUs
>> because of "ipmmu_ctx_write2".
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas-bsp.git/tree/drivers/iommu/ipmmu-vmsa.c?h=v4.9/rcar-3.5.3#n620
>>
>> From my point of view, we don't have to do the same when we are
>> attaching device in Xen, as we keep only one IPMMU context (P2M
>> mappings) per Xen domain
>> for using by all assigned to this guest devices.
>> What is more a number of context banks is limited (8), and if we
>> followed Linux way here, we would be quickly run out of available
>> contexts.
>> But having one IPMMU context per Xen domain allows us to pass through
>> devices to 8 guest domains.
>
>
> The way you describe it give an impression that the driver is fundamentally
> different in Xen compare to Linux. Am I right?
It is hard to say whether "fundamentally different" is a suitable description here.

Drivers differ mostly in context assignment.
Also Xen driver has "VMSAv8-64 mode" enabled and asynchronous page
table deallocation sequence.

So, probably, yes.

>
>
>>
>> Taking into the account described above, we initialize (root only) and
>> enable (root only ** ) context at the domain creation time
>> if IOMMU is expected to be used for this guest.
>>
>> https://github.com/otyshchenko1/xen/blob/ipmmu_v2/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L2380
>>
>> ** You can see at the end of the "modified"
>> ipmmu_domain_init_context() implementation, that context is enabled
>> for root IPMMU only
>> because of "ipmmu_ctx_write".
>>
>> https://github.com/otyshchenko1/xen/blob/ipmmu_v2/xen/drivers/passthrough/arm/ipmmu-vmsa.c#L882
>>
>> That's why, here, in ipmmu_attach_device() we don't have to call
>> ipmmu_domain_init_context() anymore, because
>> the context has been already initialized and enabled. All what we need
>> here is to enable this context for cache IPMMU the device
>> is physically connected to.
>>
>> Does this make sense?
>>
>>>
>>>
>>>> + */
>>>> +#if 0
>>>>                 ret = ipmmu_domain_init_context(domain);
>>>> +#endif
>>>> +               ipmmu_ctx_write1(domain, IMCTR,
>>>> +                               ipmmu_ctx_read(domain, IMCTR) |
>>>> IMCTR_FLUSH);
>>>> +
>>>> +               dev_info(dev, "Using IPMMU context %u\n",
>>>> domain->context_id);
>>>> +#if 0 /* Xen: Not needed */
>>>>                 if (ret < 0) {
>>>>                         dev_err(dev, "Unable to initialize IPMMU
>>>> context\n");
>>>>                         domain->mmu = NULL;
>>>> @@ -795,6 +1063,7 @@ static int ipmmu_attach_device(struct iommu_domain
>>>> *io_domain,
>>>>                         dev_info(dev, "Using IPMMU context %u\n",
>>>>                                  domain->context_id);
>>>>                 }
>>>> +#endif
>>>>         } else if (domain->mmu != mmu) {
>>>>                 /*
>>>>                  * Something is wrong, we can't attach two devices using
>>>> @@ -834,6 +1103,14 @@ static void ipmmu_detach_device(struct
>>>> iommu_domain
>>>> *io_domain,
>>>>          */
>>>>  }
>>>>
>>>> +/*
>>>> + * Xen: The current implementation of these callbacks is insufficient
>>>> for
>>>> us
>>>> + * since they are intended to be called from Linux IOMMU core that
>>>> + * has already done all required actions such as doing various checks,
>>>> + * splitting into memory block the hardware supports and so on.
>>>
>>>
>>>
>>> Can you expand it here? Why can't our IOMMU framework could do that?
>>
>>
>> If add all required support to IOMMU framework and modify all existing
>> IOMMU drivers
>> to follow this support, then yes, it will avoid IOMMU drivers such as
>> IPMMU-VMSA from having these stuff in.
>>
>> To be honest, I was trying to touch IOMMU common code and other IOMMU
>> drivers as little as possible,
>> but I had to introduce a few changes ("non-shared IOMMU").
>
>
> What I am looking is something we can easily maintain in the future. If it
> requires change in the common code then we should do it. If it happens to be
> too complex, then maybe we should not take it from Linux.
I understand your point.

>
>>
>>>
>>> IHMO, if we want to get driver from Linux, we need to get an interface
>>> very
>>> close to it. Otherwise it is not worth it because you would have to
>>> implement for each IOMMU.
>>
>> You are right.
>>
>>>
>>> My overall feeling at the moment is Xen is not ready to welcome this
>>> driver
>>> directly from Linux. This is also a BSP driver, so no thorough review
>>> done
>>> by the community.
>>
>>
>> As I said in a cover letter the BSP driver had more complete support
>> than the mainline one.
>
>
> I know. But this means we are going to bring code in Xen that was not fully
> reviewed and don't know the quality of the code.
>
>> I would like to clarify what needs to be done from my side.
>> Should I wait for the missing things to reach upstream and then rebase on
>> the mainline driver?
>> Or should I rewrite this driver without following Linux?
>
>
> I don't have a clear answer here. As I said, we need to weight pros and cons
> to use Linux driver over our own.
>
> At the moment, you are using a BSP driver which has more features but
> modified quite a lot. We don't even know when this is going to be merged in
> Linux.
>
> Keeping code close to Linux requires some hacks that are acceptable if you
> can benefits from the community (bug fix, review...). As the driver is taken
> from the BSP, we don't know if the code will stay in the current form nor be
> able to get bug fix.

I got it. Completely agree with you.
But, we need to choose which direction we should follow. We have 3
options at the moment
and I am OK with each of them:
1. direct port from BSP (current implementation).
2. direct port from mainline Linux (when it has required support).
3. new driver based on BSP/Linux and contains only relevant to Xen things.

I am starting to think that options 2 or 3 (+) would be more suitable.
What do you think?

>
> Cheers,
>
> --
> Julien Grall
Julien Grall Aug. 23, 2017, 11:41 a.m. UTC | #5
Hi Oleksandr,

On 21/08/17 16:53, Oleksandr Tyshchenko wrote:
> On Thu, Aug 10, 2017 at 6:13 PM, Julien Grall <julien.grall@arm.com> wrote:
>> On 10/08/17 15:27, Oleksandr Tyshchenko wrote:
>>> I would like to clarify what needs to be done from my side.
>>> Should I wait for the missing things to reach upstream and then rebase on
>>> the mainline driver?
>>> Or should I rewrite this driver without following Linux?
>>
>>
>> I don't have a clear answer here. As I said, we need to weight pros and cons
>> to use Linux driver over our own.
>>
>> At the moment, you are using a BSP driver which has more features but
>> modified quite a lot. We don't even know when this is going to be merged in
>> Linux.
>>
>> Keeping code close to Linux requires some hacks that are acceptable if you
>> can benefits from the community (bug fix, review...). As the driver is taken
>> from the BSP, we don't know if the code will stay in the current form nor be
>> able to get bug fix.
>
> I got it. Completely agree with you.
> But, we need to choose which direction we should follow. We have 3
> options at the moment
> and I am OK with each of them:
> 1. direct port from BSP (current implementation).
> 2. direct port from mainline Linux (when it has required support).
> 3. new driver based on BSP/Linux and contains only relevant to Xen things.
>
> I am starting to think that options 2 or 3 (+) would be more suitable.
> What do you think?

The option 2 rely on the changes to be merged in Linux. If I understand 
correctly, we don't have any timeline for this.

So I would lean towards option 3 to get a support in Xen.

Stefano, do you have any opinion?

Cheers,
Stefano Stabellini Aug. 25, 2017, 8:06 p.m. UTC | #6
On Wed, 23 Aug 2017, Julien Grall wrote:
> Hi Oleksandr,
> 
> On 21/08/17 16:53, Oleksandr Tyshchenko wrote:
> > On Thu, Aug 10, 2017 at 6:13 PM, Julien Grall <julien.grall@arm.com> wrote:
> > > On 10/08/17 15:27, Oleksandr Tyshchenko wrote:
> > > > I would like to clarify what needs to be done from my side.
> > > > Should I wait for the missing things to reach upstream and then rebase on
> > > > the mainline driver?
> > > > Or should I rewrite this driver without following Linux?
> > > 
> > > 
> > > I don't have a clear answer here. As I said, we need to weight pros and
> > > cons
> > > to use Linux driver over our own.
> > > 
> > > At the moment, you are using a BSP driver which has more features but
> > > modified quite a lot. We don't even know when this is going to be merged
> > > in
> > > Linux.
> > > 
> > > Keeping code close to Linux requires some hacks that are acceptable if you
> > > can benefits from the community (bug fix, review...). As the driver is
> > > taken
> > > from the BSP, we don't know if the code will stay in the current form nor
> > > be
> > > able to get bug fix.
> > 
> > I got it. Completely agree with you.
> > But, we need to choose which direction we should follow. We have 3
> > options at the moment
> > and I am OK with each of them:
> > 1. direct port from BSP (current implementation).
> > 2. direct port from mainline Linux (when it has required support).
> > 3. new driver based on BSP/Linux and contains only relevant to Xen things.
> > 
> > I am starting to think that options 2 or 3 (+) would be more suitable.
> > What do you think?
> 
> The option 2 rely on the changes to be merged in Linux. If I understand
> correctly, we don't have any timeline for this.
> 
> So I would lean towards option 3 to get a support in Xen.
> 
> Stefano, do you have any opinion?

I agree with Julien. Option 3 is the way to go. There is only a benefit
in staying close to Linux if their driver is in good state, fully
featured, and well-maintained. And we certainly don't want to block your
work on waiting for somebody else who might or might not merge his
changes in Linux. In this case, option 3 is best. I warn you, you might
have to maintain this driver in Xen going forward though :-)
Oleksandr Tyshchenko Aug. 28, 2017, 5:29 p.m. UTC | #7
Hi, Stefano, Julien.

On Fri, Aug 25, 2017 at 11:06 PM, Stefano Stabellini
<sstabellini@kernel.org> wrote:
> On Wed, 23 Aug 2017, Julien Grall wrote:
>> Hi Oleksandr,
>>
>> On 21/08/17 16:53, Oleksandr Tyshchenko wrote:
>> > On Thu, Aug 10, 2017 at 6:13 PM, Julien Grall <julien.grall@arm.com> wrote:
>> > > On 10/08/17 15:27, Oleksandr Tyshchenko wrote:
>> > > > I would like to clarify what needs to be done from my side.
>> > > > Should I wait for the missing things to reach upstream and then rebase on
>> > > > the mainline driver?
>> > > > Or should I rewrite this driver without following Linux?
>> > >
>> > >
>> > > I don't have a clear answer here. As I said, we need to weight pros and
>> > > cons
>> > > to use Linux driver over our own.
>> > >
>> > > At the moment, you are using a BSP driver which has more features but
>> > > modified quite a lot. We don't even know when this is going to be merged
>> > > in
>> > > Linux.
>> > >
>> > > Keeping code close to Linux requires some hacks that are acceptable if you
>> > > can benefits from the community (bug fix, review...). As the driver is
>> > > taken
>> > > from the BSP, we don't know if the code will stay in the current form nor
>> > > be
>> > > able to get bug fix.
>> >
>> > I got it. Completely agree with you.
>> > But, we need to choose which direction we should follow. We have 3
>> > options at the moment
>> > and I am OK with each of them:
>> > 1. direct port from BSP (current implementation).
>> > 2. direct port from mainline Linux (when it has required support).
>> > 3. new driver based on BSP/Linux and contains only relevant to Xen things.
>> >
>> > I am starting to think that options 2 or 3 (+) would be more suitable.
>> > What do you think?
>>
>> The option 2 rely on the changes to be merged in Linux. If I understand
>> correctly, we don't have any timeline for this.
>>
>> So I would lean towards option 3 to get a support in Xen.
>>
>> Stefano, do you have any opinion?
>
> I agree with Julien. Option 3 is the way to go. There is only a benefit
> in staying close to Linux if their driver is in good state, fully
> featured, and well-maintained. And we certainly don't want to block your
> work on waiting for somebody else who might or might not merge his
> changes in Linux. In this case, option 3 is best.
Thank you for your suggestions.

> I warn you, you might
> have to maintain this driver in Xen going forward though :-)
Why not :-)
diff mbox

Patch

diff --git a/xen/drivers/passthrough/arm/ipmmu-vmsa.c b/xen/drivers/passthrough/arm/ipmmu-vmsa.c
index 2b380ff..e54b507 100644
--- a/xen/drivers/passthrough/arm/ipmmu-vmsa.c
+++ b/xen/drivers/passthrough/arm/ipmmu-vmsa.c
@@ -6,31 +6,212 @@ 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; version 2 of the License.
+ *
+ * Based on Linux drivers/iommu/ipmmu-vmsa.c
+ * => commit f4747eba89c9b5d90fdf0a5458866283c47395d8
+ * (iommu/ipmmu-vmsa: Restrict IOMMU Domain Geometry to 32-bit address space)
+ *
+ * Xen modification:
+ * Oleksandr Tyshchenko <Oleksandr_Tyshchenko@epam.com>
+ * Copyright (C) 2016-2017 EPAM Systems Inc.
  */
 
-#include <linux/bitmap.h>
-#include <linux/delay.h>
-#include <linux/dma-iommu.h>
-#include <linux/dma-mapping.h>
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/iommu.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/of_iommu.h>
-#include <linux/platform_device.h>
-#include <linux/sizes.h>
-#include <linux/slab.h>
-
-#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA)
-#include <asm/dma-iommu.h>
-#include <asm/pgalloc.h>
-#endif
+#include <xen/config.h>
+#include <xen/delay.h>
+#include <xen/errno.h>
+#include <xen/err.h>
+#include <xen/irq.h>
+#include <xen/lib.h>
+#include <xen/list.h>
+#include <xen/mm.h>
+#include <xen/vmap.h>
+#include <xen/rbtree.h>
+#include <xen/sched.h>
+#include <xen/sizes.h>
+#include <asm/atomic.h>
+#include <asm/device.h>
+#include <asm/io.h>
+#include <asm/platform.h>
 
 #include "io-pgtable.h"
 
+/* TODO:
+ * 1. Optimize xen_domain->lock usage.
+ * 2. Show domain_id in every printk which is per Xen domain.
+ *
+ */
+
+/***** Start of Xen specific code *****/
+
+#define IOMMU_READ	(1 << 0)
+#define IOMMU_WRITE	(1 << 1)
+#define IOMMU_CACHE	(1 << 2) /* DMA cache coherency */
+#define IOMMU_NOEXEC	(1 << 3)
+#define IOMMU_MMIO	(1 << 4) /* e.g. things like MSI doorbells */
+
+#define __fls(x) (fls(x) - 1)
+#define __ffs(x) (ffs(x) - 1)
+
+#define IO_PGTABLE_QUIRK_ARM_NS		BIT(0)
+
+#define ioread32 readl
+#define iowrite32 writel
+
+#define dev_info dev_notice
+
+#define devm_request_irq(unused, irq, func, flags, name, dev) \
+	request_irq(irq, flags, func, name, dev)
+
+/* Alias to Xen device tree helpers */
+#define device_node dt_device_node
+#define of_phandle_args dt_phandle_args
+#define of_device_id dt_device_match
+#define of_match_node dt_match_node
+#define of_parse_phandle_with_args dt_parse_phandle_with_args
+#define of_find_property dt_find_property
+#define of_count_phandle_with_args dt_count_phandle_with_args
+
+/* Xen: Helpers to get device MMIO and IRQs */
+struct resource
+{
+	u64 addr;
+	u64 size;
+	unsigned int type;
+};
+
+#define resource_size(res) (res)->size;
+
+#define platform_device dt_device_node
+
+#define IORESOURCE_MEM 0
+#define IORESOURCE_IRQ 1
+
+static struct resource *platform_get_resource(struct platform_device *pdev,
+					      unsigned int type,
+					      unsigned int num)
+{
+	/*
+	 * The resource is only used between 2 calls of platform_get_resource.
+	 * It's quite ugly but it avoids adding too much code in the part
+	 * imported from Linux
+	 */
+	static struct resource res;
+	int ret = 0;
+
+	res.type = type;
+
+	switch (type) {
+	case IORESOURCE_MEM:
+		ret = dt_device_get_address(pdev, num, &res.addr, &res.size);
+
+		return ((ret) ? NULL : &res);
+
+	case IORESOURCE_IRQ:
+		ret = platform_get_irq(pdev, num);
+		if (ret < 0)
+			return NULL;
+
+		res.addr = ret;
+		res.size = 1;
+
+		return &res;
+
+	default:
+		return NULL;
+	}
+}
+
+enum irqreturn {
+	IRQ_NONE	= (0 << 0),
+	IRQ_HANDLED	= (1 << 0),
+};
+
+typedef enum irqreturn irqreturn_t;
+
+/* Device logger functions */
+#define dev_print(dev, lvl, fmt, ...)						\
+	 printk(lvl "ipmmu: %s: " fmt, dt_node_full_name(dev_to_dt(dev)), ## __VA_ARGS__)
+
+#define dev_dbg(dev, fmt, ...) dev_print(dev, XENLOG_DEBUG, fmt, ## __VA_ARGS__)
+#define dev_notice(dev, fmt, ...) dev_print(dev, XENLOG_INFO, fmt, ## __VA_ARGS__)
+#define dev_warn(dev, fmt, ...) dev_print(dev, XENLOG_WARNING, fmt, ## __VA_ARGS__)
+#define dev_err(dev, fmt, ...) dev_print(dev, XENLOG_ERR, fmt, ## __VA_ARGS__)
+
+#define dev_err_ratelimited(dev, fmt, ...)					\
+	 dev_print(dev, XENLOG_ERR, fmt, ## __VA_ARGS__)
+
+#define dev_name(dev) dt_node_full_name(dev_to_dt(dev))
+
+/* Alias to Xen allocation helpers */
+#define kfree xfree
+#define kmalloc(size, flags)		_xmalloc(size, sizeof(void *))
+#define kzalloc(size, flags)		_xzalloc(size, sizeof(void *))
+#define devm_kzalloc(dev, size, flags)	_xzalloc(size, sizeof(void *))
+#define kmalloc_array(size, n, flags)	_xmalloc_array(size, sizeof(void *), n)
+#define kcalloc(size, n, flags)		_xzalloc_array(size, sizeof(void *), n)
+
+static void __iomem *devm_ioremap_resource(struct device *dev,
+					   struct resource *res)
+{
+	void __iomem *ptr;
+
+	if (!res || res->type != IORESOURCE_MEM) {
+		dev_err(dev, "Invalid resource\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	ptr = ioremap_nocache(res->addr, res->size);
+	if (!ptr) {
+		dev_err(dev,
+			"ioremap failed (addr 0x%"PRIx64" size 0x%"PRIx64")\n",
+			res->addr, res->size);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return ptr;
+}
+
+/* Xen doesn't handle IOMMU fault */
+#define report_iommu_fault(...)	1
+
+#define MODULE_DEVICE_TABLE(type, name)
+#define module_param_named(name, value, type, perm)
+#define MODULE_PARM_DESC(_parm, desc)
+
+/* Xen: Dummy iommu_domain */
+struct iommu_domain
+{
+	atomic_t ref;
+	/* Used to link iommu_domain contexts for the same domain.
+	 * There is at least one per-IPMMU used by the domain.
+	 */
+	struct list_head		list;
+};
+
+/* Xen: Describes information required for a Xen domain */
+struct ipmmu_vmsa_xen_domain {
+	spinlock_t			lock;
+	/* List of context (i.e iommu_domain) associated to this domain */
+	struct list_head		contexts;
+	struct iommu_domain		*base_context;
+};
+
+/*
+ * Xen: Information about each device stored in dev->archdata.iommu
+ *
+ * On Linux the dev->archdata.iommu only stores the arch specific information,
+ * but, on Xen, we also have to store the iommu domain.
+ */
+struct ipmmu_vmsa_xen_device {
+	struct iommu_domain *domain;
+	struct ipmmu_vmsa_archdata *archdata;
+};
+
+#define dev_iommu(dev) ((struct ipmmu_vmsa_xen_device *)dev->archdata.iommu)
+#define dev_iommu_domain(dev) (dev_iommu(dev)->domain)
+
+/***** Start of Linux IPMMU code *****/
+
 #define IPMMU_CTX_MAX 8
 
 struct ipmmu_features {
@@ -64,7 +245,9 @@  struct ipmmu_vmsa_device {
 	struct hw_register *reg_backup[IPMMU_CTX_MAX];
 #endif
 
+#if 0 /* Xen: Not needed */
 	struct dma_iommu_mapping *mapping;
+#endif
 };
 
 struct ipmmu_vmsa_domain {
@@ -77,6 +260,9 @@  struct ipmmu_vmsa_domain {
 
 	unsigned int context_id;
 	spinlock_t lock;			/* Protects mappings */
+
+	/* Xen: Domain associated to this configuration */
+	struct domain *d;
 };
 
 struct ipmmu_vmsa_archdata {
@@ -94,14 +280,20 @@  struct ipmmu_vmsa_archdata {
 static DEFINE_SPINLOCK(ipmmu_devices_lock);
 static LIST_HEAD(ipmmu_devices);
 
+#if 0 /* Xen: Not needed */
 static DEFINE_SPINLOCK(ipmmu_slave_devices_lock);
 static LIST_HEAD(ipmmu_slave_devices);
+#endif
 
 static struct ipmmu_vmsa_domain *to_vmsa_domain(struct iommu_domain *dom)
 {
 	return container_of(dom, struct ipmmu_vmsa_domain, io_domain);
 }
 
+/*
+ * Xen: Rewrite Linux helpers to manipulate with archdata on Xen.
+ */
+#if 0
 #if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
 static struct ipmmu_vmsa_archdata *to_archdata(struct device *dev)
 {
@@ -120,6 +312,16 @@  static void set_archdata(struct device *dev, struct ipmmu_vmsa_archdata *p)
 {
 }
 #endif
+#else
+static struct ipmmu_vmsa_archdata *to_archdata(struct device *dev)
+{
+	return dev_iommu(dev)->archdata;
+}
+static void set_archdata(struct device *dev, struct ipmmu_vmsa_archdata *p)
+{
+	dev_iommu(dev)->archdata = p;
+}
+#endif
 
 #define TLB_LOOP_TIMEOUT		100	/* 100us */
 
@@ -355,6 +557,10 @@  static struct hw_register *root_pgtable[IPMMU_CTX_MAX] = {
 
 static bool ipmmu_is_root(struct ipmmu_vmsa_device *mmu)
 {
+	/* Xen: Fix */
+	if (!mmu)
+		return false;
+
 	if (mmu->features->has_cache_leaf_nodes)
 		return mmu->is_leaf ? false : true;
 	else
@@ -405,14 +611,28 @@  static void ipmmu_ctx_write(struct ipmmu_vmsa_domain *domain, unsigned int reg,
 	ipmmu_write(domain->root, domain->context_id * IM_CTX_SIZE + reg, data);
 }
 
-static void ipmmu_ctx_write2(struct ipmmu_vmsa_domain *domain, unsigned int reg,
+/* Xen: Write the context for cache IPMMU only. */
+static void ipmmu_ctx_write1(struct ipmmu_vmsa_domain *domain, unsigned int reg,
 			     u32 data)
 {
 	if (domain->mmu != domain->root)
-		ipmmu_write(domain->mmu,
-			    domain->context_id * IM_CTX_SIZE + reg, data);
+		ipmmu_write(domain->mmu, domain->context_id * IM_CTX_SIZE + reg, data);
+}
 
-	ipmmu_write(domain->root, domain->context_id * IM_CTX_SIZE + reg, data);
+/*
+ * Xen: Write the context for both root IPMMU and all cache IPMMUs
+ * that are assigned to this Xen domain.
+ */
+static void ipmmu_ctx_write2(struct ipmmu_vmsa_domain *domain, unsigned int reg,
+			     u32 data)
+{
+	struct ipmmu_vmsa_xen_domain *xen_domain = dom_iommu(domain->d)->arch.priv;
+	struct iommu_domain *io_domain;
+
+	list_for_each_entry(io_domain, &xen_domain->contexts, list)
+		ipmmu_ctx_write1(to_vmsa_domain(io_domain), reg, data);
+
+	ipmmu_ctx_write(domain, reg, data);
 }
 
 /* -----------------------------------------------------------------------------
@@ -488,6 +708,10 @@  static void ipmmu_tlb_flush_all(void *cookie)
 {
 	struct ipmmu_vmsa_domain *domain = cookie;
 
+	/* Xen: Just return if context_id has non-existent value */
+	if (domain->context_id >= domain->root->num_ctx)
+		return;
+
 	ipmmu_tlb_invalidate(domain);
 }
 
@@ -549,8 +773,10 @@  static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
 	domain->cfg.ias = 32;
 	domain->cfg.oas = 40;
 	domain->cfg.tlb = &ipmmu_gather_ops;
+#if 0 /* Xen: Not needed */
 	domain->io_domain.geometry.aperture_end = DMA_BIT_MASK(32);
 	domain->io_domain.geometry.force_aperture = true;
+#endif
 	/*
 	 * TODO: Add support for coherent walk through CCI with DVM and remove
 	 * cache handling. For now, delegate it to the io-pgtable code.
@@ -562,6 +788,9 @@  static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
 	if (!domain->iop)
 		return -EINVAL;
 
+	/* Xen: Initialize context_id with non-existent value */
+	domain->context_id = domain->root->num_ctx;
+
 	/*
 	 * Find an unused context.
 	 */
@@ -578,6 +807,11 @@  static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
 
 	/* TTBR0 */
 	ttbr = domain->cfg.arm_lpae_s1_cfg.ttbr[0];
+
+	/* Xen: */
+	dev_notice(domain->root->dev, "d%d: Set IPMMU context %u (pgd 0x%"PRIx64")\n",
+			domain->d->domain_id, domain->context_id, ttbr);
+
 	ipmmu_ctx_write(domain, IMTTLBR0, ttbr);
 	ipmmu_ctx_write(domain, IMTTUBR0, ttbr >> 32);
 
@@ -616,8 +850,9 @@  static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
 	 * translation table format doesn't use TEX remapping. Don't enable AF
 	 * software management as we have no use for it. Flush the TLB as
 	 * required when modifying the context registers.
+	 * Xen: Enable the context for the root IPMMU only.
 	 */
-	ipmmu_ctx_write2(domain, IMCTR,
+	ipmmu_ctx_write(domain, IMCTR,
 			 IMCTR_INTEN | IMCTR_FLUSH | IMCTR_MMUEN);
 
 	return 0;
@@ -638,13 +873,18 @@  static void ipmmu_domain_free_context(struct ipmmu_vmsa_device *mmu,
 
 static void ipmmu_domain_destroy_context(struct ipmmu_vmsa_domain *domain)
 {
+	/* Xen: Just return if context_id has non-existent value */
+	if (domain->context_id >= domain->root->num_ctx)
+		return;
+
 	/*
 	 * Disable the context. Flush the TLB as required when modifying the
 	 * context registers.
 	 *
 	 * TODO: Is TLB flush really needed ?
+	 * Xen: Disable the context for the root IPMMU only.
 	 */
-	ipmmu_ctx_write2(domain, IMCTR, IMCTR_FLUSH);
+	ipmmu_ctx_write(domain, IMCTR, IMCTR_FLUSH);
 	ipmmu_tlb_sync(domain);
 
 #ifdef CONFIG_RCAR_DDR_BACKUP
@@ -652,12 +892,16 @@  static void ipmmu_domain_destroy_context(struct ipmmu_vmsa_domain *domain)
 #endif
 
 	ipmmu_domain_free_context(domain->root, domain->context_id);
+
+	/* Xen: Initialize context_id with non-existent value */
+	domain->context_id = domain->root->num_ctx;
 }
 
 /* -----------------------------------------------------------------------------
  * Fault Handling
  */
 
+/* Xen: Show domain_id in every printk */
 static irqreturn_t ipmmu_domain_irq(struct ipmmu_vmsa_domain *domain)
 {
 	const u32 err_mask = IMSTR_MHIT | IMSTR_ABORT | IMSTR_PF | IMSTR_TF;
@@ -681,11 +925,11 @@  static irqreturn_t ipmmu_domain_irq(struct ipmmu_vmsa_domain *domain)
 
 	/* Log fatal errors. */
 	if (status & IMSTR_MHIT)
-		dev_err_ratelimited(mmu->dev, "Multiple TLB hits @0x%08x\n",
-				    iova);
+		dev_err_ratelimited(mmu->dev, "d%d: Multiple TLB hits @0x%08x\n",
+				domain->d->domain_id, iova);
 	if (status & IMSTR_ABORT)
-		dev_err_ratelimited(mmu->dev, "Page Table Walk Abort @0x%08x\n",
-				    iova);
+		dev_err_ratelimited(mmu->dev, "d%d: Page Table Walk Abort @0x%08x\n",
+				domain->d->domain_id, iova);
 
 	if (!(status & (IMSTR_PF | IMSTR_TF)))
 		return IRQ_NONE;
@@ -700,8 +944,8 @@  static irqreturn_t ipmmu_domain_irq(struct ipmmu_vmsa_domain *domain)
 		return IRQ_HANDLED;
 
 	dev_err_ratelimited(mmu->dev,
-			    "Unhandled fault: status 0x%08x iova 0x%08x\n",
-			    status, iova);
+			"d%d: Unhandled fault: status 0x%08x iova 0x%08x\n",
+			domain->d->domain_id, status, iova);
 
 	return IRQ_HANDLED;
 }
@@ -730,6 +974,16 @@  static irqreturn_t ipmmu_irq(int irq, void *dev)
 	return status;
 }
 
+/* Xen: Interrupt handlers wrapper */
+static void ipmmu_irq_xen(int irq, void *dev,
+				      struct cpu_user_regs *regs)
+{
+	ipmmu_irq(irq, dev);
+}
+
+#define ipmmu_irq ipmmu_irq_xen
+
+#if 0 /* Xen: Not needed */
 /* -----------------------------------------------------------------------------
  * IOMMU Operations
  */
@@ -759,6 +1013,7 @@  static void ipmmu_domain_free(struct iommu_domain *io_domain)
 	free_io_pgtable_ops(domain->iop);
 	kfree(domain);
 }
+#endif
 
 static int ipmmu_attach_device(struct iommu_domain *io_domain,
 			       struct device *dev)
@@ -787,7 +1042,20 @@  static int ipmmu_attach_device(struct iommu_domain *io_domain,
 		/* The domain hasn't been used yet, initialize it. */
 		domain->mmu = mmu;
 		domain->root = root;
+
+/*
+ * Xen: We have already initialized and enabled context for root IPMMU
+ * for this Xen domain. Enable context for given cache IPMMU only.
+ * Flush the TLB as required when modifying the context registers.
+ */
+#if 0
 		ret = ipmmu_domain_init_context(domain);
+#endif
+		ipmmu_ctx_write1(domain, IMCTR,
+				ipmmu_ctx_read(domain, IMCTR) | IMCTR_FLUSH);
+
+		dev_info(dev, "Using IPMMU context %u\n", domain->context_id);
+#if 0 /* Xen: Not needed */
 		if (ret < 0) {
 			dev_err(dev, "Unable to initialize IPMMU context\n");
 			domain->mmu = NULL;
@@ -795,6 +1063,7 @@  static int ipmmu_attach_device(struct iommu_domain *io_domain,
 			dev_info(dev, "Using IPMMU context %u\n",
 				 domain->context_id);
 		}
+#endif
 	} else if (domain->mmu != mmu) {
 		/*
 		 * Something is wrong, we can't attach two devices using
@@ -834,6 +1103,14 @@  static void ipmmu_detach_device(struct iommu_domain *io_domain,
 	 */
 }
 
+/*
+ * Xen: The current implementation of these callbacks is insufficient for us
+ * since they are intended to be called from Linux IOMMU core that
+ * has already done all required actions such as doing various checks,
+ * splitting memory into blocks the hardware supports and so on.
+ * So, overwrite them with more complete functions.
+ */
+#if 0
 static int ipmmu_map(struct iommu_domain *io_domain, unsigned long iova,
 		     phys_addr_t paddr, size_t size, int prot)
 {
@@ -862,7 +1139,177 @@  static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain,
 
 	return domain->iop->iova_to_phys(domain->iop, iova);
 }
+#endif
+
+static size_t ipmmu_pgsize(struct iommu_domain *io_domain,
+		unsigned long addr_merge, size_t size)
+{
+	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+	unsigned int pgsize_idx;
+	size_t pgsize;
+
+	/* Max page size that still fits into 'size' */
+	pgsize_idx = __fls(size);
+
+	/* need to consider alignment requirements ? */
+	if (likely(addr_merge)) {
+		/* Max page size allowed by address */
+		unsigned int align_pgsize_idx = __ffs(addr_merge);
+		pgsize_idx = min(pgsize_idx, align_pgsize_idx);
+	}
+
+	/* build a mask of acceptable page sizes */
+	pgsize = (1UL << (pgsize_idx + 1)) - 1;
+
+	/* throw away page sizes not supported by the hardware */
+	pgsize &= domain->cfg.pgsize_bitmap;
+
+	/* make sure we're still sane */
+	BUG_ON(!pgsize);
+
+	/* pick the biggest page */
+	pgsize_idx = __fls(pgsize);
+	pgsize = 1UL << pgsize_idx;
+
+	return pgsize;
+}
+
+phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain, dma_addr_t iova)
+{
+	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+
+	if (unlikely(domain->iop->iova_to_phys == NULL))
+		return 0;
+
+	return domain->iop->iova_to_phys(domain->iop, iova);
+}
+
+size_t ipmmu_unmap(struct iommu_domain *io_domain, unsigned long iova, size_t size)
+{
+	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+	size_t unmapped_page, unmapped = 0;
+	dma_addr_t max_iova;
+	unsigned int min_pagesz;
+
+	if (unlikely(domain->iop->unmap == NULL ||
+			domain->cfg.pgsize_bitmap == 0UL))
+		return -ENODEV;
+
+	/* find out the minimum page size supported */
+	min_pagesz = 1 << __ffs(domain->cfg.pgsize_bitmap);
+
+	/*
+	 * The virtual address, as well as the size of the mapping, must be
+	 * aligned (at least) to the size of the smallest page supported
+	 * by the hardware
+	 */
+	if (!IS_ALIGNED(iova | size, min_pagesz)) {
+		printk("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n",
+		       iova, size, min_pagesz);
+		return -EINVAL;
+	}
 
+	/*
+	 * the sum of virtual address and size must be inside the IOVA space
+	 * that hardware supports
+	 */
+	max_iova = (1UL << domain->cfg.ias) - 1;
+	if ((dma_addr_t)iova + size > max_iova) {
+		printk("out-of-bound: iova 0x%lx + size 0x%zx > max_iova 0x%"PRIx64"\n",
+			   iova, size, max_iova);
+		/* TODO Return -EINVAL instead */
+		return 0;
+	}
+
+	/*
+	 * Keep iterating until we either unmap 'size' bytes (or more)
+	 * or we hit an area that isn't mapped.
+	 */
+	while (unmapped < size) {
+		size_t pgsize = ipmmu_pgsize(io_domain, iova, size - unmapped);
+
+		unmapped_page = domain->iop->unmap(domain->iop, iova, pgsize);
+		if (!unmapped_page)
+			break;
+
+		iova += unmapped_page;
+		unmapped += unmapped_page;
+	}
+
+	return unmapped;
+}
+
+int ipmmu_map(struct iommu_domain *io_domain, unsigned long iova,
+		phys_addr_t paddr, size_t size, int prot)
+{
+	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+	unsigned long orig_iova = iova;
+	dma_addr_t max_iova;
+	unsigned int min_pagesz;
+	size_t orig_size = size;
+	int ret = 0;
+
+	if (unlikely(domain->iop->map == NULL ||
+			domain->cfg.pgsize_bitmap == 0UL))
+		return -ENODEV;
+
+	/* find out the minimum page size supported */
+	min_pagesz = 1 << __ffs(domain->cfg.pgsize_bitmap);
+
+	/*
+	 * both the virtual address and the physical one, as well as
+	 * the size of the mapping, must be aligned (at least) to the
+	 * size of the smallest page supported by the hardware
+	 */
+	if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) {
+		printk("unaligned: iova 0x%lx pa 0x%"PRIx64" size 0x%zx min_pagesz 0x%x\n",
+		       iova, paddr, size, min_pagesz);
+		return -EINVAL;
+	}
+
+	/*
+	 * the sum of virtual address and size must be inside the IOVA space
+	 * that hardware supports
+	 */
+	max_iova = (1UL << domain->cfg.ias) - 1;
+	if ((dma_addr_t)iova + size > max_iova) {
+		printk("out-of-bound: iova 0x%lx + size 0x%zx > max_iova 0x%"PRIx64"\n",
+		       iova, size, max_iova);
+		/* TODO Return -EINVAL instead */
+		return 0;
+	}
+
+	while (size) {
+		size_t pgsize = ipmmu_pgsize(io_domain, iova | paddr, size);
+
+		ret = domain->iop->map(domain->iop, iova, paddr, pgsize, prot);
+		if (ret == -EEXIST) {
+			phys_addr_t exist_paddr = ipmmu_iova_to_phys(io_domain, iova);
+			if (exist_paddr == paddr)
+				ret = 0;
+			else if (exist_paddr) {
+				printk("remap: iova 0x%lx pa 0x%"PRIx64" pgsize 0x%zx\n",
+						iova, paddr, pgsize);
+				ipmmu_unmap(io_domain, iova, pgsize);
+				ret = domain->iop->map(domain->iop, iova, paddr, pgsize, prot);
+			}
+		}
+		if (ret)
+			break;
+
+		iova += pgsize;
+		paddr += pgsize;
+		size -= pgsize;
+	}
+
+	/* unroll mapping in case something went wrong */
+	if (ret && orig_size != size)
+		ipmmu_unmap(io_domain, orig_iova, orig_size - size);
+
+	return ret;
+}
+
+#if 0 /* Xen: Not needed */
 static struct device *ipmmu_find_sibling_device(struct device *dev)
 {
 	struct ipmmu_vmsa_archdata *archdata = dev->archdata.iommu;
@@ -898,6 +1345,7 @@  static struct iommu_group *ipmmu_find_group(struct device *dev)
 
 	return group;
 }
+#endif
 
 static int ipmmu_find_utlbs(struct ipmmu_vmsa_device *mmu, struct device *dev,
 			    unsigned int *utlbs, unsigned int num_utlbs)
@@ -913,7 +1361,9 @@  static int ipmmu_find_utlbs(struct ipmmu_vmsa_device *mmu, struct device *dev,
 		if (ret < 0)
 			return ret;
 
+#if 0 /* Xen: Not needed */
 		of_node_put(args.np);
+#endif
 
 		if (args.np != mmu->dev->of_node || args.args_count != 1)
 			return -EINVAL;
@@ -924,6 +1374,19 @@  static int ipmmu_find_utlbs(struct ipmmu_vmsa_device *mmu, struct device *dev,
 	return 0;
 }
 
+/* Xen: To roll back actions that took place in init */
+static __maybe_unused void ipmmu_destroy_platform_device(struct device *dev)
+{
+	struct ipmmu_vmsa_archdata *archdata = to_archdata(dev);
+
+	if (!archdata)
+		return;
+
+	kfree(archdata->utlbs);
+	kfree(archdata);
+	set_archdata(dev, NULL);
+}
+
 static int ipmmu_init_platform_device(struct device *dev)
 {
 	struct ipmmu_vmsa_archdata *archdata;
@@ -996,6 +1459,11 @@  static int ipmmu_init_platform_device(struct device *dev)
 	archdata->num_utlbs = num_utlbs;
 	archdata->dev = dev;
 	set_archdata(dev, archdata);
+
+	/* Xen: */
+	dev_notice(dev, "initialized master device (IPMMU %s micro-TLBs %u)\n",
+			dev_name(mmu->dev), num_utlbs);
+
 	return 0;
 
 error:
@@ -1003,6 +1471,7 @@  error:
 	return ret;
 }
 
+#if 0 /* Xen: Not needed */
 #if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA)
 
 static int ipmmu_add_device(struct device *dev)
@@ -1233,6 +1702,7 @@  static const struct iommu_ops ipmmu_ops = {
 };
 
 #endif /* CONFIG_IOMMU_DMA */
+#endif
 
 /* -----------------------------------------------------------------------------
  * Probe/remove and init
@@ -1274,12 +1744,20 @@  static const struct of_device_id ipmmu_of_ids[] = {
 		.compatible = "renesas,ipmmu-r8a7796",
 		.data = &ipmmu_features_rcar_gen3,
 	}, {
+		/* Xen: It is not clear how to deal with it */
+		.compatible = "renesas,ipmmu-pmb-r8a7795",
+		.data = NULL,
+	}, {
 		/* Terminator */
 	},
 };
 
 MODULE_DEVICE_TABLE(of, ipmmu_of_ids);
 
+/*
+ * Xen: We don't have refcount for allocated memory so manually free memory
+ * when an error occurred.
+ */
 static int ipmmu_probe(struct platform_device *pdev)
 {
 	struct ipmmu_vmsa_device *mmu;
@@ -1303,13 +1781,17 @@  static int ipmmu_probe(struct platform_device *pdev)
 	spin_lock_init(&mmu->lock);
 	bitmap_zero(mmu->ctx, IPMMU_CTX_MAX);
 	mmu->features = match->data;
+#if 0 /* Xen: Not needed */
 	dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+#endif
 
 	/* Map I/O memory and request IRQ. */
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	mmu->base = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(mmu->base))
-		return PTR_ERR(mmu->base);
+	if (IS_ERR(mmu->base)) {
+		ret = PTR_ERR(mmu->base);
+		goto out;
+	}
 
 	/*
 	 * The IPMMU has two register banks, for secure and non-secure modes.
@@ -1351,14 +1833,15 @@  static int ipmmu_probe(struct platform_device *pdev)
 	if (ipmmu_is_root(mmu)) {
 		if (irq < 0) {
 			dev_err(&pdev->dev, "no IRQ found\n");
-			return irq;
+			ret = irq;
+			goto out;
 		}
 
 		ret = devm_request_irq(&pdev->dev, irq, ipmmu_irq, 0,
 				       dev_name(&pdev->dev), mmu);
 		if (ret < 0) {
 			dev_err(&pdev->dev, "failed to request IRQ %d\n", irq);
-			return ret;
+			goto out;
 		}
 
 		ipmmu_device_reset(mmu);
@@ -1374,11 +1857,25 @@  static int ipmmu_probe(struct platform_device *pdev)
 	list_add(&mmu->list, &ipmmu_devices);
 	spin_unlock(&ipmmu_devices_lock);
 
+#if 0 /* Xen: Not needed */
 	platform_set_drvdata(pdev, mmu);
+#endif
+
+	/* Xen: */
+	dev_notice(&pdev->dev, "registered %s IPMMU\n",
+		ipmmu_is_root(mmu) ? "root" : "cache");
 
 	return 0;
+
+out:
+	if (!IS_ERR(mmu->base))
+		iounmap(mmu->base);
+	kfree(mmu);
+
+	return ret;
 }
 
+#if 0 /* Xen: Not needed */
 static int ipmmu_remove(struct platform_device *pdev)
 {
 	struct ipmmu_vmsa_device *mmu = platform_get_drvdata(pdev);
@@ -1645,3 +2142,418 @@  IOMMU_OF_DECLARE(ipmmu_r8a7796_iommu_of, "renesas,ipmmu-r8a7796",
 MODULE_DESCRIPTION("IOMMU API for Renesas VMSA-compatible IPMMU");
 MODULE_AUTHOR("Laurent Pinchart <laurent.pinchart@ideasonboard.com>");
 MODULE_LICENSE("GPL v2");
+#endif
+
+/***** Start of Xen specific code *****/
+
+static int __must_check ipmmu_vmsa_iotlb_flush(struct domain *d,
+		unsigned long gfn, unsigned int page_count)
+{
+	return 0;
+}
+
+static struct iommu_domain *ipmmu_vmsa_get_domain(struct domain *d,
+						struct device *dev)
+{
+	struct ipmmu_vmsa_xen_domain *xen_domain = dom_iommu(d)->arch.priv;
+	struct iommu_domain *io_domain;
+	struct ipmmu_vmsa_device *mmu;
+
+	mmu = to_archdata(dev)->mmu;
+	if (!mmu)
+		return NULL;
+
+	/*
+	 * Loop through the &xen_domain->contexts to locate a context
+	 * assigned to this IPMMU
+	 */
+	list_for_each_entry(io_domain, &xen_domain->contexts, list) {
+		if (to_vmsa_domain(io_domain)->mmu == mmu)
+			return io_domain;
+	}
+
+	return NULL;
+}
+
+static void ipmmu_vmsa_destroy_domain(struct iommu_domain *io_domain)
+{
+	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+
+	list_del(&io_domain->list);
+
+	if (domain->mmu != domain->root) {
+		/*
+		 * Disable the context for cache IPMMU only. Flush the TLB as required
+		 * when modifying the context registers.
+		 */
+		ipmmu_ctx_write1(domain, IMCTR, IMCTR_FLUSH);
+	} else {
+		/*
+		 * Free main domain resources. We assume that all devices have already
+		 * been detached.
+		 */
+		ipmmu_domain_destroy_context(domain);
+		free_io_pgtable_ops(domain->iop);
+	}
+
+	kfree(domain);
+}
+
+static int ipmmu_vmsa_assign_dev(struct domain *d, u8 devfn,
+			       struct device *dev, u32 flag)
+{
+	struct ipmmu_vmsa_xen_domain *xen_domain = dom_iommu(d)->arch.priv;
+	struct iommu_domain *io_domain;
+	struct ipmmu_vmsa_domain *domain;
+	int ret = 0;
+
+	if (!xen_domain || !xen_domain->base_context)
+		return -EINVAL;
+
+	if (!dev->archdata.iommu) {
+		dev->archdata.iommu = xzalloc(struct ipmmu_vmsa_xen_device);
+		if (!dev->archdata.iommu)
+			return -ENOMEM;
+	}
+
+	if (!to_archdata(dev)) {
+		ret = ipmmu_init_platform_device(dev);
+		if (ret)
+			return ret;
+	}
+
+	spin_lock(&xen_domain->lock);
+
+	if (dev_iommu_domain(dev)) {
+		dev_err(dev, "already attached to IPMMU domain\n");
+		ret = -EEXIST;
+		goto out;
+	}
+
+	/*
+	 * Check to see if a context bank (iommu_domain) already exists for
+	 * this Xen domain under the same IPMMU
+	 */
+	io_domain = ipmmu_vmsa_get_domain(d, dev);
+	if (!io_domain) {
+		domain = xzalloc(struct ipmmu_vmsa_domain);
+		if (!domain) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		spin_lock_init(&domain->lock);
+
+		domain->d = d;
+		domain->context_id = to_vmsa_domain(xen_domain->base_context)->context_id;
+
+		io_domain = &domain->io_domain;
+
+		/* Chain the new context to the Xen domain */
+		list_add(&io_domain->list, &xen_domain->contexts);
+	}
+
+	ret = ipmmu_attach_device(io_domain, dev);
+	if (ret) {
+		if (io_domain->ref.counter == 0)
+			ipmmu_vmsa_destroy_domain(io_domain);
+	} else {
+		atomic_inc(&io_domain->ref);
+		dev_iommu_domain(dev) = io_domain;
+	}
+
+out:
+	spin_unlock(&xen_domain->lock);
+
+	return ret;
+}
+
+static int ipmmu_vmsa_deassign_dev(struct domain *d, struct device *dev)
+{
+	struct ipmmu_vmsa_xen_domain *xen_domain = dom_iommu(d)->arch.priv;
+	struct iommu_domain *io_domain = dev_iommu_domain(dev);
+
+	if (!io_domain || to_vmsa_domain(io_domain)->d != d) {
+		dev_err(dev, " not attached to domain %d\n", d->domain_id);
+		return -ESRCH;
+	}
+
+	spin_lock(&xen_domain->lock);
+
+	ipmmu_detach_device(io_domain, dev);
+	dev_iommu_domain(dev) = NULL;
+	atomic_dec(&io_domain->ref);
+
+	if (io_domain->ref.counter == 0)
+		ipmmu_vmsa_destroy_domain(io_domain);
+
+	spin_unlock(&xen_domain->lock);
+
+	return 0;
+}
+
+static int ipmmu_vmsa_reassign_dev(struct domain *s, struct domain *t,
+				 u8 devfn,  struct device *dev)
+{
+	int ret = 0;
+
+	/* Don't allow remapping on other domain than hwdom */
+	if (t && t != hardware_domain)
+		return -EPERM;
+
+	if (t == s)
+		return 0;
+
+	ret = ipmmu_vmsa_deassign_dev(s, dev);
+	if (ret)
+		return ret;
+
+	if (t) {
+		/* No flags are defined for ARM. */
+		ret = ipmmu_vmsa_assign_dev(t, devfn, dev, 0);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int ipmmu_vmsa_alloc_page_table(struct domain *d)
+{
+	struct ipmmu_vmsa_xen_domain *xen_domain = dom_iommu(d)->arch.priv;
+	struct ipmmu_vmsa_domain *domain;
+	struct ipmmu_vmsa_device *root;
+	int ret;
+
+	if (xen_domain->base_context)
+		return 0;
+
+	root = ipmmu_find_root(NULL);
+	if (!root) {
+		printk("d%d: Unable to locate root IPMMU\n", d->domain_id);
+		return -EAGAIN;
+	}
+
+	domain = xzalloc(struct ipmmu_vmsa_domain);
+	if (!domain)
+		return -ENOMEM;
+
+	spin_lock_init(&domain->lock);
+	INIT_LIST_HEAD(&domain->io_domain.list);
+	domain->d = d;
+	domain->root = root;
+
+	spin_lock(&xen_domain->lock);
+	ret = ipmmu_domain_init_context(domain);
+	if (ret < 0) {
+		dev_err(root->dev, "d%d: Unable to initialize IPMMU context\n",
+				d->domain_id);
+		spin_unlock(&xen_domain->lock);
+		xfree(domain);
+		return ret;
+	}
+	xen_domain->base_context = &domain->io_domain;
+	spin_unlock(&xen_domain->lock);
+
+	return 0;
+}
+
+static int ipmmu_vmsa_domain_init(struct domain *d, bool use_iommu)
+{
+	struct ipmmu_vmsa_xen_domain *xen_domain;
+
+	xen_domain = xzalloc(struct ipmmu_vmsa_xen_domain);
+	if (!xen_domain)
+		return -ENOMEM;
+
+	spin_lock_init(&xen_domain->lock);
+	INIT_LIST_HEAD(&xen_domain->contexts);
+
+	dom_iommu(d)->arch.priv = xen_domain;
+
+	if (use_iommu) {
+		int ret = ipmmu_vmsa_alloc_page_table(d);
+
+		if (ret) {
+			xfree(xen_domain);
+			dom_iommu(d)->arch.priv = NULL;
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void __hwdom_init ipmmu_vmsa_hwdom_init(struct domain *d)
+{
+}
+
+static void ipmmu_vmsa_domain_teardown(struct domain *d)
+{
+	struct ipmmu_vmsa_xen_domain *xen_domain = dom_iommu(d)->arch.priv;
+
+	if (!xen_domain)
+		return;
+
+	spin_lock(&xen_domain->lock);
+	if (xen_domain->base_context) {
+		ipmmu_vmsa_destroy_domain(xen_domain->base_context);
+		xen_domain->base_context = NULL;
+	}
+	spin_unlock(&xen_domain->lock);
+
+	ASSERT(list_empty(&xen_domain->contexts));
+	xfree(xen_domain);
+	dom_iommu(d)->arch.priv = NULL;
+}
+
+static int __must_check ipmmu_vmsa_map_pages(struct domain *d,
+		unsigned long gfn, unsigned long mfn, unsigned int order,
+		unsigned int flags)
+{
+	struct ipmmu_vmsa_xen_domain *xen_domain = dom_iommu(d)->arch.priv;
+	size_t size = PAGE_SIZE * (1UL << order);
+	int ret, prot = 0;
+
+	if (!xen_domain || !xen_domain->base_context)
+		return -EINVAL;
+
+	if (flags & IOMMUF_writable)
+		prot |= IOMMU_WRITE;
+	if (flags & IOMMUF_readable)
+		prot |= IOMMU_READ;
+
+	spin_lock(&xen_domain->lock);
+	ret = ipmmu_map(xen_domain->base_context, pfn_to_paddr(gfn),
+			pfn_to_paddr(mfn), size, prot);
+	spin_unlock(&xen_domain->lock);
+
+	return ret;
+}
+
+static int __must_check ipmmu_vmsa_unmap_pages(struct domain *d,
+		unsigned long gfn, unsigned int order)
+{
+	struct ipmmu_vmsa_xen_domain *xen_domain = dom_iommu(d)->arch.priv;
+	size_t ret, size = PAGE_SIZE * (1UL << order);
+
+	if (!xen_domain || !xen_domain->base_context)
+		return -EINVAL;
+
+	spin_lock(&xen_domain->lock);
+	ret = ipmmu_unmap(xen_domain->base_context, pfn_to_paddr(gfn), size);
+	spin_unlock(&xen_domain->lock);
+
+	/*
+	 * We don't check how many bytes were actually unmapped. Otherwise we
+	 * should have raised an error every time we hit an area that isn't mapped.
+	 * And the p2m's attempt to unmap the same page twice can lead to crash or
+	 * panic. We think it is better to have corresponding warns inside
+	 * page table allocator for complaining about that rather than
+	 * breaking the whole system.
+	 */
+	return IS_ERR_VALUE(ret) ? ret : 0;
+}
+
+static void ipmmu_vmsa_dump_p2m_table(struct domain *d)
+{
+	/* TODO: This platform callback should be implemented. */
+}
+
+static const struct iommu_ops ipmmu_vmsa_iommu_ops = {
+	.init = ipmmu_vmsa_domain_init,
+	.hwdom_init = ipmmu_vmsa_hwdom_init,
+	.teardown = ipmmu_vmsa_domain_teardown,
+	.iotlb_flush = ipmmu_vmsa_iotlb_flush,
+	.assign_device = ipmmu_vmsa_assign_dev,
+	.reassign_device = ipmmu_vmsa_reassign_dev,
+	.map_pages = ipmmu_vmsa_map_pages,
+	.unmap_pages = ipmmu_vmsa_unmap_pages,
+	.dump_p2m_table = ipmmu_vmsa_dump_p2m_table,
+};
+
+static __init const struct ipmmu_vmsa_device *find_ipmmu(const struct device *dev)
+{
+	struct ipmmu_vmsa_device *mmu;
+	bool found = false;
+
+	spin_lock(&ipmmu_devices_lock);
+	list_for_each_entry(mmu, &ipmmu_devices, list) {
+		if (mmu->dev == dev) {
+			found = true;
+			break;
+		}
+	}
+	spin_unlock(&ipmmu_devices_lock);
+
+	return (found) ? mmu : NULL;
+}
+
+static __init void populate_ipmmu_masters(const struct ipmmu_vmsa_device *mmu)
+{
+	struct dt_device_node *np;
+
+	dt_for_each_device_node(dt_host, np) {
+		if (mmu->dev->of_node != dt_parse_phandle(np, "iommus", 0))
+			continue;
+
+		/* Let Xen know that the device is protected by an IPMMU */
+		dt_device_set_protected(np);
+
+		dev_notice(mmu->dev, "found master device %s\n", dt_node_full_name(np));
+	}
+}
+
+/* TODO: What to do if we failed to init cache/root IPMMU? */
+static __init int ipmmu_vmsa_init(struct dt_device_node *dev,
+				   const void *data)
+{
+	int rc;
+	const struct ipmmu_vmsa_device *mmu;
+	static bool set_ops_done = false;
+
+	/*
+	 * Even if the device can't be initialized, we don't want to
+	 * give the IPMMU device to dom0.
+	 */
+	dt_device_set_used_by(dev, DOMID_XEN);
+
+	rc = ipmmu_probe(dev);
+	if (rc) {
+		dev_err(&dev->dev, "failed to init IPMMU\n");
+		return rc;
+	}
+
+	/*
+	 * Since IPMMU is composed of two parts (a number of cache IPMMUs and
+	 * the root IPMMU) this function will be called more than once.
+ * Use the flag below to avoid setting IOMMU ops if they are already set.
+	 */
+	if (!set_ops_done) {
+		iommu_set_ops(&ipmmu_vmsa_iommu_ops);
+		set_ops_done = true;
+	}
+
+	/* Find the last IPMMU added. */
+	mmu = find_ipmmu(dt_to_dev(dev));
+	BUG_ON(mmu == NULL);
+
+	/* Mark all masters that connected to the last IPMMU as protected. */
+	populate_ipmmu_masters(mmu);
+
+	/*
+	 * The IPMMU can't utilize P2M table since it doesn't use the same
+	 * page-table format as the CPU.
+	 */
+	if (iommu_hap_pt_share) {
+		iommu_hap_pt_share = false;
+		dev_notice(&dev->dev,
+			"disable sharing P2M table between the CPU and IPMMU\n");
+	}
+
+	return 0;
+}
+
+DT_DEVICE_START(ipmmu, "Renesas IPMMU-VMSA", DEVICE_IOMMU)
+	.dt_match = ipmmu_of_ids,
+	.init = ipmmu_vmsa_init,
+DT_DEVICE_END