
[v4,02/26] iommu/sva: Manage process address spaces

Message ID 20200224182401.353359-3-jean-philippe@linaro.org
State New, archived
Series iommu: Shared Virtual Addressing and SMMUv3 support

Commit Message

Jean-Philippe Brucker Feb. 24, 2020, 6:23 p.m. UTC
From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>

Add a small library to help IOMMU drivers manage process address spaces
bound to their devices. Register an MMU notifier to track modifications
to each address space bound to one or more devices.

IOMMU drivers must implement the io_mm_ops and can then use the helpers
provided by this library to easily implement the SVA API introduced by
commit 26b25a2b98e4. The io_mm_ops are:

void *alloc(struct mm_struct *)
  Allocate a PASID context private to the IOMMU driver. There is a
  single context per mm. IOMMU drivers may perform arch-specific
  operations in there, for example pinning down a CPU ASID (on Arm).

int attach(struct device *, int pasid, void *ctx, bool attach_domain)
  Attach a context to the device, by setting up the PASID table entry.

void invalidate(struct device *, int pasid, void *ctx,
                unsigned long vaddr, size_t size)
  Invalidate TLB entries for this address range.

void detach(struct device *, int pasid, void *ctx, bool detach_domain)
  Detach a context from the device, by clearing the PASID table entry
  and invalidating cached entries.

void free(void *ctx)
  Free a context.
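
For illustration, a driver implementing these ops would wire them up
roughly as follows (a sketch only; the my_*() callbacks are placeholders
for driver-specific code):

static const struct io_mm_ops my_io_mm_ops = {
	.alloc		= my_ctx_alloc,		/* allocate PASID context */
	.attach		= my_ctx_attach,	/* write PASID table entry */
	.invalidate	= my_ctx_invalidate,	/* invalidate TLB range */
	.detach		= my_ctx_detach,	/* clear PASID table entry */
	.release	= my_ctx_release,	/* free PASID context */
};

struct iommu_sva *my_sva_bind(struct device *dev, struct mm_struct *mm,
			      void *drvdata)
{
	return iommu_sva_bind_generic(dev, mm, &my_io_mm_ops, drvdata);
}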

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
---
 drivers/iommu/Kconfig     |   7 +
 drivers/iommu/Makefile    |   1 +
 drivers/iommu/iommu-sva.c | 561 ++++++++++++++++++++++++++++++++++++++
 drivers/iommu/iommu-sva.h |  64 +++++
 drivers/iommu/iommu.c     |   1 +
 include/linux/iommu.h     |   3 +
 6 files changed, 637 insertions(+)
 create mode 100644 drivers/iommu/iommu-sva.c
 create mode 100644 drivers/iommu/iommu-sva.h

Comments

Jonathan Cameron Feb. 26, 2020, 12:35 p.m. UTC | #1
On Mon, 24 Feb 2020 19:23:37 +0100
Jean-Philippe Brucker <jean-philippe@linaro.org> wrote:

> From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
> 
> Add a small library to help IOMMU drivers manage process address spaces
> bound to their devices. Register an MMU notifier to track modifications
> to each address space bound to one or more devices.
> 
> IOMMU drivers must implement the io_mm_ops and can then use the helpers
> provided by this library to easily implement the SVA API introduced by
> commit 26b25a2b98e4. The io_mm_ops are:
> 
> void *alloc(struct mm_struct *)
>   Allocate a PASID context private to the IOMMU driver. There is a
>   single context per mm. IOMMU drivers may perform arch-specific
>   operations in there, for example pinning down a CPU ASID (on Arm).
> 
> int attach(struct device *, int pasid, void *ctx, bool attach_domain)
>   Attach a context to the device, by setting up the PASID table entry.
> 
> void invalidate(struct device *, int pasid, void *ctx,
>                 unsigned long vaddr, size_t size)
>   Invalidate TLB entries for this address range.
> 
> void detach(struct device *, int pasid, void *ctx, bool detach_domain)
>   Detach a context from the device, by clearing the PASID table entry
>   and invalidating cached entries.
> 
> void free(void *ctx)
>   Free a context.
> 
> Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>

Hi Jean-Philippe,

A few trivial comments from me inline.  Otherwise this all seems sensible.

Jonathan

> ---
>  drivers/iommu/Kconfig     |   7 +
>  drivers/iommu/Makefile    |   1 +
>  drivers/iommu/iommu-sva.c | 561 ++++++++++++++++++++++++++++++++++++++
>  drivers/iommu/iommu-sva.h |  64 +++++
>  drivers/iommu/iommu.c     |   1 +
>  include/linux/iommu.h     |   3 +
>  6 files changed, 637 insertions(+)
>  create mode 100644 drivers/iommu/iommu-sva.c
>  create mode 100644 drivers/iommu/iommu-sva.h
> 
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index d2fade984999..acca20e2da2f 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -102,6 +102,13 @@ config IOMMU_DMA
>  	select IRQ_MSI_IOMMU
>  	select NEED_SG_DMA_LENGTH
>  
> +# Shared Virtual Addressing library
> +config IOMMU_SVA
> +	bool
> +	select IOASID
> +	select IOMMU_API
> +	select MMU_NOTIFIER
> +
>  config FSL_PAMU
>  	bool "Freescale IOMMU support"
>  	depends on PCI
> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
> index 9f33fdb3bb05..40c800dd4e3e 100644
> --- a/drivers/iommu/Makefile
> +++ b/drivers/iommu/Makefile
> @@ -37,3 +37,4 @@ obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
>  obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
>  obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
>  obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
> +obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
> diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
> new file mode 100644
> index 000000000000..64f1d1c82383
> --- /dev/null
> +++ b/drivers/iommu/iommu-sva.c
> @@ -0,0 +1,561 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Manage PASIDs and bind process address spaces to devices.
> + *
> + * Copyright (C) 2018 ARM Ltd.

Worth updating the date?

> + */
> +
> +#include <linux/idr.h>
> +#include <linux/ioasid.h>
> +#include <linux/iommu.h>
> +#include <linux/sched/mm.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +
> +#include "iommu-sva.h"
> +
> +/**
> + * DOC: io_mm model
> + *
> + * The io_mm keeps track of process address spaces shared between CPU and IOMMU.
> + * The following example illustrates the relation between structures
> + * iommu_domain, io_mm and iommu_sva. The iommu_sva struct is a bond between
> + * io_mm and device. A device can have multiple io_mm and an io_mm may be bound
> + * to multiple devices.
> + *              ___________________________
> + *             |  IOMMU domain A           |
> + *             |  ________________         |
> + *             | |  IOMMU group   |        +------- io_pgtables
> + *             | |                |        |
> + *             | |   dev 00:00.0 ----+------- bond 1 --- io_mm X
> + *             | |________________|   \    |
> + *             |                       '----- bond 2 ---.
> + *             |___________________________|             \
> + *              ___________________________               \
> + *             |  IOMMU domain B           |             io_mm Y
> + *             |  ________________         |             / /
> + *             | |  IOMMU group   |        |            / /
> + *             | |                |        |           / /
> + *             | |   dev 00:01.0 ------------ bond 3 -' /
> + *             | |   dev 00:01.1 ------------ bond 4 --'
> + *             | |________________|        |
> + *             |                           +------- io_pgtables
> + *             |___________________________|
> + *
> + * In this example, device 00:00.0 is in domain A, devices 00:01.* are in domain
> + * B. All devices within the same domain access the same address spaces. Device
> + * 00:00.0 accesses address spaces X and Y, each corresponding to an mm_struct.
> + * Devices 00:01.* only access address space Y. In addition each
> + * IOMMU_DOMAIN_DMA domain has a private address space, io_pgtable, that is
> + * managed with iommu_map()/iommu_unmap(), and isn't shared with the CPU MMU.
> + *
> + * To obtain the above configuration, users would for instance issue the
> + * following calls:
> + *
> + *     iommu_sva_bind_device(dev 00:00.0, mm X, ...) -> bond 1
> + *     iommu_sva_bind_device(dev 00:00.0, mm Y, ...) -> bond 2
> + *     iommu_sva_bind_device(dev 00:01.0, mm Y, ...) -> bond 3
> + *     iommu_sva_bind_device(dev 00:01.1, mm Y, ...) -> bond 4
> + *
> + * A single Process Address Space ID (PASID) is allocated for each mm. In the
> + * example, devices use PASID 1 to read/write into address space X and PASID 2
> + * to read/write into address space Y. Calling iommu_sva_get_pasid() on bond 1
> + * returns 1, and calling it on bonds 2-4 returns 2.
> + *
> + * Hardware tables describing this configuration in the IOMMU would typically
> + * look like this:
> + *
> + *                                PASID tables
> + *                                 of domain A
> + *                              .->+--------+
> + *                             / 0 |        |-------> io_pgtable
> + *                            /    +--------+
> + *            Device tables  /   1 |        |-------> pgd X
> + *              +--------+  /      +--------+
> + *      00:00.0 |      A |-'     2 |        |--.
> + *              +--------+         +--------+   \
> + *              :        :       3 |        |    \
> + *              +--------+         +--------+     --> pgd Y
> + *      00:01.0 |      B |--.                    /
> + *              +--------+   \                  |
> + *      00:01.1 |      B |----+   PASID tables  |
> + *              +--------+     \   of domain B  |
> + *                              '->+--------+   |
> + *                               0 |        |-- | --> io_pgtable
> + *                                 +--------+   |
> + *                               1 |        |   |
> + *                                 +--------+   |
> + *                               2 |        |---'
> + *                                 +--------+
> + *                               3 |        |
> + *                                 +--------+
> + *
> + * With this model, a single call binds all devices in a given domain to an
> + * address space. Other devices in the domain will get the same bond implicitly.
> + * However, users must issue one bind() for each device, because IOMMUs may
> + * implement SVA differently. Furthermore, mandating one bind() per device
> + * allows the driver to perform sanity-checks on device capabilities.

> + *
> + * In some IOMMUs, one entry of the PASID table (typically the first one) can
> + * hold non-PASID translations. In this case PASID 0 is reserved and the first
> + * entry points to the io_pgtable pointer. In other IOMMUs the io_pgtable
> + * pointer is held in the device table and PASID 0 is available to the
> + * allocator.

Is it worth hammering home in here that we can only do this because the PASID space
is global (with the exception of PASID 0)?  It's a convenient simplification, but not
necessarily a hardware restriction, so perhaps we should remind people of that somewhere in here?

> + */
> +
> +struct io_mm {
> +	struct list_head		devices;
> +	struct mm_struct		*mm;
> +	struct mmu_notifier		notifier;
> +
> +	/* Late initialization */
> +	const struct io_mm_ops		*ops;
> +	void				*ctx;
> +	int				pasid;
> +};
> +
> +#define to_io_mm(mmu_notifier)	container_of(mmu_notifier, struct io_mm, notifier)
> +#define to_iommu_bond(handle)	container_of(handle, struct iommu_bond, sva)

Code-ordering-wise, do we want this after the definition of iommu_bond?

For both of these it's a bit non-obvious what they come 'from'.
I wouldn't naturally assume to_io_mm gets me from a notifier to the io_mm,
for example.  Not sure it matters though if these are only used in a few
places.
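
A more explicit spelling would make the direction obvious, e.g. (just a
naming sketch):

#define mn_to_io_mm(mn)		container_of(mn, struct io_mm, notifier)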

> +
> +struct iommu_bond {
> +	struct iommu_sva		sva;
> +	struct io_mm __rcu		*io_mm;
> +
> +	struct list_head		mm_head;
> +	void				*drvdata;
> +	struct rcu_head			rcu_head;
> +	refcount_t			refs;
> +};
> +
> +static DECLARE_IOASID_SET(shared_pasid);
> +
> +static struct mmu_notifier_ops iommu_mmu_notifier_ops;
> +
> +/*
> + * Serializes modifications of bonds.
> + * Lock order: Device SVA mutex; global SVA mutex; IOASID lock
> + */
> +static DEFINE_MUTEX(iommu_sva_lock);
> +
> +struct io_mm_alloc_params {
> +	const struct io_mm_ops *ops;
> +	int min_pasid, max_pasid;
> +};
> +
> +static struct mmu_notifier *io_mm_alloc(struct mm_struct *mm, void *privdata)
> +{
> +	int ret;
> +	struct io_mm *io_mm;
> +	struct io_mm_alloc_params *params = privdata;
> +
> +	io_mm = kzalloc(sizeof(*io_mm), GFP_KERNEL);
> +	if (!io_mm)
> +		return ERR_PTR(-ENOMEM);
> +
> +	io_mm->mm = mm;
> +	io_mm->ops = params->ops;
> +	INIT_LIST_HEAD(&io_mm->devices);
> +
> +	io_mm->pasid = ioasid_alloc(&shared_pasid, params->min_pasid,
> +				    params->max_pasid, io_mm->mm);
> +	if (io_mm->pasid == INVALID_IOASID) {
> +		ret = -ENOSPC;
> +		goto err_free_io_mm;
> +	}
> +
> +	io_mm->ctx = params->ops->alloc(mm);
> +	if (IS_ERR(io_mm->ctx)) {
> +		ret = PTR_ERR(io_mm->ctx);
> +		goto err_free_pasid;
> +	}
> +	return &io_mm->notifier;
> +
> +err_free_pasid:
> +	ioasid_free(io_mm->pasid);
> +err_free_io_mm:
> +	kfree(io_mm);
> +	return ERR_PTR(ret);
> +}
> +
> +static void io_mm_free(struct mmu_notifier *mn)
> +{
> +	struct io_mm *io_mm = to_io_mm(mn);
> +
> +	WARN_ON(!list_empty(&io_mm->devices));
> +
> +	io_mm->ops->release(io_mm->ctx);
> +	ioasid_free(io_mm->pasid);
> +	kfree(io_mm);
> +}
> +
> +/*
> + * io_mm_get - Allocate an io_mm or get the existing one for the given mm
> + * @mm: the mm
> + * @ops: callbacks for the IOMMU driver
> + * @min_pasid: minimum PASID value (inclusive)
> + * @max_pasid: maximum PASID value (inclusive)
> + *
> + * Returns a valid io_mm or an error pointer.
> + */
> +static struct io_mm *io_mm_get(struct mm_struct *mm,
> +			       const struct io_mm_ops *ops,
> +			       int min_pasid, int max_pasid)
> +{
> +	struct io_mm *io_mm;
> +	struct mmu_notifier *mn;
> +	struct io_mm_alloc_params params = {
> +		.ops		= ops,
> +		.min_pasid	= min_pasid,
> +		.max_pasid	= max_pasid,
> +	};
> +
> +	/*
> +	 * A single notifier can exist for this (ops, mm) pair. Allocate it if
> +	 * necessary.
> +	 */
> +	mn = mmu_notifier_get(&iommu_mmu_notifier_ops, mm, &params);
> +	if (IS_ERR(mn))
> +		return ERR_CAST(mn);
> +	io_mm = to_io_mm(mn);
> +
> +	if (WARN_ON(io_mm->ops != ops)) {
> +		mmu_notifier_put(mn);
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	return io_mm;
> +}
> +
> +static void io_mm_put(struct io_mm *io_mm)
> +{
> +	mmu_notifier_put(&io_mm->notifier);
> +}
> +
> +static struct iommu_sva *
> +io_mm_attach(struct device *dev, struct io_mm *io_mm, void *drvdata)
> +{
> +	int ret = 0;

I'm fairly sure this is set in all paths below.  Now, of course the
compiler might not think that, in which case fair enough :)

> +	bool attach_domain = true;
> +	struct iommu_bond *bond, *tmp;
> +	struct iommu_domain *domain, *other;
> +	struct iommu_sva_param *param = dev->iommu_param->sva_param;
> +
> +	domain = iommu_get_domain_for_dev(dev);
> +
> +	bond = kzalloc(sizeof(*bond), GFP_KERNEL);
> +	if (!bond)
> +		return ERR_PTR(-ENOMEM);
> +
> +	bond->sva.dev	= dev;
> +	bond->drvdata	= drvdata;
> +	refcount_set(&bond->refs, 1);
> +	RCU_INIT_POINTER(bond->io_mm, io_mm);
> +
> +	mutex_lock(&iommu_sva_lock);
> +	/* Is it already bound to the device or domain? */
> +	list_for_each_entry(tmp, &io_mm->devices, mm_head) {
> +		if (tmp->sva.dev != dev) {
> +			other = iommu_get_domain_for_dev(tmp->sva.dev);
> +			if (domain == other)
> +				attach_domain = false;
> +
> +			continue;
> +		}
> +
> +		if (WARN_ON(tmp->drvdata != drvdata)) {
> +			ret = -EINVAL;
> +			goto err_free;
> +		}
> +
> +		/*
> +		 * Hold a single io_mm reference per bond. Note that we can't
> +		 * return an error after this, otherwise the caller would drop
> +		 * an additional reference to the io_mm.
> +		 */
> +		refcount_inc(&tmp->refs);
> +		io_mm_put(io_mm);
> +		kfree(bond);

Freeing outside the lock would be ever so slightly more logical, given we
allocated before taking the lock.

> +		mutex_unlock(&iommu_sva_lock);
> +		return &tmp->sva;
> +	}
> +
> +	list_add_rcu(&bond->mm_head, &io_mm->devices);
> +	param->nr_bonds++;
> +	mutex_unlock(&iommu_sva_lock);
> +
> +	ret = io_mm->ops->attach(bond->sva.dev, io_mm->pasid, io_mm->ctx,
> +				 attach_domain);
> +	if (ret)
> +		goto err_remove;
> +
> +	return &bond->sva;
> +
> +err_remove:
> +	/*
> +	 * At this point concurrent threads may have started to access the
> +	 * io_mm->devices list in order to invalidate address ranges, which
> +	 * requires to free the bond via kfree_rcu()
> +	 */
> +	mutex_lock(&iommu_sva_lock);
> +	param->nr_bonds--;
> +	list_del_rcu(&bond->mm_head);
> +
> +err_free:
> +	mutex_unlock(&iommu_sva_lock);
> +	kfree_rcu(bond, rcu_head);

I don't suppose it really matters, but we don't need the RCU free if
we follow the err_free goto.  Perhaps it would be cleaner in this case
not to use a unified exit path, but to handle that case inline?
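
Something like this, perhaps (a sketch of that suggestion only; at this
point the bond hasn't been added to io_mm->devices yet, so no RCU reader
can see it and a plain kfree() is safe):

		if (WARN_ON(tmp->drvdata != drvdata)) {
			mutex_unlock(&iommu_sva_lock);
			kfree(bond);
			return ERR_PTR(-EINVAL);
		}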

> +	return ERR_PTR(ret);
> +}
> +
> +static void io_mm_detach_locked(struct iommu_bond *bond)
> +{
> +	struct io_mm *io_mm;
> +	struct iommu_bond *tmp;
> +	bool detach_domain = true;
> +	struct iommu_domain *domain, *other;
> +
> +	io_mm = rcu_dereference_protected(bond->io_mm,
> +					  lockdep_is_held(&iommu_sva_lock));
> +	if (!io_mm)
> +		return;
> +
> +	domain = iommu_get_domain_for_dev(bond->sva.dev);
> +
> +	/* Are other devices in the same domain still attached to this mm? */
> +	list_for_each_entry(tmp, &io_mm->devices, mm_head) {
> +		if (tmp == bond)
> +			continue;
> +		other = iommu_get_domain_for_dev(tmp->sva.dev);
> +		if (domain == other) {
> +			detach_domain = false;
> +			break;
> +		}
> +	}
> +
> +	io_mm->ops->detach(bond->sva.dev, io_mm->pasid, io_mm->ctx,
> +			   detach_domain);
> +
> +	list_del_rcu(&bond->mm_head);
> +	RCU_INIT_POINTER(bond->io_mm, NULL);
> +
> +	/* Free after RCU grace period */
> +	io_mm_put(io_mm);
> +}
> +
> +/*
> + * io_mm_release - release MMU notifier
> + *
> + * Called when the mm exits. Some devices may still be bound to the io_mm. A few
> + * things need to be done before it is safe to release:
> + *
> + * - Tell the device driver to stop using this PASID.
> + * - Clear the PASID table and invalidate TLBs.
> + * - Drop all references to this io_mm.
> + */
> +static void io_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> +{
> +	struct iommu_bond *bond, *next;
> +	struct io_mm *io_mm = to_io_mm(mn);
> +
> +	mutex_lock(&iommu_sva_lock);
> +	list_for_each_entry_safe(bond, next, &io_mm->devices, mm_head) {
> +		struct device *dev = bond->sva.dev;
> +		struct iommu_sva *sva = &bond->sva;
> +
> +		if (sva->ops && sva->ops->mm_exit &&
> +		    sva->ops->mm_exit(dev, sva, bond->drvdata))
> +			dev_WARN(dev, "possible leak of PASID %u",
> +				 io_mm->pasid);
> +
> +		/* unbind() frees the bond, we just detach it */
> +		io_mm_detach_locked(bond);
> +	}
> +	mutex_unlock(&iommu_sva_lock);
> +}
> +
> +static void io_mm_invalidate_range(struct mmu_notifier *mn,
> +				   struct mm_struct *mm, unsigned long start,
> +				   unsigned long end)
> +{
> +	struct iommu_bond *bond;
> +	struct io_mm *io_mm = to_io_mm(mn);
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(bond, &io_mm->devices, mm_head)
> +		io_mm->ops->invalidate(bond->sva.dev, io_mm->pasid, io_mm->ctx,
> +				       start, end - start);
> +	rcu_read_unlock();
> +}
> +
> +static struct mmu_notifier_ops iommu_mmu_notifier_ops = {
> +	.alloc_notifier		= io_mm_alloc,
> +	.free_notifier		= io_mm_free,
> +	.release		= io_mm_release,
> +	.invalidate_range	= io_mm_invalidate_range,
> +};
> +
> +struct iommu_sva *
> +iommu_sva_bind_generic(struct device *dev, struct mm_struct *mm,
> +		       const struct io_mm_ops *ops, void *drvdata)
> +{
> +	struct io_mm *io_mm;
> +	struct iommu_sva *handle;
> +	struct iommu_param *param = dev->iommu_param;
> +
> +	if (!param)
> +		return ERR_PTR(-ENODEV);
> +
> +	mutex_lock(&param->sva_lock);
> +	if (!param->sva_param) {
> +		handle = ERR_PTR(-ENODEV);
> +		goto out_unlock;
> +	}
> +
> +	io_mm = io_mm_get(mm, ops, param->sva_param->min_pasid,
> +			  param->sva_param->max_pasid);
> +	if (IS_ERR(io_mm)) {
> +		handle = ERR_CAST(io_mm);
> +		goto out_unlock;
> +	}
> +
> +	handle = io_mm_attach(dev, io_mm, drvdata);
> +	if (IS_ERR(handle))
> +		io_mm_put(io_mm);
> +
> +out_unlock:
> +	mutex_unlock(&param->sva_lock);
> +	return handle;
> +}
> +EXPORT_SYMBOL_GPL(iommu_sva_bind_generic);
> +
> +static void iommu_sva_unbind_locked(struct iommu_bond *bond)
> +{
> +	struct device *dev = bond->sva.dev;
> +	struct iommu_sva_param *param = dev->iommu_param->sva_param;
> +
> +	if (!refcount_dec_and_test(&bond->refs))
> +		return;
> +
> +	io_mm_detach_locked(bond);
> +	param->nr_bonds--;
> +	kfree_rcu(bond, rcu_head);
> +}
> +
> +void iommu_sva_unbind_generic(struct iommu_sva *handle)
> +{
> +	struct iommu_param *param = handle->dev->iommu_param;
> +
> +	if (WARN_ON(!param))
> +		return;
> +
> +	mutex_lock(&param->sva_lock);
> +	mutex_lock(&iommu_sva_lock);
> +	iommu_sva_unbind_locked(to_iommu_bond(handle));
> +	mutex_unlock(&iommu_sva_lock);
> +	mutex_unlock(&param->sva_lock);
> +}
> +EXPORT_SYMBOL_GPL(iommu_sva_unbind_generic);
> +
> +/**
> + * iommu_sva_enable() - Enable Shared Virtual Addressing for a device
> + * @dev: the device
> + * @sva_param: the parameters.
> + *
> + * Called by an IOMMU driver to setup the SVA parameters
> + * @sva_param is duplicated and can be freed when this function returns.
> + *
> + * Return 0 if initialization succeeded, or an error.
> + */
> +int iommu_sva_enable(struct device *dev, struct iommu_sva_param *sva_param)
> +{
> +	int ret;
> +	struct iommu_sva_param *new_param;
> +	struct iommu_param *param = dev->iommu_param;
> +
> +	if (!param)
> +		return -ENODEV;
> +
> +	new_param = kmemdup(sva_param, sizeof(*new_param), GFP_KERNEL);
> +	if (!new_param)
> +		return -ENOMEM;
> +
> +	mutex_lock(&param->sva_lock);
> +	if (param->sva_param) {
> +		ret = -EEXIST;
> +		goto err_unlock;
> +	}
> +
> +	dev->iommu_param->sva_param = new_param;
> +	mutex_unlock(&param->sva_lock);
> +	return 0;
> +
> +err_unlock:
> +	mutex_unlock(&param->sva_lock);
> +	kfree(new_param);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_sva_enable);
> +
> +/**
> + * iommu_sva_disable() - Disable Shared Virtual Addressing for a device
> + * @dev: the device
> + *
> + * IOMMU drivers call this to disable SVA.
> + */
> +int iommu_sva_disable(struct device *dev)
> +{
> +	int ret = 0;
> +	struct iommu_param *param = dev->iommu_param;
> +
> +	if (!param)
> +		return -EINVAL;
> +
> +	mutex_lock(&param->sva_lock);
> +	if (!param->sva_param) {
> +		ret = -ENODEV;
> +		goto out_unlock;
> +	}
> +
> +	/* Require that all contexts are unbound */
> +	if (param->sva_param->nr_bonds) {
> +		ret = -EBUSY;
> +		goto out_unlock;
> +	}
> +
> +	kfree(param->sva_param);
> +	param->sva_param = NULL;
> +out_unlock:
> +	mutex_unlock(&param->sva_lock);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_sva_disable);
> +
> +bool iommu_sva_enabled(struct device *dev)
> +{
> +	bool enabled;
> +	struct iommu_param *param = dev->iommu_param;
> +
> +	if (!param)
> +		return false;
> +
> +	mutex_lock(&param->sva_lock);
> +	enabled = !!param->sva_param;
> +	mutex_unlock(&param->sva_lock);
> +	return enabled;
> +}
> +EXPORT_SYMBOL_GPL(iommu_sva_enabled);
> +
> +int iommu_sva_get_pasid_generic(struct iommu_sva *handle)
> +{
> +	struct io_mm *io_mm;
> +	int pasid = IOMMU_PASID_INVALID;
> +	struct iommu_bond *bond = to_iommu_bond(handle);
> +
> +	rcu_read_lock();
> +	io_mm = rcu_dereference(bond->io_mm);
> +	if (io_mm)
> +		pasid = io_mm->pasid;
> +	rcu_read_unlock();
> +	return pasid;
> +}
> +EXPORT_SYMBOL_GPL(iommu_sva_get_pasid_generic);
> diff --git a/drivers/iommu/iommu-sva.h b/drivers/iommu/iommu-sva.h
> new file mode 100644
> index 000000000000..dd55c2db0936
> --- /dev/null
> +++ b/drivers/iommu/iommu-sva.h
> @@ -0,0 +1,64 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * SVA library for IOMMU drivers
> + */
> +#ifndef _IOMMU_SVA_H
> +#define _IOMMU_SVA_H
> +
> +#include <linux/iommu.h>
> +#include <linux/kref.h>
> +#include <linux/mmu_notifier.h>
> +
> +struct io_mm_ops {
> +	/* Allocate a PASID context for an mm */
> +	void *(*alloc)(struct mm_struct *mm);
> +
> +	/*
> +	 * Attach a PASID context to a device. Write the entry into the PASID
> +	 * table.
> +	 *
> +	 * @attach_domain is true when no other device in the IOMMU domain is
> +	 *   already attached to this context. IOMMU drivers that share the
> +	 *   PASID tables within a domain don't need to write the PASID entry
> +	 *   when @attach_domain is false.
> +	 */
> +	int (*attach)(struct device *dev, int pasid, void *ctx,
> +		      bool attach_domain);
> +
> +	/*
> +	 * Detach a PASID context from a device. Clear the entry from the PASID
> +	 * table and invalidate if necessary.
> +	 *
> +	 * @detach_domain is true when no other device in the IOMMU domain is
> +	 *   still attached to this context. IOMMU drivers that share the PASID
> +	 *   table within a domain don't need to clear the PASID entry when
> +	 *   @detach_domain is false, only invalidate the caches.
> +	 */
> +	void (*detach)(struct device *dev, int pasid, void *ctx,
> +		       bool detach_domain);
> +
> +	/* Invalidate a range of addresses. Cannot sleep. */
> +	void (*invalidate)(struct device *dev, int pasid, void *ctx,
> +			   unsigned long vaddr, size_t size);
> +
> +	/* Free a context. Cannot sleep. */
> +	void (*release)(void *ctx);
> +};
> +
> +struct iommu_sva_param {
> +	u32			min_pasid;
> +	u32			max_pasid;
> +	int			nr_bonds;
> +};
> +
> +struct iommu_sva *
> +iommu_sva_bind_generic(struct device *dev, struct mm_struct *mm,
> +		       const struct io_mm_ops *ops, void *drvdata);
> +void iommu_sva_unbind_generic(struct iommu_sva *handle);
> +int iommu_sva_get_pasid_generic(struct iommu_sva *handle);
> +
> +int iommu_sva_enable(struct device *dev, struct iommu_sva_param *sva_param);
> +int iommu_sva_disable(struct device *dev);
> +bool iommu_sva_enabled(struct device *dev);
> +
> +#endif /* _IOMMU_SVA_H */
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 3e3528436e0b..c8bd972c1788 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -164,6 +164,7 @@ static struct iommu_param *iommu_get_dev_param(struct device *dev)
>  		return NULL;
>  
>  	mutex_init(&param->lock);
> +	mutex_init(&param->sva_lock);
>  	dev->iommu_param = param;
>  	return param;
>  }
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 1739f8a7a4b4..83397ae88d2d 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -368,6 +368,7 @@ struct iommu_fault_param {
>   * struct iommu_param - collection of per-device IOMMU data
>   *
>   * @fault_param: IOMMU detected device fault reporting data
> + * @sva_param: IOMMU parameter for SVA
>   *
>   * TODO: migrate other per device data pointers under iommu_dev_data, e.g.
>   *	struct iommu_group	*iommu_group;
> @@ -376,6 +377,8 @@ struct iommu_fault_param {
>  struct iommu_param {
>  	struct mutex lock;
>  	struct iommu_fault_param *fault_param;
> +	struct mutex sva_lock;
> +	struct iommu_sva_param *sva_param;
>  };
>  
>  int  iommu_device_register(struct iommu_device *iommu);
Jacob Pan Feb. 26, 2020, 7:13 p.m. UTC | #2
Hi Jean,

A few comments inline. I am also trying to converge on the common SVA
APIs. I sent out the first step, w/o the IO page fault handling and the
generic ops you have here.

On Mon, 24 Feb 2020 19:23:37 +0100
Jean-Philippe Brucker <jean-philippe@linaro.org> wrote:

> From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
> 
> Add a small library to help IOMMU drivers manage process address
> spaces bound to their devices. Register an MMU notifier to track
> modifications to each address space bound to one or more devices.
> 
> IOMMU drivers must implement the io_mm_ops and can then use the
> helpers provided by this library to easily implement the SVA API
> introduced by commit 26b25a2b98e4. The io_mm_ops are:
> 
> void *alloc(struct mm_struct *)
>   Allocate a PASID context private to the IOMMU driver. There is a
>   single context per mm. IOMMU drivers may perform arch-specific
>   operations in there, for example pinning down a CPU ASID (on Arm).
> 
> int attach(struct device *, int pasid, void *ctx, bool attach_domain)
>   Attach a context to the device, by setting up the PASID table entry.
> 
> void invalidate(struct device *, int pasid, void *ctx,
>                 unsigned long vaddr, size_t size)
>   Invalidate TLB entries for this address range.
> 
> void detach(struct device *, int pasid, void *ctx, bool detach_domain)
>   Detach a context from the device, by clearing the PASID table entry
>   and invalidating cached entries.
> 
> void free(void *ctx)
you meant release()?

>   Free a context.

[...]

> +/**
> + * DOC: io_mm model
> + *
> + * The io_mm keeps track of process address spaces shared between CPU and IOMMU.
> + * The following example illustrates the relation between structures
> + * iommu_domain, io_mm and iommu_sva. The iommu_sva struct is a bond between
> + * io_mm and device. A device can have multiple io_mm and an io_mm may be bound
> + * to multiple devices.
> + *              ___________________________
> + *             |  IOMMU domain A           |
> + *             |  ________________         |
> + *             | |  IOMMU group   |        +------- io_pgtables
> + *             | |                |        |
> + *             | |   dev 00:00.0 ----+------- bond 1 --- io_mm X
> + *             | |________________|   \    |
> + *             |                       '----- bond 2 ---.
> + *             |___________________________|             \
> + *              ___________________________               \
> + *             |  IOMMU domain B           |             io_mm Y
> + *             |  ________________         |             / /
> + *             | |  IOMMU group   |        |            / /
> + *             | |                |        |           / /
> + *             | |   dev 00:01.0 ------------ bond 3 -' /
> + *             | |   dev 00:01.1 ------------ bond 4 --'
> + *             | |________________|        |
> + *             |                           +------- io_pgtables
> + *             |___________________________|
> + *
> + * In this example, device 00:00.0 is in domain A, devices 00:01.* are in domain
> + * B. All devices within the same domain access the same address spaces.
Hmm, devices in domain A have access to both X & Y; isn't that
contradictory?

> + * Device 00:00.0 accesses address spaces X and Y, each corresponding to an
> + * mm_struct. Devices 00:01.* only access address space Y. In addition each
> + * IOMMU_DOMAIN_DMA domain has a private address space, io_pgtable, that is
> + * managed with iommu_map()/iommu_unmap(), and isn't shared with the CPU MMU.
So this would allow IOVA and SVA to co-exist in the same address space?
I guess this is the PASID 0 for DMA requests w/o PASID. If that is the
case, perhaps this needs more explanation, since the private address space
also has a private PASID within the domain.

[...]

> +static struct iommu_sva *
> +io_mm_attach(struct device *dev, struct io_mm *io_mm, void *drvdata)
> +{
> +	int ret = 0;
> +	bool attach_domain = true;
> +	struct iommu_bond *bond, *tmp;
> +	struct iommu_domain *domain, *other;
> +	struct iommu_sva_param *param = dev->iommu_param->sva_param;
> +
> +	domain = iommu_get_domain_for_dev(dev);
> +
> +	bond = kzalloc(sizeof(*bond), GFP_KERNEL);
> +	if (!bond)
> +		return ERR_PTR(-ENOMEM);
> +
> +	bond->sva.dev	= dev;
> +	bond->drvdata	= drvdata;
> +	refcount_set(&bond->refs, 1);
> +	RCU_INIT_POINTER(bond->io_mm, io_mm);
> +
> +	mutex_lock(&iommu_sva_lock);
> +	/* Is it already bound to the device or domain? */
> +	list_for_each_entry(tmp, &io_mm->devices, mm_head) {
> +		if (tmp->sva.dev != dev) {
> +			other = iommu_get_domain_for_dev(tmp->sva.dev);
> +			if (domain == other)
> +				attach_domain = false;
> +
> +			continue;
At this point, we already know this is a new device trying to attach to
one of the io_mm's existing domains. So there is no need to continue
checking, right? Perhaps a check like this?
-               if (tmp->sva.dev != dev) {
+               if (tmp->sva.dev != dev && attach_domain) {


> +		}
> +
> +		if (WARN_ON(tmp->drvdata != drvdata)) {
> +			ret = -EINVAL;
> +			goto err_free;
> +		}
> +
> +		/*
> +		 * Hold a single io_mm reference per bond. Note that we can't
> +		 * return an error after this, otherwise the caller would drop
> +		 * an additional reference to the io_mm.
> +		 */
> +		refcount_inc(&tmp->refs);
> +		io_mm_put(io_mm);
> +		kfree(bond);
Can bond be allocated after searching for an existing bond or domain? If
so, we can avoid freeing bond here.

> +		mutex_unlock(&iommu_sva_lock);
> +		return &tmp->sva;
> +	}
> +
> +	list_add_rcu(&bond->mm_head, &io_mm->devices);
> +	param->nr_bonds++;
> +	mutex_unlock(&iommu_sva_lock);
> +
> +	ret = io_mm->ops->attach(bond->sva.dev, io_mm->pasid, io_mm->ctx,
> +				 attach_domain);
For VT-d, if a device is trying to do an SVA bind, there would not be a
DMA domain. SVA should own the entire address space, with no IOVA. So this
attach() call is for the VT-d driver to set up the first PASID table entry,
regardless of whether attach_domain is true or false?

[...]

> +static void iommu_sva_unbind_locked(struct iommu_bond *bond)
> +{
> +	struct device *dev = bond->sva.dev;
> +	struct iommu_sva_param *param = dev->iommu_param->sva_param;
> +
> +	if (!refcount_dec_and_test(&bond->refs))
> +		return;
> +
Don't you need to free bond here?

> +	io_mm_detach_locked(bond);
> +	param->nr_bonds--;
> +	kfree_rcu(bond, rcu_head);
> +}

[...]

> +/**
> + * iommu_sva_enable() - Enable Shared Virtual Addressing for a device
> + * @dev: the device
> + * @sva_param: the parameters.
> + *
> + * Called by an IOMMU driver to setup the SVA parameters
> + * @sva_param is duplicated and can be freed when this function returns.
> + *
> + * Return 0 if initialization succeeded, or an error.
> + */
IOMMU vendor drivers usually don't know whether the device's SVA feature
will be used until the bind call. So we pretty much have to call this for
every device at init time?
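
i.e. something like this for every device at init (a sketch; nr_pasids
here is a made-up placeholder for whatever the driver knows about the
device and IOMMU):

	struct iommu_sva_param param = {
		.min_pasid	= 1,	/* 0 often reserved for non-PASID DMA */
		.max_pasid	= nr_pasids - 1,
	};

	ret = iommu_sva_enable(dev, &param);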

[...]

> +struct io_mm_ops {
> +	/* Allocate a PASID context for an mm */
> +	void *(*alloc)(struct mm_struct *mm);
> +
> +	/*
> +	 * Attach a PASID context to a device. Write the entry into the PASID
> +	 * table.
> +	 *
> +	 * @attach_domain is true when no other device in the IOMMU domain is
> +	 *   already attached to this context. IOMMU drivers that share the
> +	 *   PASID tables within a domain don't need to write the PASID entry
> +	 *   when @attach_domain is false.
> +	 */
If we have a per-device PASID table, then we need to set up the PASID
table entry regardless of domain sharing. What is confusing to me is that
the domain is for DMA isolation of requests w/o PASID, but with SVA we
don't really care about domains. Sorry, it has been a long time since we
discussed this. I think this will work for VT-d, but I just wanted to make
sure I understand the intentions.

> +	int (*attach)(struct device *dev, int pasid, void *ctx,
> +		      bool attach_domain);

[...]

Thanks,

Jacob
Jean-Philippe Brucker Feb. 28, 2020, 2:40 p.m. UTC | #3
On Wed, Feb 26, 2020 at 11:13:20AM -0800, Jacob Pan wrote:
> Hi Jean,
> 
> A few comments inline. I am also trying to converge on the common SVA
> APIs. I sent out the first step, w/o the IO page fault handling and the
> generic ops you have here.

Great, thanks for sending it out, it's on my list to look at.

> On Mon, 24 Feb 2020 19:23:37 +0100
> Jean-Philippe Brucker <jean-philippe@linaro.org> wrote:
> 
> > From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
> > 
> > Add a small library to help IOMMU drivers manage process address
> > spaces bound to their devices. Register an MMU notifier to track
> > modifications to each address space bound to one or more devices.
> > 
> > IOMMU drivers must implement the io_mm_ops and can then use the
> > helpers provided by this library to easily implement the SVA API
> > introduced by commit 26b25a2b98e4. The io_mm_ops are:
> > 
> > void *alloc(struct mm_struct *)
> >   Allocate a PASID context private to the IOMMU driver. There is a
> >   single context per mm. IOMMU drivers may perform arch-specific
> >   operations in there, for example pinning down a CPU ASID (on Arm).
> > 
> > int attach(struct device *, int pasid, void *ctx, bool attach_domain)
> >   Attach a context to the device, by setting up the PASID table entry.
> > 
> > void invalidate(struct device *, int pasid, void *ctx,
> >                 unsigned long vaddr, size_t size)
> >   Invalidate TLB entries for this address range.
> > 
> > void detach(struct device *, int pasid, void *ctx, bool detach_domain)
> >   Detach a context from the device, by clearing the PASID table entry
> >   and invalidating cached entries.
> > 
> > void free(void *ctx)
> you meant release()?

Yes

[...]
> > +/**
> > + * DOC: io_mm model
> > + *
> > + * The io_mm keeps track of process address spaces shared between CPU and IOMMU.
> > + * The following example illustrates the relation between structures
> > + * iommu_domain, io_mm and iommu_sva. The iommu_sva struct is a bond between
> > + * io_mm and device. A device can have multiple io_mm and an io_mm may be bound
> > + * to multiple devices.
> > + *              ___________________________
> > + *             |  IOMMU domain A           |
> > + *             |  ________________         |
> > + *             | |  IOMMU group   |        +------- io_pgtables
> > + *             | |                |        |
> > + *             | |   dev 00:00.0 ----+------- bond 1 --- io_mm X
> > + *             | |________________|   \    |
> > + *             |                       '----- bond 2 ---.
> > + *             |___________________________|             \
> > + *              ___________________________               \
> > + *             |  IOMMU domain B           |             io_mm Y
> > + *             |  ________________         |             / /
> > + *             | |  IOMMU group   |        |            / /
> > + *             | |                |        |           / /
> > + *             | |   dev 00:01.0 ------------ bond 3 -' /
> > + *             | |   dev 00:01.1 ------------ bond 4 --'
> > + *             | |________________|        |
> > + *             |                           +------- io_pgtables
> > + *             |___________________________|
> > + *
> > + * In this example, device 00:00.0 is in domain A, devices 00:01.* are in domain
> > + * B. All devices within the same domain access the same address spaces.
> Hmm, devices in domain A have access to both X & Y; isn't that
> contradictory?

I guess it's unclear; this is meant to explain that any device in domain B,
for example, would access all address spaces bound to any other device in
that domain.

> 
> > + * Device 00:00.0 accesses address spaces X and Y, each corresponding to an
> > + * mm_struct. Devices 00:01.* only access address space Y. In addition each
> > + * IOMMU_DOMAIN_DMA domain has a private address space, io_pgtable, that is
> > + * managed with iommu_map()/iommu_unmap(), and isn't shared with the CPU MMU.
> So this would allow IOVA and SVA to co-exist in the same address space?

Hmm, not in the same address space, but they can co-exist in a device. In
fact the endpoint I'm testing (the hisi zip accelerator) already needs normal
DMA alongside SVA for queue management. This one is integrated on an
Arm-based platform, so it shouldn't be a concern for VT-d at the moment, but
I suspect we might see more of this kind of device with mixed DMA.

In addition, on Arm, MSI addresses are translated by the IOMMU, and since
they are requests w/o PASID they need the private address space on entry 0.

Are you not planning to use the RID_PASID entry of Scalable-Mode
Context-Entry in VT-d?

> I guess this is the PASID 0 for DMA requests w/o PASID. If that is the
> case, perhaps this needs more explanation, since the private address space
> also has a private PASID within the domain.

The last sentence refers to this private address space used for requests
w/o PASID. I don't like referring to it as "PASID 0" since that might be
confusing: it's entry 0 of the PASID table, reserved for requests
without PASID.

I think I should just remove this sentence and try to make the last
paragraph of the comment, which refers to the same thing, clearer. I'll
also drop io_pgtables from the above diagram to keep things on point.

> > + *
> > + * To obtain the above configuration, users would for instance issue the
> > + * following calls:
> > + *
> > + *     iommu_sva_bind_device(dev 00:00.0, mm X, ...) -> bond 1
> > + *     iommu_sva_bind_device(dev 00:00.0, mm Y, ...) -> bond 2
> > + *     iommu_sva_bind_device(dev 00:01.0, mm Y, ...) -> bond 3
> > + *     iommu_sva_bind_device(dev 00:01.1, mm Y, ...) -> bond 4
> > + *
> > + * A single Process Address Space ID (PASID) is allocated for each mm. In the
> > + * example, devices use PASID 1 to read/write into address space X and PASID 2
> > + * to read/write into address space Y. Calling iommu_sva_get_pasid() on bond 1
> > + * returns 1, and calling it on bonds 2-4 returns 2.
> > + *
> > + * Hardware tables describing this configuration in the IOMMU would typically
> > + * look like this:
> > + *
> > + *                                PASID tables
> > + *                                 of domain A
> > + *                              .->+--------+
> > + *                             / 0 |        |-------> io_pgtable
> > + *                            /    +--------+
> > + *            Device tables  /   1 |        |-------> pgd X
> > + *              +--------+  /      +--------+
> > + *      00:00.0 |      A |-'     2 |        |--.
> > + *              +--------+         +--------+   \
> > + *              :        :       3 |        |    \
> > + *              +--------+         +--------+     --> pgd Y
> > + *      00:01.0 |      B |--.                    /
> > + *              +--------+   \                  |
> > + *      00:01.1 |      B |----+   PASID tables  |
> > + *              +--------+     \   of domain B  |
> > + *                              '->+--------+   |
> > + *                               0 |        |-- | --> io_pgtable
> > + *                                 +--------+   |
> > + *                               1 |        |   |
> > + *                                 +--------+   |
> > + *                               2 |        |---'
> > + *                                 +--------+
> > + *                               3 |        |
> > + *                                 +--------+
> > + *
> > + * With this model, a single call binds all devices in a given domain to an
> > + * address space. Other devices in the domain will get the same bond implicitly.
> > + * However, users must issue one bind() for each device, because IOMMUs may
> > + * implement SVA differently. Furthermore, mandating one bind() per device
> > + * allows the driver to perform sanity-checks on device capabilities.
> > + *
> > + * In some IOMMUs, one entry of the PASID table (typically the first one) can
> > + * hold non-PASID translations. In this case PASID 0 is reserved and the first
> > + * entry points to the io_pgtable pointer. In other IOMMUs the io_pgtable
> > + * pointer is held in the device table and PASID 0 is available to the
> > + * allocator.
> > + */
[...]
> > +static struct iommu_sva *
> > +io_mm_attach(struct device *dev, struct io_mm *io_mm, void *drvdata)
> > +{
> > +	int ret = 0;
> > +	bool attach_domain = true;
> > +	struct iommu_bond *bond, *tmp;
> > +	struct iommu_domain *domain, *other;
> > +	struct iommu_sva_param *param = dev->iommu_param->sva_param;
> > +
> > +	domain = iommu_get_domain_for_dev(dev);
> > +
> > +	bond = kzalloc(sizeof(*bond), GFP_KERNEL);
> > +	if (!bond)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	bond->sva.dev	= dev;
> > +	bond->drvdata	= drvdata;
> > +	refcount_set(&bond->refs, 1);
> > +	RCU_INIT_POINTER(bond->io_mm, io_mm);
> > +
> > +	mutex_lock(&iommu_sva_lock);
> > +	/* Is it already bound to the device or domain? */
> > +	list_for_each_entry(tmp, &io_mm->devices, mm_head) {
> > +		if (tmp->sva.dev != dev) {
> > +			other = iommu_get_domain_for_dev(tmp->sva.dev);
> > +			if (domain == other)
> > +				attach_domain = false;
> > +
> > +			continue;
> At this point, we already know this is a new device trying to attach to
> one of io_mm's existing domains.
>
> So there is no need to continue
> checking, right? Perhaps check like this?
> -               if (tmp->sva.dev != dev) {
> +               if (tmp->sva.dev != dev && attach_domain) {

That doesn't seem right; we need the 'continue'. I'll turn the condition
around into 'if (tmp->sva.dev == dev)' to make things more readable.
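
Something like this, as a sketch only (the allocation and locking around
the loop are unchanged):

	list_for_each_entry(tmp, &io_mm->devices, mm_head) {
		if (tmp->sva.dev == dev) {
			if (WARN_ON(tmp->drvdata != drvdata)) {
				ret = -EINVAL;
				goto err_free;
			}
			/* Hold a single io_mm reference per bond */
			refcount_inc(&tmp->refs);
			io_mm_put(io_mm);
			kfree(bond);
			mutex_unlock(&iommu_sva_lock);
			return &tmp->sva;
		}

		/* Another device, possibly in the same domain */
		if (domain == iommu_get_domain_for_dev(tmp->sva.dev))
			attach_domain = false;
	}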

> > +		}
> > +
> > +		if (WARN_ON(tmp->drvdata != drvdata)) {
> > +			ret = -EINVAL;
> > +			goto err_free;
> > +		}
> > +
> > +		/*
> > +		 * Hold a single io_mm reference per bond. Note that we can't
> > +		 * return an error after this, otherwise the caller would drop
> > +		 * an additional reference to the io_mm.
> > +		 */
> > +		refcount_inc(&tmp->refs);
> > +		io_mm_put(io_mm);
> > +		kfree(bond);
> Can bond be allocated after searching for existing bond or domain? If
> so, we can avoid free bond here.

Yes, and I think we can simplify the whole function further. I originally
wrote it that way to keep the kzalloc() outside iommu_sva_lock, back when
it was a spinlock.

> > +		mutex_unlock(&iommu_sva_lock);
> > +		return &tmp->sva;
> > +	}
> > +
> > +	list_add_rcu(&bond->mm_head, &io_mm->devices);
> > +	param->nr_bonds++;
> > +	mutex_unlock(&iommu_sva_lock);
> > +
> > +	ret = io_mm->ops->attach(bond->sva.dev, io_mm->pasid, io_mm->ctx,
> > +				 attach_domain);
> For VT-d, if a device trying to do SVA bind, there would not be a DMA
> domain. SVA should own the entire address space, no IOVA.

Do you mean PASID table rather than address space?

> So this
> attach() call is for VT-d driver to setup the first PASID table entry
> regardless attach_domain is true or false?

Yes, ignoring the attach_domain parameter should be fine (more below).

[...]
> > +static void iommu_sva_unbind_locked(struct iommu_bond *bond)
> > +{
> > +	struct device *dev = bond->sva.dev;
> > +	struct iommu_sva_param *param = dev->iommu_param->sva_param;
> > +
> > +	if (!refcount_dec_and_test(&bond->refs))
> > +		return;
> > +
> dont you need to free bond here?

We free it in the RCU callback below.

> > +	io_mm_detach_locked(bond);
> > +	param->nr_bonds--;
> > +	kfree_rcu(bond, rcu_head);
> > +}
> > +
> > +void iommu_sva_unbind_generic(struct iommu_sva *handle)
> > +{
> > +	struct iommu_param *param = handle->dev->iommu_param;
> > +
> > +	if (WARN_ON(!param))
> > +		return;
> > +
> > +	mutex_lock(&param->sva_lock);
> > +	mutex_lock(&iommu_sva_lock);
> > +	iommu_sva_unbind_locked(to_iommu_bond(handle));
> > +	mutex_unlock(&iommu_sva_lock);
> > +	mutex_unlock(&param->sva_lock);
> > +}
> > +EXPORT_SYMBOL_GPL(iommu_sva_unbind_generic);
> > +
> > +/**
> > + * iommu_sva_enable() - Enable Shared Virtual Addressing for a device
> > + * @dev: the device
> > + * @sva_param: the parameters.
> > + *
> > + * Called by an IOMMU driver to setup the SVA parameters
> > + * @sva_param is duplicated and can be freed when this function
> > returns.
> > + *
> > + * Return 0 if initialization succeeded, or an error.
> > + */
> IOMMU vendor driver usually dont know when the device SVA feature will
> be used until bind call. So we pretty much have to call this for every
> device during init time?

Not necessarily. Before bind, the device driver should call
iommu_dev_enable_feature(dev, IOMMU_FEAT_SVA), which is when SMMUv3
invokes iommu_sva_enable().
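
From the device driver's side the sequence would look roughly like the
sketch below. mydrv_enable_sva() is made up for illustration, and the
feature flag is spelled as in this thread; the name in the merged API may
differ:

	static struct iommu_sva *mydrv_enable_sva(struct device *dev,
						  struct mm_struct *mm)
	{
		struct iommu_sva *handle;
		int ret;

		ret = iommu_dev_enable_feature(dev, IOMMU_FEAT_SVA);
		if (ret)
			return ERR_PTR(ret);

		handle = iommu_sva_bind_device(dev, mm, NULL);
		if (IS_ERR(handle))
			iommu_dev_disable_feature(dev, IOMMU_FEAT_SVA);

		return handle;
	}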

[...]
> > +struct io_mm_ops {
> > +	/* Allocate a PASID context for an mm */
> > +	void *(*alloc)(struct mm_struct *mm);
> > +
> > +	/*
> > +	 * Attach a PASID context to a device. Write the entry into the PASID
> > +	 * table.
> > +	 *
> > +	 * @attach_domain is true when no other device in the IOMMU domain is
> > +	 *   already attached to this context. IOMMU drivers that share the
> > +	 *   PASID tables within a domain don't need to write the PASID entry
> > +	 *   when @attach_domain is false.
> > +	 */
> If we have per device PASID table, then we need to set up PASID table
> entry regardless of the domain sharing.

Yes, the attach_domain is a hint for IOMMU drivers that handle PASID
tables per domain (SMMUv3). If PASID tables are per device then it can be
ignored. I added it to the interface because it's a lot more difficult to
check from within the SMMU driver, whereas iommu-sva already iterates over
all devices attached to an io_mm. Arguably the hint isn't as useful on
attach as on detach, where we must not clear the PASID table entry if
other devices in the domain are still using it.
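
In other words, a driver whose PASID tables are shared per domain can
short-circuit on the hint, along these lines (mydrv_* is hypothetical):

	static int mydrv_mm_attach(struct device *dev, int pasid, void *ctx,
				   bool attach_domain)
	{
		/* Another device in this domain already wrote the entry */
		if (!attach_domain)
			return 0;

		return mydrv_write_pasid_entry(dev, pasid, ctx);
	}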

> What is confusing to me is that
> domain is for DMA isolation on request w/o PASID, but with SVA we don't
> really care about domains. Sorry, it has been a long time since we
> discussed this. I think will work for VT-d but just wanted to make sure
> I understand the intentions.

No problem, it has been a while and I don't remember the rationale for
every choice. It's good to question whether they're still valid.

I find the per-domain PASID table to be a good model when reasoning about
IOMMU groups. In pci_device_group() a single group is created for devices
whose Requester IDs alias, and they all get the same domain. In a buggy
system, if a device can issue DMA with the RID of another, then regardless
of PASID the IOMMU cannot isolate them. Having a per-device PASID table
doesn't add any isolation but may hide the flaw from the user, if they
think that binding an mm to device A prevents a DMA-aliased device B from
accessing it.

This is hypothetical because we don't allow SVA for multi-device groups at
the moment (the sanity-check would be messy), but maybe buggy
implementations will want this support in the future.

In the normal case of one device per domain, having PASID tables on the
domain rather than the device doesn't make a difference. It does matter if
the user wants to put multiple devices in the same domain (e.g. a VFIO
container), but I don't know if that's a use case.

Thanks,
Jean
Jean-Philippe Brucker Feb. 28, 2020, 2:43 p.m. UTC | #4
On Wed, Feb 26, 2020 at 12:35:06PM +0000, Jonathan Cameron wrote:
> > + * A single Process Address Space ID (PASID) is allocated for each mm. In the
> > + * example, devices use PASID 1 to read/write into address space X and PASID 2
> > + * to read/write into address space Y. Calling iommu_sva_get_pasid() on bond 1
> > + * returns 1, and calling it on bonds 2-4 returns 2.
> > + *
> > + * Hardware tables describing this configuration in the IOMMU would typically
> > + * look like this:
> > + *
> > + *                                PASID tables
> > + *                                 of domain A
> > + *                              .->+--------+
> > + *                             / 0 |        |-------> io_pgtable
> > + *                            /    +--------+
> > + *            Device tables  /   1 |        |-------> pgd X
> > + *              +--------+  /      +--------+
> > + *      00:00.0 |      A |-'     2 |        |--.
> > + *              +--------+         +--------+   \
> > + *              :        :       3 |        |    \
> > + *              +--------+         +--------+     --> pgd Y
> > + *      00:01.0 |      B |--.                    /
> > + *              +--------+   \                  |
> > + *      00:01.1 |      B |----+   PASID tables  |
> > + *              +--------+     \   of domain B  |
> > + *                              '->+--------+   |
> > + *                               0 |        |-- | --> io_pgtable
> > + *                                 +--------+   |
> > + *                               1 |        |   |
> > + *                                 +--------+   |
> > + *                               2 |        |---'
> > + *                                 +--------+
> > + *                               3 |        |
> > + *                                 +--------+
> > + *
> > + * With this model, a single call binds all devices in a given domain to an
> > + * address space. Other devices in the domain will get the same bond implicitly.
> > + * However, users must issue one bind() for each device, because IOMMUs may
> > + * implement SVA differently. Furthermore, mandating one bind() per device
> > + * allows the driver to perform sanity-checks on device capabilities.
> 
> > + *
> > + * In some IOMMUs, one entry of the PASID table (typically the first one) can
> > + * hold non-PASID translations. In this case PASID 0 is reserved and the first
> > + * entry points to the io_pgtable pointer. In other IOMMUs the io_pgtable
> > + * pointer is held in the device table and PASID 0 is available to the
> > + * allocator.
> 
> Is it worth hammering home in here that we can only do this because the PASID space
> is global (with exception of PASID 0)?  It's a convenient simplification but not
> necessarily a hardware restriction so perhaps we should remind people somewhere in here?

I could add this four paragraphs up:

"A single Process Address Space ID (PASID) is allocated for each mm. It is
a choice made for the Linux SVA implementation, not a hardware
restriction."

> > + */
> > +
> > +struct io_mm {
> > +	struct list_head		devices;
> > +	struct mm_struct		*mm;
> > +	struct mmu_notifier		notifier;
> > +
> > +	/* Late initialization */
> > +	const struct io_mm_ops		*ops;
> > +	void				*ctx;
> > +	int				pasid;
> > +};
> > +
> > +#define to_io_mm(mmu_notifier)	container_of(mmu_notifier, struct io_mm, notifier)
> > +#define to_iommu_bond(handle)	container_of(handle, struct iommu_bond, sva)
> 
> Code ordering wise, do we want this after the definition of iommu_bond?
> 
> For both of these it's a bit non obvious what they come 'from'.
> I wouldn't naturally assume to_io_mm gets me from notifier to the io_mm
> for example.  Not sure it matters though if these are only used in a few
> places.

Right, I can rename the first one to mn_to_io_mm(). The second one I think
might be good enough.
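
i.e. just the rename, no functional change:

	#define mn_to_io_mm(mn)		container_of(mn, struct io_mm, notifier)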


> > +static struct iommu_sva *
> > +io_mm_attach(struct device *dev, struct io_mm *io_mm, void *drvdata)
> > +{
> > +	int ret = 0;
> 
> I'm fairly sure this is set in all paths below.  Now, of course the
> compiler might not think that in which case fair enough :)
> 
> > +	bool attach_domain = true;
> > +	struct iommu_bond *bond, *tmp;
> > +	struct iommu_domain *domain, *other;
> > +	struct iommu_sva_param *param = dev->iommu_param->sva_param;
> > +
> > +	domain = iommu_get_domain_for_dev(dev);
> > +
> > +	bond = kzalloc(sizeof(*bond), GFP_KERNEL);
> > +	if (!bond)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	bond->sva.dev	= dev;
> > +	bond->drvdata	= drvdata;
> > +	refcount_set(&bond->refs, 1);
> > +	RCU_INIT_POINTER(bond->io_mm, io_mm);
> > +
> > +	mutex_lock(&iommu_sva_lock);
> > +	/* Is it already bound to the device or domain? */
> > +	list_for_each_entry(tmp, &io_mm->devices, mm_head) {
> > +		if (tmp->sva.dev != dev) {
> > +			other = iommu_get_domain_for_dev(tmp->sva.dev);
> > +			if (domain == other)
> > +				attach_domain = false;
> > +
> > +			continue;
> > +		}
> > +
> > +		if (WARN_ON(tmp->drvdata != drvdata)) {
> > +			ret = -EINVAL;
> > +			goto err_free;
> > +		}
> > +
> > +		/*
> > +		 * Hold a single io_mm reference per bond. Note that we can't
> > +		 * return an error after this, otherwise the caller would drop
> > +		 * an additional reference to the io_mm.
> > +		 */
> > +		refcount_inc(&tmp->refs);
> > +		io_mm_put(io_mm);
> > +		kfree(bond);
> 
> Free outside the lock would be ever so slightly more logical given we allocated
> before taking the lock.
> 
> > +		mutex_unlock(&iommu_sva_lock);
> > +		return &tmp->sva;
> > +	}
> > +
> > +	list_add_rcu(&bond->mm_head, &io_mm->devices);
> > +	param->nr_bonds++;
> > +	mutex_unlock(&iommu_sva_lock);
> > +
> > +	ret = io_mm->ops->attach(bond->sva.dev, io_mm->pasid, io_mm->ctx,
> > +				 attach_domain);
> > +	if (ret)
> > +		goto err_remove;
> > +
> > +	return &bond->sva;
> > +
> > +err_remove:
> > +	/*
> > +	 * At this point concurrent threads may have started to access the
> > +	 * io_mm->devices list in order to invalidate address ranges, which
> > +	 * requires to free the bond via kfree_rcu()
> > +	 */
> > +	mutex_lock(&iommu_sva_lock);
> > +	param->nr_bonds--;
> > +	list_del_rcu(&bond->mm_head);
> > +
> > +err_free:
> > +	mutex_unlock(&iommu_sva_lock);
> > +	kfree_rcu(bond, rcu_head);
> 
> I don't suppose it matters really but we don't need the rcu free if
> we follow the err_free goto.  Perhaps we are cleaner in this case
> to not use a unified exit path but do that case inline?

Agreed, though I moved the kzalloc() later as suggested by Jacob, I think
it looks a little better and simplifies the error paths
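
Roughly like this (a sketch of the reshuffled function, not the final
code; allocating under the mutex is fine since kzalloc() may sleep):

	static struct iommu_sva *
	io_mm_attach(struct device *dev, struct io_mm *io_mm, void *drvdata)
	{
		int ret;
		bool attach_domain = true;
		struct iommu_bond *bond, *tmp;
		struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
		struct iommu_sva_param *param = dev->iommu_param->sva_param;

		mutex_lock(&iommu_sva_lock);
		/* Is it already bound to the device or domain? */
		list_for_each_entry(tmp, &io_mm->devices, mm_head) {
			if (tmp->sva.dev == dev) {
				if (WARN_ON(tmp->drvdata != drvdata)) {
					mutex_unlock(&iommu_sva_lock);
					return ERR_PTR(-EINVAL);
				}
				/* Hold a single io_mm reference per bond */
				refcount_inc(&tmp->refs);
				io_mm_put(io_mm);
				mutex_unlock(&iommu_sva_lock);
				return &tmp->sva;
			}

			if (domain == iommu_get_domain_for_dev(tmp->sva.dev))
				attach_domain = false;
		}

		bond = kzalloc(sizeof(*bond), GFP_KERNEL);
		if (!bond) {
			mutex_unlock(&iommu_sva_lock);
			return ERR_PTR(-ENOMEM);
		}

		bond->sva.dev	= dev;
		bond->drvdata	= drvdata;
		refcount_set(&bond->refs, 1);
		RCU_INIT_POINTER(bond->io_mm, io_mm);

		list_add_rcu(&bond->mm_head, &io_mm->devices);
		param->nr_bonds++;
		mutex_unlock(&iommu_sva_lock);

		ret = io_mm->ops->attach(dev, io_mm->pasid, io_mm->ctx,
					 attach_domain);
		if (ret) {
			/* Concurrent readers may see the bond: free via RCU */
			mutex_lock(&iommu_sva_lock);
			param->nr_bonds--;
			list_del_rcu(&bond->mm_head);
			mutex_unlock(&iommu_sva_lock);
			kfree_rcu(bond, rcu_head);
			return ERR_PTR(ret);
		}

		return &bond->sva;
	}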

Thanks,
Jean
Jason Gunthorpe Feb. 28, 2020, 2:57 p.m. UTC | #5
On Fri, Feb 28, 2020 at 03:40:07PM +0100, Jean-Philippe Brucker wrote:
> > > Device
> > > + * 00:00.0 accesses address spaces X and Y, each corresponding to an
> > > mm_struct.
> > > + * Devices 00:01.* only access address space Y. In addition each
> > > + * IOMMU_DOMAIN_DMA domain has a private address space, io_pgtable,
> > > that is
> > > + * managed with iommu_map()/iommu_unmap(), and isn't shared with the
> > > CPU MMU.
> > So this would allow IOVA and SVA co-exist in the same address space?
> 
> Hmm, not in the same address space, but they can co-exist in a device. In
> fact the endpoint I'm testing (hisi zip accelerator) already needs normal
> DMA alongside SVA for queue management. This one is integrated on an
> Arm-based platform so shouldn't be a concern for VT-d at the moment, but
> I suspect we might see more of this kind of device with mixed DMA.

Probably the most interesting use cases for PASID definitely require
this, so this is more than a "suspect we might see".

We want to see the privileged kernel control the general behavior of
the PCI function and delegate only some DMAs to PASIDs associated with
the user mm_struct. The device is always trusted to label its DMA
properly.

These programming models have already been in use for years with the
OpenCAPI implementation.

Jason
Jonathan Cameron Feb. 28, 2020, 4:26 p.m. UTC | #6
On Fri, 28 Feb 2020 15:43:04 +0100
Jean-Philippe Brucker <jean-philippe@linaro.org> wrote:

> On Wed, Feb 26, 2020 at 12:35:06PM +0000, Jonathan Cameron wrote:
> > > + * A single Process Address Space ID (PASID) is allocated for each mm. In the
> > > + * example, devices use PASID 1 to read/write into address space X and PASID 2
> > > + * to read/write into address space Y. Calling iommu_sva_get_pasid() on bond 1
> > > + * returns 1, and calling it on bonds 2-4 returns 2.
> > > + *
> > > + * Hardware tables describing this configuration in the IOMMU would typically
> > > + * look like this:
> > > + *
> > > + *                                PASID tables
> > > + *                                 of domain A
> > > + *                              .->+--------+
> > > + *                             / 0 |        |-------> io_pgtable
> > > + *                            /    +--------+
> > > + *            Device tables  /   1 |        |-------> pgd X
> > > + *              +--------+  /      +--------+
> > > + *      00:00.0 |      A |-'     2 |        |--.
> > > + *              +--------+         +--------+   \
> > > + *              :        :       3 |        |    \
> > > + *              +--------+         +--------+     --> pgd Y
> > > + *      00:01.0 |      B |--.                    /
> > > + *              +--------+   \                  |
> > > + *      00:01.1 |      B |----+   PASID tables  |
> > > + *              +--------+     \   of domain B  |
> > > + *                              '->+--------+   |
> > > + *                               0 |        |-- | --> io_pgtable
> > > + *                                 +--------+   |
> > > + *                               1 |        |   |
> > > + *                                 +--------+   |
> > > + *                               2 |        |---'
> > > + *                                 +--------+
> > > + *                               3 |        |
> > > + *                                 +--------+
> > > + *
> > > + * With this model, a single call binds all devices in a given domain to an
> > > + * address space. Other devices in the domain will get the same bond implicitly.
> > > + * However, users must issue one bind() for each device, because IOMMUs may
> > > + * implement SVA differently. Furthermore, mandating one bind() per device
> > > + * allows the driver to perform sanity-checks on device capabilities.  
> >   
> > > + *
> > > + * In some IOMMUs, one entry of the PASID table (typically the first one) can
> > > + * hold non-PASID translations. In this case PASID 0 is reserved and the first
> > > + * entry points to the io_pgtable pointer. In other IOMMUs the io_pgtable
> > > + * pointer is held in the device table and PASID 0 is available to the
> > > + * allocator.  
> > 
> > Is it worth hammering home in here that we can only do this because the PASID space
> > is global (with exception of PASID 0)?  It's a convenient simplification but not
> > necessarily a hardware restriction so perhaps we should remind people somewhere in here?  
> 
> I could add this four paragraphs up:
> 
> "A single Process Address Space ID (PASID) is allocated for each mm. It is
> a choice made for the Linux SVA implementation, not a hardware
> restriction."

Perfect.

> 
> > > + */
> > > +
> > > +struct io_mm {
> > > +	struct list_head		devices;
> > > +	struct mm_struct		*mm;
> > > +	struct mmu_notifier		notifier;
> > > +
> > > +	/* Late initialization */
> > > +	const struct io_mm_ops		*ops;
> > > +	void				*ctx;
> > > +	int				pasid;
> > > +};
> > > +
> > > +#define to_io_mm(mmu_notifier)	container_of(mmu_notifier, struct io_mm, notifier)
> > > +#define to_iommu_bond(handle)	container_of(handle, struct iommu_bond, sva)  
> > 
> > Code ordering wise, do we want this after the definition of iommu_bond?
> > 
> > For both of these it's a bit non obvious what they come 'from'.
> > I wouldn't naturally assume to_io_mm gets me from notifier to the io_mm
> > for example.  Not sure it matters though if these are only used in a few
> > places.  
> 
> Right, I can rename the first one to mn_to_io_mm(). The second one I think
> might be good enough.

Agreed. The second one does feel more natural.

> 
> 
> > > +static struct iommu_sva *
> > > +io_mm_attach(struct device *dev, struct io_mm *io_mm, void *drvdata)
> > > +{
> > > +	int ret = 0;  
> > 
> > I'm fairly sure this is set in all paths below.  Now, of course the
> > compiler might not think that in which case fair enough :)
> >   
> > > +	bool attach_domain = true;
> > > +	struct iommu_bond *bond, *tmp;
> > > +	struct iommu_domain *domain, *other;
> > > +	struct iommu_sva_param *param = dev->iommu_param->sva_param;
> > > +
> > > +	domain = iommu_get_domain_for_dev(dev);
> > > +
> > > +	bond = kzalloc(sizeof(*bond), GFP_KERNEL);
> > > +	if (!bond)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	bond->sva.dev	= dev;
> > > +	bond->drvdata	= drvdata;
> > > +	refcount_set(&bond->refs, 1);
> > > +	RCU_INIT_POINTER(bond->io_mm, io_mm);
> > > +
> > > +	mutex_lock(&iommu_sva_lock);
> > > +	/* Is it already bound to the device or domain? */
> > > +	list_for_each_entry(tmp, &io_mm->devices, mm_head) {
> > > +		if (tmp->sva.dev != dev) {
> > > +			other = iommu_get_domain_for_dev(tmp->sva.dev);
> > > +			if (domain == other)
> > > +				attach_domain = false;
> > > +
> > > +			continue;
> > > +		}
> > > +
> > > +		if (WARN_ON(tmp->drvdata != drvdata)) {
> > > +			ret = -EINVAL;
> > > +			goto err_free;
> > > +		}
> > > +
> > > +		/*
> > > +		 * Hold a single io_mm reference per bond. Note that we can't
> > > +		 * return an error after this, otherwise the caller would drop
> > > +		 * an additional reference to the io_mm.
> > > +		 */
> > > +		refcount_inc(&tmp->refs);
> > > +		io_mm_put(io_mm);
> > > +		kfree(bond);  
> > 
> > Free outside the lock would be ever so slightly more logical given we allocated
> > before taking the lock.
> >   
> > > +		mutex_unlock(&iommu_sva_lock);
> > > +		return &tmp->sva;
> > > +	}
> > > +
> > > +	list_add_rcu(&bond->mm_head, &io_mm->devices);
> > > +	param->nr_bonds++;
> > > +	mutex_unlock(&iommu_sva_lock);
> > > +
> > > +	ret = io_mm->ops->attach(bond->sva.dev, io_mm->pasid, io_mm->ctx,
> > > +				 attach_domain);
> > > +	if (ret)
> > > +		goto err_remove;
> > > +
> > > +	return &bond->sva;
> > > +
> > > +err_remove:
> > > +	/*
> > > +	 * At this point concurrent threads may have started to access the
> > > +	 * io_mm->devices list in order to invalidate address ranges, which
> > > +	 * requires to free the bond via kfree_rcu()
> > > +	 */
> > > +	mutex_lock(&iommu_sva_lock);
> > > +	param->nr_bonds--;
> > > +	list_del_rcu(&bond->mm_head);
> > > +
> > > +err_free:
> > > +	mutex_unlock(&iommu_sva_lock);
> > > +	kfree_rcu(bond, rcu_head);  
> > 
> > I don't suppose it matters really but we don't need the rcu free if
> > we follow the err_free goto.  Perhaps we are cleaner in this case
> > to not use a unified exit path but do that case inline?  
> 
> Agreed, though I moved the kzalloc() later as suggested by Jacob, I think
> it looks a little better and simplifies the error paths
> 
> Thanks,
> Jean
Jonathan
diff mbox series

Patch

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index d2fade984999..acca20e2da2f 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -102,6 +102,13 @@  config IOMMU_DMA
 	select IRQ_MSI_IOMMU
 	select NEED_SG_DMA_LENGTH
 
+# Shared Virtual Addressing library
+config IOMMU_SVA
+	bool
+	select IOASID
+	select IOMMU_API
+	select MMU_NOTIFIER
+
 config FSL_PAMU
 	bool "Freescale IOMMU support"
 	depends on PCI
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 9f33fdb3bb05..40c800dd4e3e 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -37,3 +37,4 @@  obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
 obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
 obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
 obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
+obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
new file mode 100644
index 000000000000..64f1d1c82383
--- /dev/null
+++ b/drivers/iommu/iommu-sva.c
@@ -0,0 +1,561 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Manage PASIDs and bind process address spaces to devices.
+ *
+ * Copyright (C) 2018 ARM Ltd.
+ */
+
+#include <linux/idr.h>
+#include <linux/ioasid.h>
+#include <linux/iommu.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "iommu-sva.h"
+
+/**
+ * DOC: io_mm model
+ *
+ * The io_mm keeps track of process address spaces shared between CPU and IOMMU.
+ * The following example illustrates the relation between structures
+ * iommu_domain, io_mm and iommu_sva. The iommu_sva struct is a bond between
+ * io_mm and device. A device can have multiple io_mm and an io_mm may be bound
+ * to multiple devices.
+ *              ___________________________
+ *             |  IOMMU domain A           |
+ *             |  ________________         |
+ *             | |  IOMMU group   |        +------- io_pgtables
+ *             | |                |        |
+ *             | |   dev 00:00.0 ----+------- bond 1 --- io_mm X
+ *             | |________________|   \    |
+ *             |                       '----- bond 2 ---.
+ *             |___________________________|             \
+ *              ___________________________               \
+ *             |  IOMMU domain B           |             io_mm Y
+ *             |  ________________         |             / /
+ *             | |  IOMMU group   |        |            / /
+ *             | |                |        |           / /
+ *             | |   dev 00:01.0 ------------ bond 3 -' /
+ *             | |   dev 00:01.1 ------------ bond 4 --'
+ *             | |________________|        |
+ *             |                           +------- io_pgtables
+ *             |___________________________|
+ *
+ * In this example, device 00:00.0 is in domain A, devices 00:01.* are in domain
+ * B. All devices within the same domain access the same address spaces. Device
+ * 00:00.0 accesses address spaces X and Y, each corresponding to an mm_struct.
+ * Devices 00:01.* only access address space Y. In addition each
+ * IOMMU_DOMAIN_DMA domain has a private address space, io_pgtable, that is
+ * managed with iommu_map()/iommu_unmap(), and isn't shared with the CPU MMU.
+ *
+ * To obtain the above configuration, users would for instance issue the
+ * following calls:
+ *
+ *     iommu_sva_bind_device(dev 00:00.0, mm X, ...) -> bond 1
+ *     iommu_sva_bind_device(dev 00:00.0, mm Y, ...) -> bond 2
+ *     iommu_sva_bind_device(dev 00:01.0, mm Y, ...) -> bond 3
+ *     iommu_sva_bind_device(dev 00:01.1, mm Y, ...) -> bond 4
+ *
+ * A single Process Address Space ID (PASID) is allocated for each mm. In the
+ * example, devices use PASID 1 to read/write into address space X and PASID 2
+ * to read/write into address space Y. Calling iommu_sva_get_pasid() on bond 1
+ * returns 1, and calling it on bonds 2-4 returns 2.
+ *
+ * Hardware tables describing this configuration in the IOMMU would typically
+ * look like this:
+ *
+ *                                PASID tables
+ *                                 of domain A
+ *                              .->+--------+
+ *                             / 0 |        |-------> io_pgtable
+ *                            /    +--------+
+ *            Device tables  /   1 |        |-------> pgd X
+ *              +--------+  /      +--------+
+ *      00:00.0 |      A |-'     2 |        |--.
+ *              +--------+         +--------+   \
+ *              :        :       3 |        |    \
+ *              +--------+         +--------+     --> pgd Y
+ *      00:01.0 |      B |--.                    /
+ *              +--------+   \                  |
+ *      00:01.1 |      B |----+   PASID tables  |
+ *              +--------+     \   of domain B  |
+ *                              '->+--------+   |
+ *                               0 |        |-- | --> io_pgtable
+ *                                 +--------+   |
+ *                               1 |        |   |
+ *                                 +--------+   |
+ *                               2 |        |---'
+ *                                 +--------+
+ *                               3 |        |
+ *                                 +--------+
+ *
+ * With this model, a single call binds all devices in a given domain to an
+ * address space. Other devices in the domain will get the same bond implicitly.
+ * However, users must issue one bind() for each device, because IOMMUs may
+ * implement SVA differently. Furthermore, mandating one bind() per device
+ * allows the driver to perform sanity-checks on device capabilities.
+ *
+ * In some IOMMUs, one entry of the PASID table (typically the first one) can
+ * hold non-PASID translations. In this case PASID 0 is reserved and the first
+ * entry points to the io_pgtable pointer. In other IOMMUs the io_pgtable
+ * pointer is held in the device table and PASID 0 is available to the
+ * allocator.
+ */
+
+struct io_mm {
+	struct list_head		devices;
+	struct mm_struct		*mm;
+	struct mmu_notifier		notifier;
+
+	/* Late initialization */
+	const struct io_mm_ops		*ops;
+	void				*ctx;
+	int				pasid;
+};
+
+#define to_io_mm(mmu_notifier)	container_of(mmu_notifier, struct io_mm, notifier)
+#define to_iommu_bond(handle)	container_of(handle, struct iommu_bond, sva)
+
+struct iommu_bond {
+	struct iommu_sva		sva;
+	struct io_mm __rcu		*io_mm;
+
+	struct list_head		mm_head;
+	void				*drvdata;
+	struct rcu_head			rcu_head;
+	refcount_t			refs;
+};
+
+static DECLARE_IOASID_SET(shared_pasid);
+
+static struct mmu_notifier_ops iommu_mmu_notifier_ops;
+
+/*
+ * Serializes modifications of bonds.
+ * Lock order: Device SVA mutex; global SVA mutex; IOASID lock
+ */
+static DEFINE_MUTEX(iommu_sva_lock);
+
+struct io_mm_alloc_params {
+	const struct io_mm_ops *ops;
+	int min_pasid, max_pasid;
+};
+
+static struct mmu_notifier *io_mm_alloc(struct mm_struct *mm, void *privdata)
+{
+	int ret;
+	struct io_mm *io_mm;
+	struct io_mm_alloc_params *params = privdata;
+
+	io_mm = kzalloc(sizeof(*io_mm), GFP_KERNEL);
+	if (!io_mm)
+		return ERR_PTR(-ENOMEM);
+
+	io_mm->mm = mm;
+	io_mm->ops = params->ops;
+	INIT_LIST_HEAD(&io_mm->devices);
+
+	io_mm->pasid = ioasid_alloc(&shared_pasid, params->min_pasid,
+				    params->max_pasid, io_mm->mm);
+	if (io_mm->pasid == INVALID_IOASID) {
+		ret = -ENOSPC;
+		goto err_free_io_mm;
+	}
+
+	io_mm->ctx = params->ops->alloc(mm);
+	if (IS_ERR(io_mm->ctx)) {
+		ret = PTR_ERR(io_mm->ctx);
+		goto err_free_pasid;
+	}
+	return &io_mm->notifier;
+
+err_free_pasid:
+	ioasid_free(io_mm->pasid);
+err_free_io_mm:
+	kfree(io_mm);
+	return ERR_PTR(ret);
+}
+
+static void io_mm_free(struct mmu_notifier *mn)
+{
+	struct io_mm *io_mm = to_io_mm(mn);
+
+	WARN_ON(!list_empty(&io_mm->devices));
+
+	io_mm->ops->release(io_mm->ctx);
+	ioasid_free(io_mm->pasid);
+	kfree(io_mm);
+}
+
+/*
+ * io_mm_get - Allocate an io_mm or get the existing one for the given mm
+ * @mm: the mm
+ * @ops: callbacks for the IOMMU driver
+ * @min_pasid: minimum PASID value (inclusive)
+ * @max_pasid: maximum PASID value (inclusive)
+ *
+ * Returns a valid io_mm or an error pointer.
+ */
+static struct io_mm *io_mm_get(struct mm_struct *mm,
+			       const struct io_mm_ops *ops,
+			       int min_pasid, int max_pasid)
+{
+	struct io_mm *io_mm;
+	struct mmu_notifier *mn;
+	struct io_mm_alloc_params params = {
+		.ops		= ops,
+		.min_pasid	= min_pasid,
+		.max_pasid	= max_pasid,
+	};
+
+	/*
+	 * A single notifier can exist for this (ops, mm) pair. Allocate it if
+	 * necessary.
+	 */
+	mn = mmu_notifier_get(&iommu_mmu_notifier_ops, mm, &params);
+	if (IS_ERR(mn))
+		return ERR_CAST(mn);
+	io_mm = to_io_mm(mn);
+
+	if (WARN_ON(io_mm->ops != ops)) {
+		mmu_notifier_put(mn);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return io_mm;
+}
+
+static void io_mm_put(struct io_mm *io_mm)
+{
+	mmu_notifier_put(&io_mm->notifier);
+}
+
+static struct iommu_sva *
+io_mm_attach(struct device *dev, struct io_mm *io_mm, void *drvdata)
+{
+	int ret = 0;
+	bool attach_domain = true;
+	struct iommu_bond *bond, *tmp;
+	struct iommu_domain *domain, *other;
+	struct iommu_sva_param *param = dev->iommu_param->sva_param;
+
+	domain = iommu_get_domain_for_dev(dev);
+
+	bond = kzalloc(sizeof(*bond), GFP_KERNEL);
+	if (!bond)
+		return ERR_PTR(-ENOMEM);
+
+	bond->sva.dev	= dev;
+	bond->drvdata	= drvdata;
+	refcount_set(&bond->refs, 1);
+	RCU_INIT_POINTER(bond->io_mm, io_mm);
+
+	mutex_lock(&iommu_sva_lock);
+	/* Is it already bound to the device or domain? */
+	list_for_each_entry(tmp, &io_mm->devices, mm_head) {
+		if (tmp->sva.dev != dev) {
+			other = iommu_get_domain_for_dev(tmp->sva.dev);
+			if (domain == other)
+				attach_domain = false;
+
+			continue;
+		}
+
+		if (WARN_ON(tmp->drvdata != drvdata)) {
+			ret = -EINVAL;
+			goto err_free;
+		}
+
+		/*
+		 * Hold a single io_mm reference per bond. Note that we can't
+		 * return an error after this, otherwise the caller would drop
+		 * an additional reference to the io_mm.
+		 */
+		refcount_inc(&tmp->refs);
+		io_mm_put(io_mm);
+		kfree(bond);
+		mutex_unlock(&iommu_sva_lock);
+		return &tmp->sva;
+	}
+
+	list_add_rcu(&bond->mm_head, &io_mm->devices);
+	param->nr_bonds++;
+	mutex_unlock(&iommu_sva_lock);
+
+	ret = io_mm->ops->attach(bond->sva.dev, io_mm->pasid, io_mm->ctx,
+				 attach_domain);
+	if (ret)
+		goto err_remove;
+
+	return &bond->sva;
+
+err_remove:
+	/*
+	 * At this point concurrent threads may have started to access the
+	 * io_mm->devices list in order to invalidate address ranges, which
+	 * requires to free the bond via kfree_rcu()
+	 */
+	mutex_lock(&iommu_sva_lock);
+	param->nr_bonds--;
+	list_del_rcu(&bond->mm_head);
+
+err_free:
+	mutex_unlock(&iommu_sva_lock);
+	kfree_rcu(bond, rcu_head);
+	return ERR_PTR(ret);
+}
+
+static void io_mm_detach_locked(struct iommu_bond *bond)
+{
+	struct io_mm *io_mm;
+	struct iommu_bond *tmp;
+	bool detach_domain = true;
+	struct iommu_domain *domain, *other;
+
+	io_mm = rcu_dereference_protected(bond->io_mm,
+					  lockdep_is_held(&iommu_sva_lock));
+	if (!io_mm)
+		return;
+
+	domain = iommu_get_domain_for_dev(bond->sva.dev);
+
+	/* Are other devices in the same domain still attached to this mm? */
+	list_for_each_entry(tmp, &io_mm->devices, mm_head) {
+		if (tmp == bond)
+			continue;
+		other = iommu_get_domain_for_dev(tmp->sva.dev);
+		if (domain == other) {
+			detach_domain = false;
+			break;
+		}
+	}
+
+	io_mm->ops->detach(bond->sva.dev, io_mm->pasid, io_mm->ctx,
+			   detach_domain);
+
+	list_del_rcu(&bond->mm_head);
+	RCU_INIT_POINTER(bond->io_mm, NULL);
+
+	/* Free after RCU grace period */
+	io_mm_put(io_mm);
+}
+
+/*
+ * io_mm_release - release MMU notifier
+ *
+ * Called when the mm exits. Some devices may still be bound to the io_mm. A few
+ * things need to be done before it is safe to release:
+ *
+ * - Tell the device driver to stop using this PASID.
+ * - Clear the PASID table and invalidate TLBs.
+ * - Drop all references to this io_mm.
+ */
+static void io_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	struct iommu_bond *bond, *next;
+	struct io_mm *io_mm = to_io_mm(mn);
+
+	mutex_lock(&iommu_sva_lock);
+	list_for_each_entry_safe(bond, next, &io_mm->devices, mm_head) {
+		struct device *dev = bond->sva.dev;
+		struct iommu_sva *sva = &bond->sva;
+
+		if (sva->ops && sva->ops->mm_exit &&
+		    sva->ops->mm_exit(dev, sva, bond->drvdata))
+			dev_WARN(dev, "possible leak of PASID %u",
+				 io_mm->pasid);
+
+		/* unbind() frees the bond, we just detach it */
+		io_mm_detach_locked(bond);
+	}
+	mutex_unlock(&iommu_sva_lock);
+}
+
+static void io_mm_invalidate_range(struct mmu_notifier *mn,
+				   struct mm_struct *mm, unsigned long start,
+				   unsigned long end)
+{
+	struct iommu_bond *bond;
+	struct io_mm *io_mm = to_io_mm(mn);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(bond, &io_mm->devices, mm_head)
+		io_mm->ops->invalidate(bond->sva.dev, io_mm->pasid, io_mm->ctx,
+				       start, end - start);
+	rcu_read_unlock();
+}
+
+static struct mmu_notifier_ops iommu_mmu_notifier_ops = {
+	.alloc_notifier		= io_mm_alloc,
+	.free_notifier		= io_mm_free,
+	.release		= io_mm_release,
+	.invalidate_range	= io_mm_invalidate_range,
+};
+
+struct iommu_sva *
+iommu_sva_bind_generic(struct device *dev, struct mm_struct *mm,
+		       const struct io_mm_ops *ops, void *drvdata)
+{
+	struct io_mm *io_mm;
+	struct iommu_sva *handle;
+	struct iommu_param *param = dev->iommu_param;
+
+	if (!param)
+		return ERR_PTR(-ENODEV);
+
+	mutex_lock(&param->sva_lock);
+	if (!param->sva_param) {
+		handle = ERR_PTR(-ENODEV);
+		goto out_unlock;
+	}
+
+	io_mm = io_mm_get(mm, ops, param->sva_param->min_pasid,
+			  param->sva_param->max_pasid);
+	if (IS_ERR(io_mm)) {
+		handle = ERR_CAST(io_mm);
+		goto out_unlock;
+	}
+
+	handle = io_mm_attach(dev, io_mm, drvdata);
+	if (IS_ERR(handle))
+		io_mm_put(io_mm);
+
+out_unlock:
+	mutex_unlock(&param->sva_lock);
+	return handle;
+}
+EXPORT_SYMBOL_GPL(iommu_sva_bind_generic);
+
+static void iommu_sva_unbind_locked(struct iommu_bond *bond)
+{
+	struct device *dev = bond->sva.dev;
+	struct iommu_sva_param *param = dev->iommu_param->sva_param;
+
+	if (!refcount_dec_and_test(&bond->refs))
+		return;
+
+	io_mm_detach_locked(bond);
+	param->nr_bonds--;
+	kfree_rcu(bond, rcu_head);
+}
+
+void iommu_sva_unbind_generic(struct iommu_sva *handle)
+{
+	struct iommu_param *param = handle->dev->iommu_param;
+
+	if (WARN_ON(!param))
+		return;
+
+	mutex_lock(&param->sva_lock);
+	mutex_lock(&iommu_sva_lock);
+	iommu_sva_unbind_locked(to_iommu_bond(handle));
+	mutex_unlock(&iommu_sva_lock);
+	mutex_unlock(&param->sva_lock);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_unbind_generic);
+
+/**
+ * iommu_sva_enable() - Enable Shared Virtual Addressing for a device
+ * @dev: the device
+ * @sva_param: the parameters.
+ *
+ * Called by an IOMMU driver to setup the SVA parameters
+ * @sva_param is duplicated and can be freed when this function returns.
+ *
+ * Return 0 if initialization succeeded, or an error.
+ */
+int iommu_sva_enable(struct device *dev, struct iommu_sva_param *sva_param)
+{
+	int ret;
+	struct iommu_sva_param *new_param;
+	struct iommu_param *param = dev->iommu_param;
+
+	if (!param)
+		return -ENODEV;
+
+	new_param = kmemdup(sva_param, sizeof(*new_param), GFP_KERNEL);
+	if (!new_param)
+		return -ENOMEM;
+
+	mutex_lock(&param->sva_lock);
+	if (param->sva_param) {
+		ret = -EEXIST;
+		goto err_unlock;
+	}
+
+	dev->iommu_param->sva_param = new_param;
+	mutex_unlock(&param->sva_lock);
+	return 0;
+
+err_unlock:
+	mutex_unlock(&param->sva_lock);
+	kfree(new_param);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_sva_enable);
+
+/**
+ * iommu_sva_disable() - Disable Shared Virtual Addressing for a device
+ * @dev: the device
+ *
+ * IOMMU drivers call this to disable SVA.
+ */
+int iommu_sva_disable(struct device *dev)
+{
+	int ret = 0;
+	struct iommu_param *param = dev->iommu_param;
+
+	if (!param)
+		return -EINVAL;
+
+	mutex_lock(&param->sva_lock);
+	if (!param->sva_param) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	/* Require that all contexts are unbound */
+	if (param->sva_param->nr_bonds) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	kfree(param->sva_param);
+	param->sva_param = NULL;
+out_unlock:
+	mutex_unlock(&param->sva_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_sva_disable);
+
+bool iommu_sva_enabled(struct device *dev)
+{
+	bool enabled;
+	struct iommu_param *param = dev->iommu_param;
+
+	if (!param)
+		return false;
+
+	mutex_lock(&param->sva_lock);
+	enabled = !!param->sva_param;
+	mutex_unlock(&param->sva_lock);
+	return enabled;
+}
+EXPORT_SYMBOL_GPL(iommu_sva_enabled);
+
+int iommu_sva_get_pasid_generic(struct iommu_sva *handle)
+{
+	struct io_mm *io_mm;
+	int pasid = IOMMU_PASID_INVALID;
+	struct iommu_bond *bond = to_iommu_bond(handle);
+
+	rcu_read_lock();
+	io_mm = rcu_dereference(bond->io_mm);
+	if (io_mm)
+		pasid = io_mm->pasid;
+	rcu_read_unlock();
+	return pasid;
+}
+EXPORT_SYMBOL_GPL(iommu_sva_get_pasid_generic);
diff --git a/drivers/iommu/iommu-sva.h b/drivers/iommu/iommu-sva.h
new file mode 100644
index 000000000000..dd55c2db0936
--- /dev/null
+++ b/drivers/iommu/iommu-sva.h
@@ -0,0 +1,64 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SVA library for IOMMU drivers
+ */
+#ifndef _IOMMU_SVA_H
+#define _IOMMU_SVA_H
+
+#include <linux/iommu.h>
+#include <linux/kref.h>
+#include <linux/mmu_notifier.h>
+
+struct io_mm_ops {
+	/* Allocate a PASID context for an mm */
+	void *(*alloc)(struct mm_struct *mm);
+
+	/*
+	 * Attach a PASID context to a device. Write the entry into the PASID
+	 * table.
+	 *
+	 * @attach_domain is true when no other device in the IOMMU domain is
+	 *   already attached to this context. IOMMU drivers that share the
+	 *   PASID tables within a domain don't need to write the PASID entry
+	 *   when @attach_domain is false.
+	 */
+	int (*attach)(struct device *dev, int pasid, void *ctx,
+		      bool attach_domain);
+
+	/*
+	 * Detach a PASID context from a device. Clear the entry from the PASID
+	 * table and invalidate if necessary.
+	 *
+	 * @detach_domain is true when no other device in the IOMMU domain is
+	 *   still attached to this context. IOMMU drivers that share the PASID
+	 *   table within a domain don't need to clear the PASID entry when
+	 *   @detach_domain is false, only invalidate the caches.
+	 */
+	void (*detach)(struct device *dev, int pasid, void *ctx,
+		       bool detach_domain);
+
+	/* Invalidate a range of addresses. Cannot sleep. */
+	void (*invalidate)(struct device *dev, int pasid, void *ctx,
+			   unsigned long vaddr, size_t size);
+
+	/* Free a context. Cannot sleep. */
+	void (*release)(void *ctx);
+};
+
+struct iommu_sva_param {
+	u32			min_pasid;
+	u32			max_pasid;
+	int			nr_bonds;
+};
+
+struct iommu_sva *
+iommu_sva_bind_generic(struct device *dev, struct mm_struct *mm,
+		       const struct io_mm_ops *ops, void *drvdata);
+void iommu_sva_unbind_generic(struct iommu_sva *handle);
+int iommu_sva_get_pasid_generic(struct iommu_sva *handle);
+
+int iommu_sva_enable(struct device *dev, struct iommu_sva_param *sva_param);
+int iommu_sva_disable(struct device *dev);
+bool iommu_sva_enabled(struct device *dev);
+
+#endif /* _IOMMU_SVA_H */
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3e3528436e0b..c8bd972c1788 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -164,6 +164,7 @@  static struct iommu_param *iommu_get_dev_param(struct device *dev)
 		return NULL;
 
 	mutex_init(&param->lock);
+	mutex_init(&param->sva_lock);
 	dev->iommu_param = param;
 	return param;
 }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 1739f8a7a4b4..83397ae88d2d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -368,6 +368,7 @@  struct iommu_fault_param {
  * struct iommu_param - collection of per-device IOMMU data
  *
  * @fault_param: IOMMU detected device fault reporting data
+ * @sva_param: IOMMU parameter for SVA
  *
  * TODO: migrate other per device data pointers under iommu_dev_data, e.g.
  *	struct iommu_group	*iommu_group;
@@ -376,6 +377,8 @@  struct iommu_fault_param {
 struct iommu_param {
 	struct mutex lock;
 	struct iommu_fault_param *fault_param;
+	struct mutex sva_lock;
+	struct iommu_sva_param *sva_param;
 };
 
 int  iommu_device_register(struct iommu_device *iommu);