
[v3,12/15] iommufd: Add kAPI toward external drivers for physical devices

Message ID 12-v3-402a7d6459de+24b-iommufd_jgg@nvidia.com (mailing list archive)
State New, archived
Series IOMMUFD Generic interface

Commit Message

Jason Gunthorpe Oct. 25, 2022, 6:12 p.m. UTC
Add the four functions external drivers need to connect physical DMA to
the IOMMUFD:

iommufd_device_bind() / iommufd_device_unbind()
  Register the device with iommufd and establish security isolation.

iommufd_device_attach() / iommufd_device_detach()
  Connect a bound device to a page table.

Binding a device creates a device object ID in the uAPI; however, the
generic API provides no IOCTLs to manipulate them.
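
A rough sketch of the expected calling convention from a consuming driver
(names such as "ictx", "dev" and "ioas_id" are placeholders here, and error
handling is abbreviated):

	struct iommufd_device *idev;
	u32 dev_id, pt_id = ioas_id;	/* an IOAS or HW_PAGETABLE object ID */
	int rc;

	idev = iommufd_device_bind(ictx, dev, &dev_id);
	if (IS_ERR(idev))
		return PTR_ERR(idev);

	rc = iommufd_device_attach(idev, &pt_id, 0);
	if (rc) {
		iommufd_device_unbind(idev);
		return rc;
	}
	/* pt_id now holds the HW_PAGETABLE ID and the device may do DMA */

	iommufd_device_detach(idev);
	iommufd_device_unbind(idev);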

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/Makefile          |   1 +
 drivers/iommu/iommufd/device.c          | 399 ++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h |   5 +
 drivers/iommu/iommufd/main.c            |   3 +
 include/linux/iommufd.h                 |  13 +
 5 files changed, 421 insertions(+)
 create mode 100644 drivers/iommu/iommufd/device.c

Comments

Baolu Lu Oct. 29, 2022, 7:19 a.m. UTC | #1
On 2022/10/26 2:12, Jason Gunthorpe wrote:
> +/*
> + * When automatically managing the domains we search for a compatible domain in
> + * the iopt and if one is found use it, otherwise create a new domain.
> + * Automatic domain selection will never pick a manually created domain.
> + */
> +static int iommufd_device_auto_get_domain(struct iommufd_device *idev,
> +					  struct iommufd_ioas *ioas,
> +					  unsigned int flags)
> +{
> +	struct iommufd_hw_pagetable *hwpt;
> +	int rc;
> +
> +	/*
> +	 * There is no differentiation when domains are allocated, so any domain
> +	 * that is willing to attach to the device is interchangeable with any
> +	 * other.
> +	 */
> +	mutex_lock(&ioas->mutex);
> +	list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
> +		if (!hwpt->auto_domain ||
> +		    !refcount_inc_not_zero(&hwpt->obj.users))
> +			continue;
> +
> +		rc = iommufd_device_do_attach(idev, hwpt, flags);
> +		refcount_dec(&hwpt->obj.users);
> +		if (rc) {
> +			/*
> +			 * FIXME: Requires the series to return EINVAL for
> +			 * incompatible domain attaches.
> +			 */
> +			if (rc == -EINVAL)
> +				continue;
> +			goto out_unlock;
> +		}
> +		goto out_unlock;

Can the above code be simplified as:

		if (rc == -EINVAL)
			continue;
		goto out_unlock;
?

> +	}
> +
> +	hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev->dev);
> +	if (IS_ERR(hwpt)) {
> +		rc = PTR_ERR(hwpt);
> +		goto out_unlock;
> +	}
> +	hwpt->auto_domain = true;
> +
> +	rc = iommufd_device_do_attach(idev, hwpt, flags);
> +	if (rc)
> +		goto out_abort;
> +	list_add_tail(&hwpt->hwpt_item, &ioas->hwpt_list);
> +
> +	mutex_unlock(&ioas->mutex);
> +	iommufd_object_finalize(idev->ictx, &hwpt->obj);
> +	return 0;
> +
> +out_abort:
> +	iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj);
> +out_unlock:
> +	mutex_unlock(&ioas->mutex);
> +	return rc;
> +}

Best regards,
baolu
Tian, Kevin Nov. 5, 2022, 7:17 a.m. UTC | #2
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, October 26, 2022 2:12 AM
>
> +/**
> + * iommufd_device_bind - Bind a physical device to an iommu fd
> + * @ictx: iommufd file descriptor
> + * @dev: Pointer to a physical PCI device struct
> + * @id: Output ID number to return to userspace for this device
> + *
> + * A successful bind establishes an ownership over the device and returns
> + * struct iommufd_device pointer, otherwise returns error pointer.
> + *
> + * A driver using this API must set driver_managed_dma and must not touch
> + * the device until this routine succeeds and establishes ownership.
> + *
> + * Binding a PCI device places the entire RID under iommufd control.
> + *

Then what about non-PCI devices? It would be clearer to say that calling
this interface just places the entire physical device under iommufd control.

> +	 * FIXME: This is conceptually broken for iommufd since we want to allow
> +	 * userspace to change the domains, eg switch from an identity IOAS to a
> +	 * DMA IOAs. There is currently no way to create a MSI window that

IOAs -> IOAS

> +		rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
> +		if (rc && rc != -ENODEV)
> +			return rc;

I know this is copied from VFIO but a comment explaining why -ENODEV is
considered sane to move forward would be appreciated.

> +	/*
> +	 * Otherwise the platform has a MSI window that is not isolated. For
> +	 * historical compat with VFIO allow a module parameter to ignore the
> +	 * insecurity.
> +	 */
> +	if (!(flags & IOMMUFD_ATTACH_FLAGS_ALLOW_UNSAFE_INTERRUPT))
> +		return -EPERM;

Throw out a warning similar to vfio.

> +
> +	rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev,
> +						   idev->group, &sw_msi_start);
> +	if (rc)
> +		goto out_iova;

goto out_unlock since the function internally already called
__iopt_remove_reserved_iova() upon error.

> +	/*
> +	 * There is no differentiation when domains are allocated, so any domain
> +	 * that is willing to attach to the device is interchangeable with any
> +	 * other.
> +	 */
> +	mutex_lock(&ioas->mutex);
> +	list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
> +		if (!hwpt->auto_domain ||
> +		    !refcount_inc_not_zero(&hwpt->obj.users))

users cannot be zero at this point.

a new hwpt is added to the list with users==1 in attach.

detach first removes the hwpt and then decrements users.

Both are conducted by holding ioas->mutex.

> +			continue;
> +
> +		rc = iommufd_device_do_attach(idev, hwpt, flags);
> +		refcount_dec(&hwpt->obj.users);

With the above I also wonder whether refcount_inc/dec are just
redundant here...

> +int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id,
> +			  unsigned int flags)
> +{
> +	struct iommufd_object *pt_obj;
> +	int rc;
> +
> +	pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY);
> +	if (IS_ERR(pt_obj))
> +		return PTR_ERR(pt_obj);

Is there a reason why get_object() is not required for idev?

concurrent attach/unbind could lead to a use-after-free here given that
users for idev is incremented only at the end of the function.

> +
> +	switch (pt_obj->type) {
> +	case IOMMUFD_OBJ_HW_PAGETABLE: {
> +		struct iommufd_hw_pagetable *hwpt =
> +			container_of(pt_obj, struct iommufd_hw_pagetable, obj);
> +
> +		rc = iommufd_device_do_attach(idev, hwpt, flags);
> +		if (rc)
> +			goto out_put_pt_obj;
> +
> +		mutex_lock(&hwpt->ioas->mutex);
> +		list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list);
> +		mutex_unlock(&hwpt->ioas->mutex);
> +		break;
> +	}
> +	case IOMMUFD_OBJ_IOAS: {
> +		struct iommufd_ioas *ioas =
> +			container_of(pt_obj, struct iommufd_ioas, obj);
> +
> +		rc = iommufd_device_auto_get_domain(idev, ioas, flags);
> +		if (rc)
> +			goto out_put_pt_obj;
> +		break;
> +	}
> +	default:
> +		rc = -EINVAL;
> +		goto out_put_pt_obj;
> +	}
> +
> +	refcount_inc(&idev->obj.users);
> +	*pt_id = idev->hwpt->obj.id;

What is the value of returning the hwpt ID of an auto domain to the user?

IMHO this will add more complexity to the life cycle of auto domain.

w/o allowing it the life cycle is simple, i.e. the first attach which
doesn't find a matching domain creates a new auto domain and then the
last detach frees it.

Now if the user is allowed to populate an auto domain similar to a
user-created hwpt, then detach also needs get_object() to wait for
concurrent ioctls, similar to what iommufd_destroy() does, and more
tricks are needed when destroying the object.

If no big gain probably hiding it from userspace is simpler?

> +void iommufd_device_detach(struct iommufd_device *idev)
> +{
> +	struct iommufd_hw_pagetable *hwpt = idev->hwpt;
> +
> +	mutex_lock(&hwpt->ioas->mutex);
> +	mutex_lock(&hwpt->devices_lock);
> +	list_del(&idev->devices_item);
> +	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> +		if (list_empty(&hwpt->devices)) {
> +			iopt_table_remove_domain(&hwpt->ioas->iopt,
> +						 hwpt->domain);
> +			list_del(&hwpt->hwpt_item);
> +		}
> +		iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);

This is always required, not just for the last device in a group.
Jason Gunthorpe Nov. 7, 2022, 2:14 p.m. UTC | #3
On Sat, Oct 29, 2022 at 03:19:36PM +0800, Baolu Lu wrote:
> > +		rc = iommufd_device_do_attach(idev, hwpt, flags);
> > +		refcount_dec(&hwpt->obj.users);
> > +		if (rc) {
> > +			/*
> > +			 * FIXME: Requires the series to return EINVAL for
> > +			 * incompatible domain attaches.
> > +			 */
> > +			if (rc == -EINVAL)
> > +				continue;
> > +			goto out_unlock;
> > +		}
> > +		goto out_unlock;
> 
> Can the above code be simplified as:
> 
> 		if (rc == -EINVAL)
> 			continue;
> 		goto out_unlock;
> ?

Done

Thanks,
Jason
Jason Gunthorpe Nov. 7, 2022, 5:54 p.m. UTC | #4
On Sat, Nov 05, 2022 at 07:17:38AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Wednesday, October 26, 2022 2:12 AM
> >
> > +/**
> > + * iommufd_device_bind - Bind a physical device to an iommu fd
> > + * @ictx: iommufd file descriptor
> > + * @dev: Pointer to a physical PCI device struct
> > + * @id: Output ID number to return to userspace for this device
> > + *
> > + * A successful bind establishes an ownership over the device and returns
> > + * struct iommufd_device pointer, otherwise returns error pointer.
> > + *
> > + * A driver using this API must set driver_managed_dma and must not touch
> > + * the device until this routine succeeds and establishes ownership.
> > + *
> > + * Binding a PCI device places the entire RID under iommufd control.
> > + *
> 
> Then what about non-PCI devices? It would be clearer to say that calling
> this interface just places the entire physical device under iommufd control.

We don't know what other bus types will do with this API. The above is
guidance specifically for PCI; other bus types should add their own
guidance as it evolves.

> > +	 * FIXME: This is conceptually broken for iommufd since we want to allow
> > +	 * userspace to change the domains, eg switch from an identity IOAS to a
> > +	 * DMA IOAs. There is currently no way to create a MSI window that
> 
> IOAs -> IOAS

Done

> 
> > +		rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
> > +		if (rc && rc != -ENODEV)
> > +			return rc;
> 
> I know this is copied from VFIO but a comment explaining why -ENODEV is
> considered sane to move forward would be appreciated.

Huh. I actually don't know. It looks like it is here to detect that
the static inline for !CONFIG_IOMMU_DMA returns it.

However, if we have !CONFIG_IOMMU_DMA then I think we also don't get
interrupts at all. I.e. iommu_dma_prepare_msi() becomes a NOP and
nothing will map the ITS page into the iommu_domain.

So let's drop the ENODEV check, it looks wrong.
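
For reference, the static inline in question is the !CONFIG_IOMMU_DMA
fallback of iommu_get_msi_cookie(); roughly (quoted from memory, so treat
it as a sketch):

	static inline int iommu_get_msi_cookie(struct iommu_domain *domain,
					       dma_addr_t base)
	{
		return -ENODEV;
	}

With the check dropped, an attach that needs an MSI cookie on such a
configuration will simply fail instead of silently proceeding without an
MSI window.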

> > +	/*
> > +	 * Otherwise the platform has a MSI window that is not isolated. For
> > +	 * historical compat with VFIO allow a module parameter to ignore the
> > +	 * insecurity.
> > +	 */
> > +	if (!(flags & IOMMUFD_ATTACH_FLAGS_ALLOW_UNSAFE_INTERRUPT))
> > +		return -EPERM;
> 
> Throw out a warning similar to vfio.

Ah... That is kind of gross... But OK, we can fix it later if someone
comes up with a driver that can safely use
IOMMUFD_ATTACH_FLAGS_ALLOW_UNSAFE_INTERRUPT.

@@ -167,6 +167,11 @@ static int iommufd_device_setup_msi(struct iommufd_device *idev,
         */
        if (!(flags & IOMMUFD_ATTACH_FLAGS_ALLOW_UNSAFE_INTERRUPT))
                return -EPERM;
+       else
+               dev_warn(
+                       idev->dev,
+                       "Device interrupts cannot be isolated by the IOMMU, this platform is insecure. Use an \"allow_unsafe_interrupts\" module >
+
        return 0;
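
A consumer would then be expected to gate the flag on its own module
parameter, the way VFIO does with allow_unsafe_interrupts. A hypothetical
sketch, not part of this series:

	static bool allow_unsafe_interrupts;
	module_param(allow_unsafe_interrupts, bool, 0644);
	MODULE_PARM_DESC(allow_unsafe_interrupts,
			 "Allow attaching when the platform cannot isolate MSIs");

	rc = iommufd_device_attach(idev, &pt_id,
				   allow_unsafe_interrupts ?
				   IOMMUFD_ATTACH_FLAGS_ALLOW_UNSAFE_INTERRUPT : 0);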

> > +	rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev,
> > +						   idev->group, &sw_msi_start);
> > +	if (rc)
> > +		goto out_iova;
> 
> goto out_unlock since the function internally already called
> __iopt_remove_reserved_iova() upon error.

OK

> > +	/*
> > +	 * There is no differentiation when domains are allocated, so any domain
> > +	 * that is willing to attach to the device is interchangeable with any
> > +	 * other.
> > +	 */
> > +	mutex_lock(&ioas->mutex);
> > +	list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
> > +		if (!hwpt->auto_domain ||
> > +		    !refcount_inc_not_zero(&hwpt->obj.users))
> 
> users cannot be zero at this point.
> 
> a new hwpt is added to the list with users==1 in attach.
> 
> detach first removes the hwpt and then decrements users.
> 
> Both are conducted by holding ioas->mutex.
> > +			continue;
> > +
> > +		rc = iommufd_device_do_attach(idev, hwpt, flags);
> > +		refcount_dec(&hwpt->obj.users);
> 
> With the above I also wonder whether refcount_inc/dec are just
> redundant here...

Yes, it seems so (in earlier versions this was not true, but it is now):

@@ -267,12 +272,11 @@ static int iommufd_device_auto_get_domain(struct iommufd_device *idev,
         */
        mutex_lock(&ioas->mutex);
        list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
-               if (!hwpt->auto_domain ||
-                   !refcount_inc_not_zero(&hwpt->obj.users))
+               if (!hwpt->auto_domain)
                        continue;
 
                rc = iommufd_device_do_attach(idev, hwpt, flags);
-               refcount_dec(&hwpt->obj.users);
+
 
> > +int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id,
> > +			  unsigned int flags)
> > +{
> > +	struct iommufd_object *pt_obj;
> > +	int rc;
> > +
> > +	pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY);
> > +	if (IS_ERR(pt_obj))
> > +		return PTR_ERR(pt_obj);
> 
> Is there a reason why get_object() is not required for idev?

The caller has to hold a legal reference because the caller is
responsible for the pointer.

> concurrent attach/unbind could lead to a use-after-free here given that
> users for idev is incremented only at the end of the function.

Caller bug. The caller is not allowed to continue to use idev while it is
destroying it by calling iommufd_device_unbind().

> > +	default:
> > +		rc = -EINVAL;
> > +		goto out_put_pt_obj;
> > +	}
> > +
> > +	refcount_inc(&idev->obj.users);
> > +	*pt_id = idev->hwpt->obj.id;
> 
> What is the value of returning the hwpt ID of an auto domain to the user?

It allows the userspace to potentially query the auto domain, if we
decide on some query later.

> IMHO this will add more complexity to the life cycle of auto domain.

Not really, the userspace still controls the lifecycle; the hwpt id is
returned and is valid until userspace commands the device it just
bound to become unbound.

> Now if the user is allowed to populate an auto domain similar to a
> user-created hwpt, then detach also needs get_object() to wait for
> concurrent ioctls, similar to what iommufd_destroy() does, and more
> tricks are needed when destroying the object.

"autodomain" means the hwpt is destroyed when its device list becomes
empty. This is all protected by the ioas->mutex, so we don't need
additional refcounting or locking.
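
For reference, this is the teardown choice at the end of
iommufd_device_detach() in this patch:

	if (hwpt->auto_domain)
		iommufd_object_destroy_user(idev->ictx, &hwpt->obj);
	else
		refcount_dec(&hwpt->obj.users);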

> > +void iommufd_device_detach(struct iommufd_device *idev)
> > +{
> > +	struct iommufd_hw_pagetable *hwpt = idev->hwpt;
> > +
> > +	mutex_lock(&hwpt->ioas->mutex);
> > +	mutex_lock(&hwpt->devices_lock);
> > +	list_del(&idev->devices_item);
> > +	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> > +		if (list_empty(&hwpt->devices)) {
> > +			iopt_table_remove_domain(&hwpt->ioas->iopt,
> > +						 hwpt->domain);
> > +			list_del(&hwpt->hwpt_item);
> > +		}
> > +		iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
> 
> This is always required, not just for the last device in a group.

Yep, good catch

Thanks,
Jason
Tian, Kevin Nov. 8, 2022, 2:17 a.m. UTC | #5
Alex, is there any trick behind the -ENODEV check that is being
overlooked here?

> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, November 8, 2022 1:55 AM
> >
> > > +		rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
> > > +		if (rc && rc != -ENODEV)
> > > +			return rc;
> >
> > I know this is copied from VFIO but a comment is appreciated why
> > -ENODEV is considered sane to move forward.
> 
> Huh. I actually don't know. It looks like it is here to detect that
> the static inline for !CONFIG_IOMMU_DMA returns it.
> 
> However, if we have !CONFIG_IOMMU_DMA then I think we also don't get
> interrupts at all. I.e. iommu_dma_prepare_msi() becomes a NOP and
> nothing will map the ITS page into the iommu_domain.
> 
> So let's drop the ENODEV check, it looks wrong.
>

Patch

diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index e13e971aa28c60..ca28a135b9675f 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -1,5 +1,6 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 iommufd-y := \
+	device.o \
 	hw_pagetable.o \
 	io_pagetable.o \
 	ioas.o \
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
new file mode 100644
index 00000000000000..b572b01f67b7b5
--- /dev/null
+++ b/drivers/iommu/iommufd/device.c
@@ -0,0 +1,399 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/iommufd.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/irqdomain.h>
+
+#include "iommufd_private.h"
+
+/*
+ * An iommufd_device object represents the binding relationship between a
+ * consuming driver and the iommufd. These objects are created/destroyed by
+ * external drivers, not by userspace.
+ */
+struct iommufd_device {
+	struct iommufd_object obj;
+	struct iommufd_ctx *ictx;
+	struct iommufd_hw_pagetable *hwpt;
+	/* Head at iommufd_hw_pagetable::devices */
+	struct list_head devices_item;
+	/* always the physical device */
+	struct device *dev;
+	struct iommu_group *group;
+	bool enforce_cache_coherency;
+};
+
+void iommufd_device_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_device *idev =
+		container_of(obj, struct iommufd_device, obj);
+
+	iommu_device_release_dma_owner(idev->dev);
+	iommu_group_put(idev->group);
+	iommufd_ctx_put(idev->ictx);
+}
+
+/**
+ * iommufd_device_bind - Bind a physical device to an iommu fd
+ * @ictx: iommufd file descriptor
+ * @dev: Pointer to a physical PCI device struct
+ * @id: Output ID number to return to userspace for this device
+ *
+ * A successful bind establishes an ownership over the device and returns
+ * struct iommufd_device pointer, otherwise returns error pointer.
+ *
+ * A driver using this API must set driver_managed_dma and must not touch
+ * the device until this routine succeeds and establishes ownership.
+ *
+ * Binding a PCI device places the entire RID under iommufd control.
+ *
+ * The caller must undo this with iommufd_device_unbind()
+ */
+struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
+					   struct device *dev, u32 *id)
+{
+	struct iommufd_device *idev;
+	struct iommu_group *group;
+	int rc;
+
+	/*
+	 * iommufd always sets IOMMU_CACHE because we offer no way for userspace
+	 * to restore cache coherency.
+	 */
+	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY))
+		return ERR_PTR(-EINVAL);
+
+	group = iommu_group_get(dev);
+	if (!group)
+		return ERR_PTR(-ENODEV);
+
+	rc = iommu_device_claim_dma_owner(dev, ictx);
+	if (rc)
+		goto out_group_put;
+
+	idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE);
+	if (IS_ERR(idev)) {
+		rc = PTR_ERR(idev);
+		goto out_release_owner;
+	}
+	idev->ictx = ictx;
+	iommufd_ctx_get(ictx);
+	idev->dev = dev;
+	idev->enforce_cache_coherency =
+		device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
+	/* The calling driver is a user until iommufd_device_unbind() */
+	refcount_inc(&idev->obj.users);
+	/* group refcount moves into iommufd_device */
+	idev->group = group;
+
+	/*
+	 * If the caller fails after this success it must call
+	 * iommufd_device_unbind() which is safe since we hold this refcount.
+	 * This also means the device is a leaf in the graph and no other object
+	 * can take a reference on it.
+	 */
+	iommufd_object_finalize(ictx, &idev->obj);
+	*id = idev->obj.id;
+	return idev;
+
+out_release_owner:
+	iommu_device_release_dma_owner(dev);
+out_group_put:
+	iommu_group_put(group);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD);
+
+void iommufd_device_unbind(struct iommufd_device *idev)
+{
+	bool was_destroyed;
+
+	was_destroyed = iommufd_object_destroy_user(idev->ictx, &idev->obj);
+	WARN_ON(!was_destroyed);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD);
+
+static int iommufd_device_setup_msi(struct iommufd_device *idev,
+				    struct iommufd_hw_pagetable *hwpt,
+				    phys_addr_t sw_msi_start,
+				    unsigned int flags)
+{
+	int rc;
+
+	/*
+	 * IOMMU_CAP_INTR_REMAP means that the platform is isolating MSI, and it
+	 * creates the MSI window by default in the iommu domain. Nothing
+	 * further to do.
+	 */
+	if (device_iommu_capable(idev->dev, IOMMU_CAP_INTR_REMAP))
+		return 0;
+
+	/*
+	 * On ARM systems that set the global IRQ_DOMAIN_FLAG_MSI_REMAP every
+	 * allocated iommu_domain will block interrupts by default and this
+	 * special flow is needed to turn them back on. iommu_dma_prepare_msi()
+	 * will install pages into our domain after request_irq() to make this
+	 * work.
+	 *
+	 * FIXME: This is conceptually broken for iommufd since we want to allow
+	 * userspace to change the domains, eg switch from an identity IOAS to a
+	 * DMA IOAs. There is currently no way to create a MSI window that
+	 * matches what the IRQ layer actually expects in a newly created
+	 * domain.
+	 */
+	if (irq_domain_check_msi_remap()) {
+		if (WARN_ON(!sw_msi_start))
+			return -EPERM;
+		/*
+		 * iommu_get_msi_cookie() can only be called once per domain,
+		 * it returns -EBUSY on later calls.
+		 */
+		if (hwpt->msi_cookie)
+			return 0;
+		rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
+		if (rc && rc != -ENODEV)
+			return rc;
+		hwpt->msi_cookie = true;
+		return 0;
+	}
+
+	/*
+	 * Otherwise the platform has a MSI window that is not isolated. For
+	 * historical compat with VFIO allow a module parameter to ignore the
+	 * insecurity.
+	 */
+	if (!(flags & IOMMUFD_ATTACH_FLAGS_ALLOW_UNSAFE_INTERRUPT))
+		return -EPERM;
+	return 0;
+}
+
+static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt,
+					   struct iommu_group *group)
+{
+	struct iommufd_device *cur_dev;
+
+	list_for_each_entry(cur_dev, &hwpt->devices, devices_item)
+		if (cur_dev->group == group)
+			return true;
+	return false;
+}
+
+static int iommufd_device_do_attach(struct iommufd_device *idev,
+				    struct iommufd_hw_pagetable *hwpt,
+				    unsigned int flags)
+{
+	phys_addr_t sw_msi_start = 0;
+	int rc;
+
+	mutex_lock(&hwpt->devices_lock);
+
+	/*
+	 * Try to upgrade the domain we have, it is an iommu driver bug to
+	 * report IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail
+	 * enforce_cache_coherency when there are no devices attached to the
+	 * domain.
+	 */
+	if (idev->enforce_cache_coherency && !hwpt->enforce_cache_coherency) {
+		if (hwpt->domain->ops->enforce_cache_coherency)
+			hwpt->enforce_cache_coherency =
+				hwpt->domain->ops->enforce_cache_coherency(
+					hwpt->domain);
+		if (!hwpt->enforce_cache_coherency) {
+			WARN_ON(list_empty(&hwpt->devices));
+			rc = -EINVAL;
+			goto out_unlock;
+		}
+	}
+
+	rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev,
+						   idev->group, &sw_msi_start);
+	if (rc)
+		goto out_iova;
+
+	rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start, flags);
+	if (rc)
+		goto out_iova;
+
+	/*
+	 * FIXME: Hack around missing a device-centric iommu api, only attach to
+	 * the group once for the first device that is in the group.
+	 */
+	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
+		rc = iommu_attach_group(hwpt->domain, idev->group);
+		if (rc)
+			goto out_iova;
+	}
+
+	if (list_empty(&hwpt->devices)) {
+		rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain);
+		if (rc)
+			goto out_detach;
+	}
+
+	idev->hwpt = hwpt;
+	refcount_inc(&hwpt->obj.users);
+	list_add(&idev->devices_item, &hwpt->devices);
+	mutex_unlock(&hwpt->devices_lock);
+	return 0;
+
+out_detach:
+	iommu_detach_group(hwpt->domain, idev->group);
+out_iova:
+	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+out_unlock:
+	mutex_unlock(&hwpt->devices_lock);
+	return rc;
+}
+
+/*
+ * When automatically managing the domains we search for a compatible domain in
+ * the iopt and if one is found use it, otherwise create a new domain.
+ * Automatic domain selection will never pick a manually created domain.
+ */
+static int iommufd_device_auto_get_domain(struct iommufd_device *idev,
+					  struct iommufd_ioas *ioas,
+					  unsigned int flags)
+{
+	struct iommufd_hw_pagetable *hwpt;
+	int rc;
+
+	/*
+	 * There is no differentiation when domains are allocated, so any domain
+	 * that is willing to attach to the device is interchangeable with any
+	 * other.
+	 */
+	mutex_lock(&ioas->mutex);
+	list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
+		if (!hwpt->auto_domain ||
+		    !refcount_inc_not_zero(&hwpt->obj.users))
+			continue;
+
+		rc = iommufd_device_do_attach(idev, hwpt, flags);
+		refcount_dec(&hwpt->obj.users);
+		if (rc) {
+			/*
+			 * FIXME: Requires the series to return EINVAL for
+			 * incompatible domain attaches.
+			 */
+			if (rc == -EINVAL)
+				continue;
+			goto out_unlock;
+		}
+		goto out_unlock;
+	}
+
+	hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev->dev);
+	if (IS_ERR(hwpt)) {
+		rc = PTR_ERR(hwpt);
+		goto out_unlock;
+	}
+	hwpt->auto_domain = true;
+
+	rc = iommufd_device_do_attach(idev, hwpt, flags);
+	if (rc)
+		goto out_abort;
+	list_add_tail(&hwpt->hwpt_item, &ioas->hwpt_list);
+
+	mutex_unlock(&ioas->mutex);
+	iommufd_object_finalize(idev->ictx, &hwpt->obj);
+	return 0;
+
+out_abort:
+	iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj);
+out_unlock:
+	mutex_unlock(&ioas->mutex);
+	return rc;
+}
+
+/**
+ * iommufd_device_attach - Connect a device to an iommu_domain
+ * @idev: device to attach
+ * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
+ *         Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ * @flags: Optional flags
+ *
+ * This connects the device to an iommu_domain, either automatically or manually
+ * selected. Once this completes the device could do DMA.
+ *
+ * The caller should return the resulting pt_id back to userspace.
+ * This function is undone by calling iommufd_device_detach().
+ */
+int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id,
+			  unsigned int flags)
+{
+	struct iommufd_object *pt_obj;
+	int rc;
+
+	pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY);
+	if (IS_ERR(pt_obj))
+		return PTR_ERR(pt_obj);
+
+	switch (pt_obj->type) {
+	case IOMMUFD_OBJ_HW_PAGETABLE: {
+		struct iommufd_hw_pagetable *hwpt =
+			container_of(pt_obj, struct iommufd_hw_pagetable, obj);
+
+		rc = iommufd_device_do_attach(idev, hwpt, flags);
+		if (rc)
+			goto out_put_pt_obj;
+
+		mutex_lock(&hwpt->ioas->mutex);
+		list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list);
+		mutex_unlock(&hwpt->ioas->mutex);
+		break;
+	}
+	case IOMMUFD_OBJ_IOAS: {
+		struct iommufd_ioas *ioas =
+			container_of(pt_obj, struct iommufd_ioas, obj);
+
+		rc = iommufd_device_auto_get_domain(idev, ioas, flags);
+		if (rc)
+			goto out_put_pt_obj;
+		break;
+	}
+	default:
+		rc = -EINVAL;
+		goto out_put_pt_obj;
+	}
+
+	refcount_inc(&idev->obj.users);
+	*pt_id = idev->hwpt->obj.id;
+	rc = 0;
+
+out_put_pt_obj:
+	iommufd_put_object(pt_obj);
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
+
+void iommufd_device_detach(struct iommufd_device *idev)
+{
+	struct iommufd_hw_pagetable *hwpt = idev->hwpt;
+
+	mutex_lock(&hwpt->ioas->mutex);
+	mutex_lock(&hwpt->devices_lock);
+	list_del(&idev->devices_item);
+	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
+		if (list_empty(&hwpt->devices)) {
+			iopt_table_remove_domain(&hwpt->ioas->iopt,
+						 hwpt->domain);
+			list_del(&hwpt->hwpt_item);
+		}
+		iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+		iommu_detach_group(hwpt->domain, idev->group);
+	}
+	mutex_unlock(&hwpt->devices_lock);
+	mutex_unlock(&hwpt->ioas->mutex);
+
+	if (hwpt->auto_domain)
+		iommufd_object_destroy_user(idev->ictx, &hwpt->obj);
+	else
+		refcount_dec(&hwpt->obj.users);
+
+	idev->hwpt = NULL;
+
+	refcount_dec(&idev->obj.users);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD);
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index e1521764a335e6..e982efca161699 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -102,6 +102,7 @@  static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
 enum iommufd_object_type {
 	IOMMUFD_OBJ_NONE,
 	IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
+	IOMMUFD_OBJ_DEVICE,
 	IOMMUFD_OBJ_HW_PAGETABLE,
 	IOMMUFD_OBJ_IOAS,
 };
@@ -227,6 +228,8 @@  struct iommufd_hw_pagetable {
 	struct iommufd_ioas *ioas;
 	struct iommu_domain *domain;
 	bool auto_domain : 1;
+	bool enforce_cache_coherency : 1;
+	bool msi_cookie : 1;
 	/* Head at iommufd_ioas::hwpt_list */
 	struct list_head hwpt_item;
 	struct mutex devices_lock;
@@ -238,4 +241,6 @@  iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 			   struct device *dev);
 void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
 
+void iommufd_device_destroy(struct iommufd_object *obj);
+
 #endif
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 84d315b9e73845..774c286da04d38 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -352,6 +352,9 @@  void iommufd_ctx_put(struct iommufd_ctx *ictx)
 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);
 
 static struct iommufd_object_ops iommufd_object_ops[] = {
+	[IOMMUFD_OBJ_DEVICE] = {
+		.destroy = iommufd_device_destroy,
+	},
 	[IOMMUFD_OBJ_IOAS] = {
 		.destroy = iommufd_ioas_destroy,
 	},
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 26e09d539737bb..31efacd8a46cce 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -9,10 +9,23 @@ 
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/device.h>
 
+struct iommufd_device;
 struct iommufd_ctx;
 struct file;
 
+struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
+					   struct device *dev, u32 *id);
+void iommufd_device_unbind(struct iommufd_device *idev);
+
+enum {
+	IOMMUFD_ATTACH_FLAGS_ALLOW_UNSAFE_INTERRUPT = 1 << 0,
+};
+int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id,
+			  unsigned int flags);
+void iommufd_device_detach(struct iommufd_device *idev);
+
 enum {
 	IOMMUFD_ACCESS_RW_READ = 0,
 	IOMMUFD_ACCESS_RW_WRITE = 1 << 0,