diff mbox series

[02/14] iommufd: Add iommufd_group

Message ID 2-v1-7612f88c19f5+2f21-iommufd_alloc_jgg@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Add iommufd physical device operations for replace and alloc hwpt | expand

Commit Message

Jason Gunthorpe Feb. 25, 2023, 12:27 a.m. UTC
When the hwpt to device attachment is fairly static we could get
away with the simple approach of keeping track of the groups via a
device list. But with replace this is infeasible.

Add an automatically managed struct that is 1:1 with the iommu_group
per-ictx so we can store the necessary tracking information there.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/device.c          | 128 +++++++++++++++++++++---
 drivers/iommu/iommufd/iommufd_private.h |   9 +-
 drivers/iommu/iommufd/main.c            |   2 +
 3 files changed, 123 insertions(+), 16 deletions(-)

Comments

Tian, Kevin March 2, 2023, 7:55 a.m. UTC | #1
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Saturday, February 25, 2023 8:28 AM
> 
> +
> +	/*
> +	 * All objects using a group reference must put it before their destroy
> +	 * completes
> +	 */
> +	new_igroup->ictx = ictx;

Looks the comment is not related to the code. Probably put it in the
destroy function?

> +
> +	/*
> +	 * We dropped the lock so igroup is invalid. Assume that the
> +	 * xa had NULL in it, if this guess is wrong then we will obtain
> +	 * the actual value under lock and try again once.

this reads like "if this guess is wrong" is checked outside of the lock.
I'd just remove "under lock".

> +	 */
> +	cur_igroup = NULL;
> +	xa_lock(&ictx->groups);
> +	while (true) {
> +		igroup = __xa_cmpxchg(&ictx->groups, id, cur_igroup,
> new_igroup,
> +				      GFP_KERNEL);
> +		if (xa_is_err(igroup)) {
> +			xa_unlock(&ictx->groups);
> +			iommufd_put_group(new_igroup);
> +			return ERR_PTR(xa_err(igroup));
> +		}
> +
> +		/* new_group was successfully installed */
> +		if (cur_igroup == igroup) {
> +			xa_unlock(&ictx->groups);
> +			return new_igroup;
> +		}
> +
> +		/* Check again if the current group is any good */
> +		if (igroup && igroup->group == group &&
> +		    kref_get_unless_zero(&igroup->ref)) {
> +			xa_unlock(&ictx->groups);
> +			iommufd_put_group(new_igroup);
> +			return igroup;
> +		}
> +		cur_igroup = igroup;
> +	}

Add a WARN_ON(igroup->group != group). The only valid race should
be when an existing group which is created by another device in the
same iommu group is being destroyed then we want another try
hoping that object will be removed from xarray soon. But there should
not be a case with the same slot pointing to a different iommu group.

> @@ -98,7 +195,7 @@ struct iommufd_device *iommufd_device_bind(struct
> iommufd_ctx *ictx,
>  	/* The calling driver is a user until iommufd_device_unbind() */
>  	refcount_inc(&idev->obj.users);
>  	/* group refcount moves into iommufd_device */
> -	idev->group = group;
> +	idev->igroup = igroup;

the comment about group refcount is stale now.
Jason Gunthorpe March 2, 2023, 12:51 p.m. UTC | #2
On Thu, Mar 02, 2023 at 07:55:12AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Saturday, February 25, 2023 8:28 AM
> > 
> > +
> > +	/*
> > +	 * All objects using a group reference must put it before their destroy
> > +	 * completes
> > +	 */
> > +	new_igroup->ictx = ictx;
> 
> Looks the comment is not related to the code. Probably put it in the
> destroy function?

It explains why we don't take a reference on ictx here.

> > +	cur_igroup = NULL;
> > +	xa_lock(&ictx->groups);
> > +	while (true) {
> > +		igroup = __xa_cmpxchg(&ictx->groups, id, cur_igroup,
> > new_igroup,
> > +				      GFP_KERNEL);
> > +		if (xa_is_err(igroup)) {
> > +			xa_unlock(&ictx->groups);
> > +			iommufd_put_group(new_igroup);
> > +			return ERR_PTR(xa_err(igroup));
> > +		}
> > +
> > +		/* new_group was successfully installed */
> > +		if (cur_igroup == igroup) {
> > +			xa_unlock(&ictx->groups);
> > +			return new_igroup;
> > +		}
> > +
> > +		/* Check again if the current group is any good */
> > +		if (igroup && igroup->group == group &&
> > +		    kref_get_unless_zero(&igroup->ref)) {
> > +			xa_unlock(&ictx->groups);
> > +			iommufd_put_group(new_igroup);
> > +			return igroup;
> > +		}
> > +		cur_igroup = igroup;
> > +	}
> 
> Add a WARN_ON(igroup->group != group). The only valid race should
> be when an existing group which is created by another device in the
> same iommu group is being destroyed then we want another try
> hoping that object will be removed from xarray soon. But there should
> not be a case with the same slot pointing to a different iommu group.

Yeah, that is the case, both the group checks are WARN_ON's
because we hold an iommu_group reference as long as the xarray entry
is popoulated so it should be impossible for another iommu_group
pointer to have the same ID.

> > @@ -98,7 +195,7 @@ struct iommufd_device *iommufd_device_bind(struct
> > iommufd_ctx *ictx,
> >  	/* The calling driver is a user until iommufd_device_unbind() */
> >  	refcount_inc(&idev->obj.users);
> >  	/* group refcount moves into iommufd_device */
> > -	idev->group = group;
> > +	idev->igroup = igroup;
> 
> the comment about group refcount is stale now.

You mean it should say 'igroup refcount' ?

Jason
Tian, Kevin March 3, 2023, 2:13 a.m. UTC | #3
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, March 2, 2023 8:52 PM
> 
> 
> > > @@ -98,7 +195,7 @@ struct iommufd_device
> *iommufd_device_bind(struct
> > > iommufd_ctx *ictx,
> > >  	/* The calling driver is a user until iommufd_device_unbind() */
> > >  	refcount_inc(&idev->obj.users);
> > >  	/* group refcount moves into iommufd_device */
> > > -	idev->group = group;
> > > +	idev->igroup = igroup;
> >
> > the comment about group refcount is stale now.
> 
> You mean it should say 'igroup refcount' ?
> 

The original comment refers to the refcnt of iommu group.

Now that refcnt is held by iommufd_group and no movement per se.

I'd just remove this comment.
Jason Gunthorpe March 6, 2023, 7:16 p.m. UTC | #4
On Fri, Mar 03, 2023 at 02:13:30AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Thursday, March 2, 2023 8:52 PM
> > 
> > 
> > > > @@ -98,7 +195,7 @@ struct iommufd_device
> > *iommufd_device_bind(struct
> > > > iommufd_ctx *ictx,
> > > >  	/* The calling driver is a user until iommufd_device_unbind() */
> > > >  	refcount_inc(&idev->obj.users);
> > > >  	/* group refcount moves into iommufd_device */
> > > > -	idev->group = group;
> > > > +	idev->igroup = igroup;
> > >
> > > the comment about group refcount is stale now.
> > 
> > You mean it should say 'igroup refcount' ?
> > 
> 
> The original comment refers to the refcnt of iommu group.
> 
> Now that refcnt is held by iommufd_group and no movement per se.

igroup has a reference as well and it is this reference that is now
being moved

Jason
Tian, Kevin March 7, 2023, 2:32 a.m. UTC | #5
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, March 7, 2023 3:17 AM
> 
> On Fri, Mar 03, 2023 at 02:13:30AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Thursday, March 2, 2023 8:52 PM
> > >
> > >
> > > > > @@ -98,7 +195,7 @@ struct iommufd_device
> > > *iommufd_device_bind(struct
> > > > > iommufd_ctx *ictx,
> > > > >  	/* The calling driver is a user until iommufd_device_unbind() */
> > > > >  	refcount_inc(&idev->obj.users);
> > > > >  	/* group refcount moves into iommufd_device */
> > > > > -	idev->group = group;
> > > > > +	idev->igroup = igroup;
> > > >
> > > > the comment about group refcount is stale now.
> > >
> > > You mean it should say 'igroup refcount' ?
> > >
> >
> > The original comment refers to the refcnt of iommu group.
> >
> > Now that refcnt is held by iommufd_group and no movement per se.
> 
> igroup has a reference as well and it is this reference that is now
> being moved
> 

oh, yes.
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 63b65cdfe97f29..d1e227f310e823 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -15,13 +15,110 @@  MODULE_PARM_DESC(
 	"Allow IOMMUFD to bind to devices even if the platform cannot isolate "
 	"the MSI interrupt window. Enabling this is a security weakness.");
 
+static void iommufd_group_release(struct kref *kref)
+{
+	struct iommufd_group *igroup =
+		container_of(kref, struct iommufd_group, ref);
+
+	xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup,
+		   NULL, GFP_KERNEL);
+	iommu_group_put(igroup->group);
+	kfree(igroup);
+}
+
+static void iommufd_put_group(struct iommufd_group *group)
+{
+	kref_put(&group->ref, iommufd_group_release);
+}
+
+/*
+ * iommufd needs to store some more data for each iommu_group, we keep a
+ * parallel xarray indexed by iommu_group id to hold this instead of putting it
+ * in the core structure. To keep things simple the iommufd_group memory is
+ * unique within the iommufd_ctx. This makes it easy to check there are no
+ * memory leaks.
+ */
+static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
+					       struct device *dev)
+{
+	struct iommufd_group *new_igroup;
+	struct iommufd_group *cur_igroup;
+	struct iommufd_group *igroup;
+	struct iommu_group *group;
+	unsigned int id;
+
+	group = iommu_group_get(dev);
+	if (!group)
+		return ERR_PTR(-ENODEV);
+
+	id = iommu_group_id(group);
+
+	xa_lock(&ictx->groups);
+	igroup = xa_load(&ictx->groups, id);
+	if (igroup && igroup->group == group &&
+	    kref_get_unless_zero(&igroup->ref)) {
+		xa_unlock(&ictx->groups);
+		iommu_group_put(group);
+		return igroup;
+	}
+	xa_unlock(&ictx->groups);
+
+	new_igroup = kzalloc(sizeof(*new_igroup), GFP_KERNEL);
+	if (!new_igroup) {
+		iommu_group_put(group);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	kref_init(&new_igroup->ref);
+	/* group reference moves into new_igroup */
+	new_igroup->group = group;
+
+	/*
+	 * All objects using a group reference must put it before their destroy
+	 * completes
+	 */
+	new_igroup->ictx = ictx;
+
+	/*
+	 * We dropped the lock so igroup is invalid. Assume that the
+	 * xa had NULL in it, if this guess is wrong then we will obtain
+	 * the actual value under lock and try again once.
+	 */
+	cur_igroup = NULL;
+	xa_lock(&ictx->groups);
+	while (true) {
+		igroup = __xa_cmpxchg(&ictx->groups, id, cur_igroup, new_igroup,
+				      GFP_KERNEL);
+		if (xa_is_err(igroup)) {
+			xa_unlock(&ictx->groups);
+			iommufd_put_group(new_igroup);
+			return ERR_PTR(xa_err(igroup));
+		}
+
+		/* new_group was successfully installed */
+		if (cur_igroup == igroup) {
+			xa_unlock(&ictx->groups);
+			return new_igroup;
+		}
+
+		/* Check again if the current group is any good */
+		if (igroup && igroup->group == group &&
+		    kref_get_unless_zero(&igroup->ref)) {
+			xa_unlock(&ictx->groups);
+			iommufd_put_group(new_igroup);
+			return igroup;
+		}
+		cur_igroup = igroup;
+	}
+}
+
 void iommufd_device_destroy(struct iommufd_object *obj)
 {
 	struct iommufd_device *idev =
 		container_of(obj, struct iommufd_device, obj);
 
 	iommu_device_release_dma_owner(idev->dev);
-	iommu_group_put(idev->group);
+	iommufd_put_group(idev->igroup);
 	if (!iommufd_selftest_is_mock_dev(idev->dev))
 		iommufd_ctx_put(idev->ictx);
 }
@@ -46,7 +143,7 @@  struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 					   struct device *dev, u32 *id)
 {
 	struct iommufd_device *idev;
-	struct iommu_group *group;
+	struct iommufd_group *igroup;
 	int rc;
 
 	/*
@@ -56,9 +153,9 @@  struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY))
 		return ERR_PTR(-EINVAL);
 
-	group = iommu_group_get(dev);
-	if (!group)
-		return ERR_PTR(-ENODEV);
+	igroup = iommufd_get_group(ictx, dev);
+	if (IS_ERR(igroup))
+		return ERR_CAST(igroup);
 
 	/*
 	 * For historical compat with VFIO the insecure interrupt path is
@@ -67,7 +164,7 @@  struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 	 * interrupt outside this iommufd context.
 	 */
 	if (!iommufd_selftest_is_mock_dev(dev) &&
-	    !iommu_group_has_isolated_msi(group)) {
+	    !iommu_group_has_isolated_msi(igroup->group)) {
 		if (!allow_unsafe_interrupts) {
 			rc = -EPERM;
 			goto out_group_put;
@@ -98,7 +195,7 @@  struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 	/* The calling driver is a user until iommufd_device_unbind() */
 	refcount_inc(&idev->obj.users);
 	/* group refcount moves into iommufd_device */
-	idev->group = group;
+	idev->igroup = igroup;
 
 	/*
 	 * If the caller fails after this success it must call
@@ -113,7 +210,7 @@  struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 out_release_owner:
 	iommu_device_release_dma_owner(dev);
 out_group_put:
-	iommu_group_put(group);
+	iommufd_put_group(igroup);
 	return ERR_PTR(rc);
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD);
@@ -170,14 +267,14 @@  static int iommufd_device_setup_msi(struct iommufd_device *idev,
 }
 
 static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt,
-					   struct iommu_group *group)
+					   struct iommufd_group *igroup)
 {
 	struct iommufd_device *cur_dev;
 
 	lockdep_assert_held(&hwpt->devices_lock);
 
 	list_for_each_entry(cur_dev, &hwpt->devices, devices_item)
-		if (cur_dev->group == group)
+		if (cur_dev->igroup->group == igroup->group)
 			return true;
 	return false;
 }
@@ -211,7 +308,8 @@  int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
 	}
 
 	rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev,
-						   idev->group, &sw_msi_start);
+						   idev->igroup->group,
+						   &sw_msi_start);
 	if (rc)
 		return rc;
 
@@ -223,8 +321,8 @@  int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
 	 * FIXME: Hack around missing a device-centric iommu api, only attach to
 	 * the group once for the first device that is in the group.
 	 */
-	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
-		rc = iommu_attach_group(hwpt->domain, idev->group);
+	if (!iommufd_hw_pagetable_has_group(hwpt, idev->igroup)) {
+		rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
 		if (rc)
 			goto err_unresv;
 	}
@@ -237,8 +335,8 @@  int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
 void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt,
 				 struct iommufd_device *idev)
 {
-	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group))
-		iommu_detach_group(hwpt->domain, idev->group);
+	if (!iommufd_hw_pagetable_has_group(hwpt, idev->igroup))
+		iommu_detach_group(hwpt->domain, idev->igroup->group);
 	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
 }
 
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index d523ef12890e1e..2544f10dae9aef 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -17,6 +17,7 @@  struct iommufd_device;
 struct iommufd_ctx {
 	struct file *file;
 	struct xarray objects;
+	struct xarray groups;
 
 	u8 account_mode;
 	/* Compatibility with VFIO no iommu */
@@ -262,6 +263,12 @@  void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt,
 				 struct iommufd_device *idev);
 void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
 
+struct iommufd_group {
+	struct kref ref;
+	struct iommufd_ctx *ictx;
+	struct iommu_group *group;
+};
+
 /*
  * A iommufd_device object represents the binding relationship between a
  * consuming driver and the iommufd. These objects are created/destroyed by
@@ -270,12 +277,12 @@  void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
 struct iommufd_device {
 	struct iommufd_object obj;
 	struct iommufd_ctx *ictx;
+	struct iommufd_group *igroup;
 	struct iommufd_hw_pagetable *hwpt;
 	/* Head at iommufd_hw_pagetable::devices */
 	struct list_head devices_item;
 	/* always the physical device */
 	struct device *dev;
-	struct iommu_group *group;
 	bool enforce_cache_coherency;
 };
 
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 3fbe636c3d8a69..e5ed5dfa91a0b5 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -183,6 +183,7 @@  static int iommufd_fops_open(struct inode *inode, struct file *filp)
 	}
 
 	xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
+	xa_init(&ictx->groups);
 	ictx->file = filp;
 	filp->private_data = ictx;
 	return 0;
@@ -218,6 +219,7 @@  static int iommufd_fops_release(struct inode *inode, struct file *filp)
 		if (WARN_ON(!destroyed))
 			break;
 	}
+	WARN_ON(!xa_empty(&ictx->groups));
 	kfree(ictx);
 	return 0;
 }