diff mbox series

[v5,01/13] iommufd/viommu: Add IOMMUFD_OBJ_VDEVICE and IOMMU_VDEVICE_ALLOC ioctl

Message ID 53025c827c44d68edb6469bfd940a8e8bc6147a5.1729897278.git.nicolinc@nvidia.com (mailing list archive)
State New
Headers show
Series iommufd: Add vIOMMU infrastructure (Part-2: vDEVICE) | expand

Commit Message

Nicolin Chen Oct. 25, 2024, 11:50 p.m. UTC
Introduce a new IOMMUFD_OBJ_VDEVICE to represent a physical device, i.e.
iommufd_device (idev) object, against an iommufd_viommu (vIOMMU) object in
the VM. This vDEVICE object (and its structure) holds all the information
and attributes in a VM, regarding the device related to the vIOMMU.

As an initial patch, add a per-vIOMMU virtual ID. This can be:
 - Virtual StreamID on a nested ARM SMMUv3, an index to a Stream Table
 - Virtual DeviceID on a nested AMD IOMMU, an index to a Device Table
 - Virtual ID on a nested Intel VT-d IOMMU, an index to a Context Table
Potentially, this vDEVICE structure would hold some vData for Confidential
Compute Architecture (CCA). Use this virtual ID to index a "vdevs" xarray
that belongs to a vIOMMU object.

Add a new ioctl for vDEVICE allocations. Since a vDEVICE is a connection
of an iommufd_device object and an iommufd_viommu object, require both as
the ioctl inputs and take refcounts in the ioctl handler.

Then, let the idev structure hold the allocated vdev pointer with a proper
locking protection.

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h |  20 +++++
 include/linux/iommufd.h                 |   3 +
 include/uapi/linux/iommufd.h            |  26 ++++++
 drivers/iommu/iommufd/device.c          |  11 +++
 drivers/iommu/iommufd/main.c            |   7 ++
 drivers/iommu/iommufd/viommu.c          | 101 ++++++++++++++++++++++++
 6 files changed, 168 insertions(+)

Comments

Tian, Kevin Oct. 28, 2024, 3:11 a.m. UTC | #1
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Saturday, October 26, 2024 7:51 AM
> 
> +
> +/**
> + * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC)
> + * @size: sizeof(struct iommu_vdevice_alloc)
> + * @viommu_id: vIOMMU ID to associate with the virtual device
> + * @dev_id: The pyhsical device to allocate a virtual instance on the
> vIOMMU

s/pyhsical/physical/, or just say "iommufd device"

> +
> +int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_vdevice_alloc *cmd = ucmd->cmd;
> +	struct iommufd_vdevice *vdev, *curr;
> +	struct iommufd_viommu *viommu;
> +	struct iommufd_device *idev;
> +	u64 virt_id = cmd->virt_id;
> +	int rc = 0;
> +
> +	if (virt_id > ULONG_MAX)
> +		return -EINVAL;
> +
> +	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
> +	if (IS_ERR(viommu))
> +		return PTR_ERR(viommu);
> +
> +	idev = iommufd_get_device(ucmd, cmd->dev_id);
> +	if (IS_ERR(idev)) {
> +		rc = PTR_ERR(idev);
> +		goto out_put_viommu;
> +	}
> +
> +	mutex_lock(&idev->igroup->lock);
> +	if (idev->vdev) {
> +		rc = -EEXIST;
> +		goto out_unlock_igroup;
> +	}
> +
> +	vdev = iommufd_object_alloc(ucmd->ictx, vdev,
> IOMMUFD_OBJ_VDEVICE);
> +	if (IS_ERR(vdev)) {
> +		rc = PTR_ERR(vdev);
> +		goto out_unlock_igroup;
> +	}
> +

Also need to check that the device and the viommu are associated
with the same physical IOMMU.

> +	rc = iommufd_verify_unfinalized_object(ucmd->ictx, &vdev->obj);
> +	if (rc) {
> +		kfree(vdev);
> +		goto out_unlock_igroup;
> +	}
> +
> +	vdev->idev = idev;
> +	vdev->id = virt_id;
> +	vdev->viommu = viommu;
> +
> +	idev->vdev = vdev;
> +	refcount_inc(&idev->obj.users);
> +	refcount_inc(&viommu->obj.users);
> +
> +	curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev,
> GFP_KERNEL);
> +	if (curr) {
> +		rc = xa_err(curr) ?: -EBUSY;
> +		goto out_abort;
> +	}
> +
> +	cmd->out_vdevice_id = vdev->obj.id;
> +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
> +	if (rc)
> +		goto out_abort;
> +	iommufd_object_finalize(ucmd->ictx, &vdev->obj);
> +	goto out_unlock_igroup;
> +
> +out_abort:
> +	iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
> +out_unlock_igroup:
> +	mutex_unlock(&idev->igroup->lock);
> +	iommufd_put_object(ucmd->ictx, &idev->obj);
> +out_put_viommu:
> +	iommufd_put_object(ucmd->ictx, &viommu->obj);
> +	return rc;
> +}
> --
> 2.43.0
Nicolin Chen Oct. 28, 2024, 8:18 p.m. UTC | #2
On Mon, Oct 28, 2024 at 03:11:32AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Saturday, October 26, 2024 7:51 AM
> >
> > +
> > +/**
> > + * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC)
> > + * @size: sizeof(struct iommu_vdevice_alloc)
> > + * @viommu_id: vIOMMU ID to associate with the virtual device
> > + * @dev_id: The pyhsical device to allocate a virtual instance on the
> > vIOMMU
> 
> s/pyhsical/physical/, or just say 'iommufd device"

Ack for "physical", aligning with other @dev_id lines.

> > +
> > +int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
> > +{
> > +     struct iommu_vdevice_alloc *cmd = ucmd->cmd;
> > +     struct iommufd_vdevice *vdev, *curr;
> > +     struct iommufd_viommu *viommu;
> > +     struct iommufd_device *idev;
> > +     u64 virt_id = cmd->virt_id;
> > +     int rc = 0;
> > +
> > +     if (virt_id > ULONG_MAX)
> > +             return -EINVAL;
> > +
> > +     viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
> > +     if (IS_ERR(viommu))
> > +             return PTR_ERR(viommu);
> > +
> > +     idev = iommufd_get_device(ucmd, cmd->dev_id);
> > +     if (IS_ERR(idev)) {
> > +             rc = PTR_ERR(idev);
> > +             goto out_put_viommu;
> > +     }
> > +
> > +     mutex_lock(&idev->igroup->lock);
> > +     if (idev->vdev) {
> > +             rc = -EEXIST;
> > +             goto out_unlock_igroup;
> > +     }
> > +
> > +     vdev = iommufd_object_alloc(ucmd->ictx, vdev,
> > IOMMUFD_OBJ_VDEVICE);
> > +     if (IS_ERR(vdev)) {
> > +             rc = PTR_ERR(vdev);
> > +             goto out_unlock_igroup;
> > +     }
> > +
> 
> also need to check that the device and the viommu are associated
> to a same physical iommu.

Ack. Will add this prior to mutex_lock(&idev->igroup->lock);

+       if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) {
+               rc = -EINVAL;
+               goto out_put_idev;
+       }

Thanks!
Nicolin
Jason Gunthorpe Oct. 29, 2024, 3:58 p.m. UTC | #3
On Fri, Oct 25, 2024 at 04:50:30PM -0700, Nicolin Chen wrote:
> +/**
> + * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC)
> + * @size: sizeof(struct iommu_vdevice_alloc)
> + * @viommu_id: vIOMMU ID to associate with the virtual device
> + * @dev_id: The pyhsical device to allocate a virtual instance on the vIOMMU
> + * @__reserved: Must be 0
> + * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID
> + *           of AMD IOMMU, and vID of a nested Intel VT-d to a Context Table.
> + * @out_vdevice_id: Output virtual instance ID for the allocated object

How about:

@out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTROY


> + * Allocate a virtual device instance (for a physical device) against a vIOMMU.
> + * This instance holds the device's information (related to its vIOMMU) in a VM.
> + */
> +struct iommu_vdevice_alloc {
> +	__u32 size;
> +	__u32 viommu_id;
> +	__u32 dev_id;
> +	__u32 __reserved;
> +	__aligned_u64 virt_id;
> +	__u32 out_vdevice_id;
> +	__u32 __reserved2;

Let's not have two u32 reserved fields; put out_vdevice_id above virt_id

> diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> index 5fd3dd420290..e50113305a9c 100644
> --- a/drivers/iommu/iommufd/device.c
> +++ b/drivers/iommu/iommufd/device.c
> @@ -277,6 +277,17 @@ EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD);
>   */
>  void iommufd_device_unbind(struct iommufd_device *idev)
>  {
> +	u32 vdev_id = 0;
> +
> +	/* idev->vdev object should be destroyed prior, yet just in case.. */
> +	mutex_lock(&idev->igroup->lock);
> +	if (idev->vdev)

Then should it have a WARN_ON here?

> +		vdev_id = idev->vdev->obj.id;
> +	mutex_unlock(&idev->igroup->lock);
> +	/* Relying on xa_lock against a race with iommufd_destroy() */
> +	if (vdev_id)
> +		iommufd_object_remove(idev->ictx, NULL, vdev_id, 0);

That doesn't seem right, iommufd_object_remove() should never be used
to destroy an object that userspace created with an IOCTL, in fact
that just isn't allowed.

Ugh, there is worse here, we can't hold a long term reference on a
kernel owned object:

	idev->vdev = vdev;
	refcount_inc(&idev->obj.users);

As it prevents the kernel from disconnecting it.

I came up with this that seems like it will work. Maybe we will need
to improve it later. Instead of using the idev, just keep the raw
struct device. We can hold a refcount on the struct device without
races. There is no need for the idev igroup lock since the xa_lock
does everything we need.

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index e50113305a9c47..5fd3dd42029015 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -277,17 +277,6 @@ EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD);
  */
 void iommufd_device_unbind(struct iommufd_device *idev)
 {
-	u32 vdev_id = 0;
-
-	/* idev->vdev object should be destroyed prior, yet just in case.. */
-	mutex_lock(&idev->igroup->lock);
-	if (idev->vdev)
-		vdev_id = idev->vdev->obj.id;
-	mutex_unlock(&idev->igroup->lock);
-	/* Relying on xa_lock against a race with iommufd_destroy() */
-	if (vdev_id)
-		iommufd_object_remove(idev->ictx, NULL, vdev_id, 0);
-
 	iommufd_object_destroy_user(idev->ictx, &idev->obj);
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD);
diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
index 9849474f429f98..6e870bce2a0cd0 100644
--- a/drivers/iommu/iommufd/driver.c
+++ b/drivers/iommu/iommufd/driver.c
@@ -46,6 +46,6 @@ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 	lockdep_assert_held(&viommu->vdevs.xa_lock);
 
 	vdev = xa_load(&viommu->vdevs, vdev_id);
-	return vdev ? vdev->idev->dev : NULL;
+	return vdev ? vdev->dev : NULL;
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, IOMMUFD);
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 365cf5a56cdf20..275f954235940c 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -152,9 +152,6 @@ static inline void iommufd_put_object(struct iommufd_ctx *ictx,
 		wake_up_interruptible_all(&ictx->destroy_wait);
 }
 
-int iommufd_verify_unfinalized_object(struct iommufd_ctx *ictx,
-				      struct iommufd_object *to_verify);
-
 void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj);
 void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
 				      struct iommufd_object *obj);
@@ -391,7 +388,6 @@ struct iommufd_device {
 	struct iommufd_object obj;
 	struct iommufd_ctx *ictx;
 	struct iommufd_group *igroup;
-	struct iommufd_vdevice *vdev;
 	struct list_head group_item;
 	/* always the physical device */
 	struct device *dev;
@@ -523,7 +519,7 @@ void iommufd_vdevice_abort(struct iommufd_object *obj);
 struct iommufd_vdevice {
 	struct iommufd_object obj;
 	struct iommufd_ctx *ictx;
-	struct iommufd_device *idev;
+	struct device *dev;
 	struct iommufd_viommu *viommu;
 	u64 id; /* per-vIOMMU virtual ID */
 };
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 696ac9e0e74b89..c90fe15af98be4 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -43,9 +43,10 @@ void iommufd_object_finalize(struct iommufd_ctx *ictx,
 {
 	void *old;
 
-	old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL);
+	old = xa_cmpxchg(&ictx->objects, obj->id, XA_ZERO_ENTRY, obj,
+			 GFP_KERNEL);
 	/* obj->id was returned from xa_alloc() so the xa_store() cannot fail */
-	WARN_ON(old);
+	WARN_ON(old != XA_ZERO_ENTRY);
 }
 
 /* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */
@@ -89,26 +90,6 @@ struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
 	return obj;
 }
 
-int iommufd_verify_unfinalized_object(struct iommufd_ctx *ictx,
-				      struct iommufd_object *to_verify)
-{
-	XA_STATE(xas, &ictx->objects, 0);
-	struct iommufd_object *obj;
-	int rc = 0;
-
-	if (!to_verify || !to_verify->id)
-		return -EINVAL;
-	xas.xa_index = to_verify->id;
-
-	xa_lock(&ictx->objects);
-	obj = xas_load(&xas);
-	/* Being an unfinalized object, the loaded obj is a reserved space */
-	if (obj != XA_ZERO_ENTRY)
-		rc = -ENOENT;
-	xa_unlock(&ictx->objects);
-	return rc;
-}
-
 static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx,
 					     struct iommufd_object *to_destroy)
 {
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 2b9a9a80298d8e..e7385676f17659 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -55,12 +55,6 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 		goto out_put_hwpt;
 	}
 
-	rc = iommufd_verify_unfinalized_object(ucmd->ictx, &viommu->obj);
-	if (rc) {
-		kfree(viommu);
-		goto out_put_hwpt;
-	}
-
 	viommu->type = cmd->type;
 	viommu->ictx = ucmd->ictx;
 	viommu->hwpt = hwpt_paging;
@@ -95,27 +89,18 @@ void iommufd_vdevice_abort(struct iommufd_object *obj)
 	struct iommufd_vdevice *old,
 		*vdev = container_of(obj, struct iommufd_vdevice, obj);
 	struct iommufd_viommu *viommu = vdev->viommu;
-	struct iommufd_device *idev = vdev->idev;
-
-	lockdep_assert_held(&idev->igroup->lock);
 
 	old = xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
 	if (old)
 		WARN_ON(old != vdev);
 
 	refcount_dec(&viommu->obj.users);
-	refcount_dec(&idev->obj.users);
-	idev->vdev = NULL;
+	put_device(vdev->dev);
 }
 
 void iommufd_vdevice_destroy(struct iommufd_object *obj)
 {
-	struct iommufd_vdevice *vdev =
-		container_of(obj, struct iommufd_vdevice, obj);
-
-	mutex_lock(&vdev->idev->igroup->lock);
 	iommufd_vdevice_abort(obj);
-	mutex_unlock(&vdev->idev->igroup->lock);
 }
 
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
@@ -140,30 +125,16 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 		goto out_put_viommu;
 	}
 
-	mutex_lock(&idev->igroup->lock);
-	if (idev->vdev) {
-		rc = -EEXIST;
-		goto out_unlock_igroup;
-	}
-
 	vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE);
 	if (IS_ERR(vdev)) {
 		rc = PTR_ERR(vdev);
 		goto out_unlock_igroup;
 	}
 
-	rc = iommufd_verify_unfinalized_object(ucmd->ictx, &vdev->obj);
-	if (rc) {
-		kfree(vdev);
-		goto out_unlock_igroup;
-	}
-
-	vdev->idev = idev;
 	vdev->id = virt_id;
+	vdev->dev = idev->dev;
+	get_device(idev->dev);
 	vdev->viommu = viommu;
-
-	idev->vdev = vdev;
-	refcount_inc(&idev->obj.users);
 	refcount_inc(&viommu->obj.users);
 
 	curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL);
@@ -182,7 +153,6 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 out_abort:
 	iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
 out_unlock_igroup:
-	mutex_unlock(&idev->igroup->lock);
 	iommufd_put_object(ucmd->ictx, &idev->obj);
 out_put_viommu:
 	iommufd_put_object(ucmd->ictx, &viommu->obj);
Nicolin Chen Oct. 29, 2024, 5:29 p.m. UTC | #4
On Tue, Oct 29, 2024 at 12:58:24PM -0300, Jason Gunthorpe wrote:
> On Fri, Oct 25, 2024 at 04:50:30PM -0700, Nicolin Chen wrote:
> > diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> > index 5fd3dd420290..e50113305a9c 100644
> > --- a/drivers/iommu/iommufd/device.c
> > +++ b/drivers/iommu/iommufd/device.c
> > @@ -277,6 +277,17 @@ EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD);
> >   */
> >  void iommufd_device_unbind(struct iommufd_device *idev)
> >  {
> > +	u32 vdev_id = 0;
> > +
> > +	/* idev->vdev object should be destroyed prior, yet just in case.. */
> > +	mutex_lock(&idev->igroup->lock);
> > +	if (idev->vdev)
> 
> Then should it have a WARN_ON here?

It'd be a user space mistake that forgot to call the destroy ioctl
to the object, in which case I recall kernel shouldn't WARN_ON?

> > +		vdev_id = idev->vdev->obj.id;
> > +	mutex_unlock(&idev->igroup->lock);
> > +	/* Relying on xa_lock against a race with iommufd_destroy() */
> > +	if (vdev_id)
> > +		iommufd_object_remove(idev->ictx, NULL, vdev_id, 0);
> 
> That doesn't seem right, iommufd_object_remove() should never be used
> to destroy an object that userspace created with an IOCTL, in fact
> that just isn't allowed.

It was for our auto destroy feature: if user space forgot to destroy
the object while trying to unplug the device from the VM, this saves
the day.

> Ugh, there is worse here, we can't hold a long term reference on a
> kernel owned object:
> 
> 	idev->vdev = vdev;
> 	refcount_inc(&idev->obj.users);
> 
> As it prevents the kernel from disconnecting it.

Hmm, mind elaborating? I think the iommufd_fops_release() would
xa_for_each the object list that destroys the vdev object first
then this idev (and viommu too)?

> I came up with this that seems like it will work. Maybe we will need
> to improve it later. Instead of using the idev, just keep the raw
> struct device. We can hold a refcount on the struct device without
> races. There is no need for the idev igroup lock since the xa_lock
> does everything we need.

OK. If user space forgot to destroy its vdev while unplugging the
device, it would not be allowed to hotplug another device (or the
same device) back to the same slot having the same RID, since the
RID on the vIOMMU would be occupied by the undestroyed vdev.

If we decide to do so, I think we should highlight this somewhere
in the doc.

Thanks
Nicolin
Jason Gunthorpe Oct. 29, 2024, 6:48 p.m. UTC | #5
On Tue, Oct 29, 2024 at 10:29:56AM -0700, Nicolin Chen wrote:
> On Tue, Oct 29, 2024 at 12:58:24PM -0300, Jason Gunthorpe wrote:
> > On Fri, Oct 25, 2024 at 04:50:30PM -0700, Nicolin Chen wrote:
> > > diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> > > index 5fd3dd420290..e50113305a9c 100644
> > > --- a/drivers/iommu/iommufd/device.c
> > > +++ b/drivers/iommu/iommufd/device.c
> > > @@ -277,6 +277,17 @@ EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD);
> > >   */
> > >  void iommufd_device_unbind(struct iommufd_device *idev)
> > >  {
> > > +	u32 vdev_id = 0;
> > > +
> > > +	/* idev->vdev object should be destroyed prior, yet just in case.. */
> > > +	mutex_lock(&idev->igroup->lock);
> > > +	if (idev->vdev)
> > 
> > Then should it have a WARN_ON here?
> 
> It'd be a user space mistake that forgot to call the destroy ioctl
> to the object, in which case I recall kernel shouldn't WARN_ON?

But you can't get here because:

 	refcount_inc(&idev->obj.users);

And kernel doesn't destroy objects with elevated ref counts?


> > > +		vdev_id = idev->vdev->obj.id;
> > > +	mutex_unlock(&idev->igroup->lock);
> > > +	/* Relying on xa_lock against a race with iommufd_destroy() */
> > > +	if (vdev_id)
> > > +		iommufd_object_remove(idev->ictx, NULL, vdev_id, 0);
> > 
> > That doesn't seem right, iommufd_object_remove() should never be used
> > to destroy an object that userspace created with an IOCTL, in fact
> > that just isn't allowed.
> 
> It was for our auto destroy feature. 

auto domains are "hidden" hwpts that are kernel managed. They are not
"userspace created".

"Userspace created" objects are ones that userspace is expected to call
destroy on.

If you destroy them behind the scenes in the kernel then the object ID
can be reallocated for something else and when userspace does DESTROY
on the ID it thought was still allocated it will malfunction.

So, only userspace can destroy objects that userspace created.

> If user space forgot to destroy the object while trying to unplug
> the device from VM. This saves the day.

No, it should/does fail destroy of the VIOMMU object because the users
refcount is elevated.


> > Ugh, there is worse here, we can't hold a long term reference on a
> > kernel owned object:
> > 
> > 	idev->vdev = vdev;
> > 	refcount_inc(&idev->obj.users);
> > 
> > As it prevents the kernel from disconnecting it.
> 
> Hmm, mind elaborating? I think the iommufd_fops_release() would
> xa_for_each the object list that destroys the vdev object first
> then this idev (and viommu too)?

iommufd_device_unbind() can't fail, and if the object can't be
destroyed because it has an elevated long term refcount it WARN's:


	ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT_SHORTTERM);

	/*
	 * If there is a bug and we couldn't destroy the object then we did put
	 * back the caller's users refcount and will eventually try to free it
	 * again during close.
	 */
	WARN_ON(ret);

So you cannot take long term references on kernel owned objects. Only
userspace owned objects.


> OK. If user space forgot to destroy its vdev while unplugging the
> device, it would not be allowed to hotplug another device (or the
> same device) back to the same slot having the same RID, since the
> RID on the vIOMMU would be occupied by the undestroyed vdev.

Yes, that seems correct and obvious to me. Until the vdev is
explicitly destroyed the ID is in-use.

Good userspace should destroy the iommufd vDEVICE object before
closing the VFIO file descriptor.

If it doesn't, then the VDEVICE object remains even though the VFIO it
was linked to is gone.

Jason
Nicolin Chen Oct. 29, 2024, 7:30 p.m. UTC | #6
On Tue, Oct 29, 2024 at 03:48:01PM -0300, Jason Gunthorpe wrote:
> On Tue, Oct 29, 2024 at 10:29:56AM -0700, Nicolin Chen wrote:
> > On Tue, Oct 29, 2024 at 12:58:24PM -0300, Jason Gunthorpe wrote:
> > > On Fri, Oct 25, 2024 at 04:50:30PM -0700, Nicolin Chen wrote:
> > > > diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> > > > index 5fd3dd420290..e50113305a9c 100644
> > > > --- a/drivers/iommu/iommufd/device.c
> > > > +++ b/drivers/iommu/iommufd/device.c
> > > > @@ -277,6 +277,17 @@ EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD);
> > > >   */
> > > >  void iommufd_device_unbind(struct iommufd_device *idev)
> > > >  {
> > > > +	u32 vdev_id = 0;
> > > > +
> > > > +	/* idev->vdev object should be destroyed prior, yet just in case.. */
> > > > +	mutex_lock(&idev->igroup->lock);
> > > > +	if (idev->vdev)
> > > 
> > > Then should it have a WARN_ON here?
> > 
> > It'd be a user space mistake that forgot to call the destroy ioctl
> > to the object, in which case I recall kernel shouldn't WARN_ON?
> 
> But you can't get here because:
> 
>  	refcount_inc(&idev->obj.users);
> 
> And kernel doesn't destroy objects with elevated ref counts?

Hmm, this is not a ->destroy() but iommufd_device_unbind called
by VFIO. And we actually ran into this routine when QEMU didn't
destroy vdev. So, I added this chunk.

The iommufd_object_remove(vdev_id) here would destroy the vdev
where its destroy() does refcount_dec(&idev->obj.users). Then,
the following iommufd_object_destroy_user(.., &idev->obj) will
succeed.

With that said, let's just mandate userspace to destroy vdev.

> > > > +		vdev_id = idev->vdev->obj.id;
> > > > +	mutex_unlock(&idev->igroup->lock);
> > > > +	/* Relying on xa_lock against a race with iommufd_destroy() */
> > > > +	if (vdev_id)
> > > > +		iommufd_object_remove(idev->ictx, NULL, vdev_id, 0);
> > > 
> > > That doesn't seem right, iommufd_object_remove() should never be used
> > > to destroy an object that userspace created with an IOCTL, in fact
> > > that just isn't allowed.
> > 
> > It was for our auto destroy feature. 
> 
> auto domains are "hidden" hwpts that are kernel managed. They are not
> "userspace created".
> 
> "Usespace created" objects are ones that userspace is expected to call
> destroy on.

OK. I misunderstood that.

> If you destroy them behind the scenes in the kerenl then the objecd ID
> can be reallocated for something else and when userspace does DESTROY
> on the ID it thought was still allocated it will malfunction.
> 
> So, only userspace can destroy objects that userspace created.

I see. That makes sense.

> > If user space forgot to destroy the object while trying to unplug
> > the device from VM. This saves the day.
> 
> No, it should/does fail destroy of the VIOMMU object because the users
> refcount is elevated.

The vIOMMU object is refcount_dec also from the unbind() calling
remove(). But anyway, we aligned that userspace should destroy it
explicitly.

> > > Ugh, there is worse here, we can't hold a long term reference on a
> > > kernel owned object:
> > > 
> > > 	idev->vdev = vdev;
> > > 	refcount_inc(&idev->obj.users);
> > > 
> > > As it prevents the kernel from disconnecting it.
> > 
> > Hmm, mind elaborating? I think the iommufd_fops_release() would
> > xa_for_each the object list that destroys the vdev object first
> > then this idev (and viommu too)?
> 
> iommufd_device_unbind() can't fail, and if the object can't be
> destroyed because it has an elevated long term refcount it WARN's:
> 
> 
> 	ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT_SHORTTERM);
> 
> 	/*
> 	 * If there is a bug and we couldn't destroy the object then we did put
> 	 * back the caller's users refcount and will eventually try to free it
> 	 * again during close.
> 	 */
> 	WARN_ON(ret);
> 
> So you cannot take long term references on kernel owned objects. Only
> userspace owned objects.

OK. I think I had got this part. Gao ran into this WARN_ON at v3,
so I added iommufd_object_remove(vdev_id) in unbind() prior to
this iommufd_object_destroy_user(idev->ictx, &idev->obj).

> > OK. If user space forgot to destroy its vdev while unplugging the
> > device, it would not be allowed to hotplug another device (or the
> > same device) back to the same slot having the same RID, since the
> > RID on the vIOMMU would be occupied by the undestroyed vdev.
> 
> Yes, that seems correct and obvious to me. Until the vdev is
> explicitly destroyed the ID is in-use.
> 
> Good userspace should destroy the iommufd vDEVICE object before
> closing the VFIO file descriptor.
> 
> If it doesn't, then the VDEVICE object remains even though the VFIO it
> was linked to is gone.

I see.

Thanks
Nicolin
Jason Gunthorpe Oct. 30, 2024, 12:08 a.m. UTC | #7
On Tue, Oct 29, 2024 at 12:30:00PM -0700, Nicolin Chen wrote:

> > iommufd_device_unbind() can't fail, and if the object can't be
> > destroyed because it has an elevated long term refcount it WARN's:
> > 
> > 
> > 	ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT_SHORTTERM);
> > 
> > 	/*
> > 	 * If there is a bug and we couldn't destroy the object then we did put
> > 	 * back the caller's users refcount and will eventually try to free it
> > 	 * again during close.
> > 	 */
> > 	WARN_ON(ret);
> > 
> > So you cannot take long term references on kernel owned objects. Only
> > userspace owned objects.
> 
> OK. I think I had got this part. Gao ran into this WARN_ON at v3,
> so I added iommufd_object_remove(vdev_id) in unbind() prior to
> this iommufd_object_destroy_user(idev->ictx, &idev->obj).

Oh I see, so the fix to that is to not take a longterm reference, not
to try to destroy a vdev.

The alternative would be to try to unlink the idev from the vdev and
leave a zombie vdev, but that didn't look so nice to implement. If we
need it we can do it later.

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 8c9ab35eaea5..365cf5a56cdf 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -391,6 +391,7 @@  struct iommufd_device {
 	struct iommufd_object obj;
 	struct iommufd_ctx *ictx;
 	struct iommufd_group *igroup;
+	struct iommufd_vdevice *vdev;
 	struct list_head group_item;
 	/* always the physical device */
 	struct device *dev;
@@ -505,8 +506,27 @@  static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
 	return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
 }
 
+static inline struct iommufd_viommu *
+iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id)
+{
+	return container_of(iommufd_get_object(ucmd->ictx, id,
+					       IOMMUFD_OBJ_VIOMMU),
+			    struct iommufd_viommu, obj);
+}
+
 int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_viommu_destroy(struct iommufd_object *obj);
+int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_vdevice_destroy(struct iommufd_object *obj);
+void iommufd_vdevice_abort(struct iommufd_object *obj);
+
+struct iommufd_vdevice {
+	struct iommufd_object obj;
+	struct iommufd_ctx *ictx;
+	struct iommufd_device *idev;
+	struct iommufd_viommu *viommu;
+	u64 id; /* per-vIOMMU virtual ID */
+};
 
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 083ceb209704..e6cd288e8b83 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -31,6 +31,7 @@  enum iommufd_object_type {
 	IOMMUFD_OBJ_ACCESS,
 	IOMMUFD_OBJ_FAULT,
 	IOMMUFD_OBJ_VIOMMU,
+	IOMMUFD_OBJ_VDEVICE,
 #ifdef CONFIG_IOMMUFD_TEST
 	IOMMUFD_OBJ_SELFTEST,
 #endif
@@ -89,6 +90,8 @@  struct iommufd_viommu {
 
 	const struct iommufd_viommu_ops *ops;
 
+	struct xarray vdevs;
+
 	unsigned int type;
 };
 
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 56c742106a45..b699ecb7aa9c 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -52,6 +52,7 @@  enum {
 	IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d,
 	IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
 	IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f,
+	IOMMUFD_CMD_VDEVICE_ALLOC = 0x90,
 };
 
 /**
@@ -896,4 +897,29 @@  struct iommu_viommu_alloc {
 	__u32 out_viommu_id;
 };
 #define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
+
+/**
+ * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC)
+ * @size: sizeof(struct iommu_vdevice_alloc)
+ * @viommu_id: vIOMMU ID to associate with the virtual device
+ * @dev_id: The pyhsical device to allocate a virtual instance on the vIOMMU
+ * @__reserved: Must be 0
+ * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID
+ *           of AMD IOMMU, and vID of a nested Intel VT-d to a Context Table.
+ * @out_vdevice_id: Output virtual instance ID for the allocated object
+ * @__reserved2: Must be 0
+ *
+ * Allocate a virtual device instance (for a physical device) against a vIOMMU.
+ * This instance holds the device's information (related to its vIOMMU) in a VM.
+ */
+struct iommu_vdevice_alloc {
+	__u32 size;
+	__u32 viommu_id;
+	__u32 dev_id;
+	__u32 __reserved;
+	__aligned_u64 virt_id;
+	__u32 out_vdevice_id;
+	__u32 __reserved2;
+};
+#define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC)
 #endif
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 5fd3dd420290..e50113305a9c 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -277,6 +277,17 @@  EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD);
  */
 void iommufd_device_unbind(struct iommufd_device *idev)
 {
+	u32 vdev_id = 0;
+
+	/* idev->vdev object should be destroyed prior, yet just in case.. */
+	mutex_lock(&idev->igroup->lock);
+	if (idev->vdev)
+		vdev_id = idev->vdev->obj.id;
+	mutex_unlock(&idev->igroup->lock);
+	/* Relying on xa_lock against a race with iommufd_destroy() */
+	if (vdev_id)
+		iommufd_object_remove(idev->ictx, NULL, vdev_id, 0);
+
 	iommufd_object_destroy_user(idev->ictx, &idev->obj);
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index ab5ee325d809..696ac9e0e74b 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -322,6 +322,7 @@  union ucmd_buffer {
 	struct iommu_option option;
 	struct iommu_vfio_ioas vfio_ioas;
 	struct iommu_viommu_alloc viommu;
+	struct iommu_vdevice_alloc vdev;
 #ifdef CONFIG_IOMMUFD_TEST
 	struct iommu_test_cmd test;
 #endif
@@ -375,6 +376,8 @@  static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 __reserved),
 	IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
 		 struct iommu_viommu_alloc, out_viommu_id),
+	IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl,
+		 struct iommu_vdevice_alloc, __reserved2),
 #ifdef CONFIG_IOMMUFD_TEST
 	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
 #endif
@@ -513,6 +516,10 @@  static const struct iommufd_object_ops iommufd_object_ops[] = {
 	[IOMMUFD_OBJ_VIOMMU] = {
 		.destroy = iommufd_viommu_destroy,
 	},
+	[IOMMUFD_OBJ_VDEVICE] = {
+		.destroy = iommufd_vdevice_destroy,
+		.abort = iommufd_vdevice_abort,
+	},
 #ifdef CONFIG_IOMMUFD_TEST
 	[IOMMUFD_OBJ_SELFTEST] = {
 		.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index eb41e15ebab1..2b9a9a80298d 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -12,6 +12,7 @@  void iommufd_viommu_destroy(struct iommufd_object *obj)
 	if (viommu->ops && viommu->ops->free)
 		viommu->ops->free(viommu);
 	refcount_dec(&viommu->hwpt->common.obj.users);
+	xa_destroy(&viommu->vdevs);
 }
 
 int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
@@ -70,6 +71,7 @@  int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	 */
 	viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);
 
+	xa_init(&viommu->vdevs);
 	refcount_inc(&viommu->hwpt->common.obj.users);
 
 	cmd->out_viommu_id = viommu->obj.id;
@@ -87,3 +89,102 @@  int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	iommufd_put_object(ucmd->ictx, &idev->obj);
 	return rc;
 }
+
+void iommufd_vdevice_abort(struct iommufd_object *obj)
+{
+	struct iommufd_vdevice *old,
+		*vdev = container_of(obj, struct iommufd_vdevice, obj);
+	struct iommufd_viommu *viommu = vdev->viommu;
+	struct iommufd_device *idev = vdev->idev;
+
+	lockdep_assert_held(&idev->igroup->lock);
+
+	old = xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
+	if (old)
+		WARN_ON(old != vdev);
+
+	refcount_dec(&viommu->obj.users);
+	refcount_dec(&idev->obj.users);
+	idev->vdev = NULL;
+}
+
+void iommufd_vdevice_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_vdevice *vdev =
+		container_of(obj, struct iommufd_vdevice, obj);
+
+	mutex_lock(&vdev->idev->igroup->lock);
+	iommufd_vdevice_abort(obj);
+	mutex_unlock(&vdev->idev->igroup->lock);
+}
+
+int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_vdevice_alloc *cmd = ucmd->cmd;
+	struct iommufd_vdevice *vdev, *curr;
+	struct iommufd_viommu *viommu;
+	struct iommufd_device *idev;
+	u64 virt_id = cmd->virt_id;
+	int rc = 0;
+
+	if (virt_id > ULONG_MAX)
+		return -EINVAL;
+
+	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+	if (IS_ERR(viommu))
+		return PTR_ERR(viommu);
+
+	idev = iommufd_get_device(ucmd, cmd->dev_id);
+	if (IS_ERR(idev)) {
+		rc = PTR_ERR(idev);
+		goto out_put_viommu;
+	}
+
+	mutex_lock(&idev->igroup->lock);
+	if (idev->vdev) {
+		rc = -EEXIST;
+		goto out_unlock_igroup;
+	}
+
+	vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE);
+	if (IS_ERR(vdev)) {
+		rc = PTR_ERR(vdev);
+		goto out_unlock_igroup;
+	}
+
+	rc = iommufd_verify_unfinalized_object(ucmd->ictx, &vdev->obj);
+	if (rc) {
+		kfree(vdev);
+		goto out_unlock_igroup;
+	}
+
+	vdev->idev = idev;
+	vdev->id = virt_id;
+	vdev->viommu = viommu;
+
+	idev->vdev = vdev;
+	refcount_inc(&idev->obj.users);
+	refcount_inc(&viommu->obj.users);
+
+	curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL);
+	if (curr) {
+		rc = xa_err(curr) ?: -EBUSY;
+		goto out_abort;
+	}
+
+	cmd->out_vdevice_id = vdev->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_abort;
+	iommufd_object_finalize(ucmd->ictx, &vdev->obj);
+	goto out_unlock_igroup;
+
+out_abort:
+	iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
+out_unlock_igroup:
+	mutex_unlock(&idev->igroup->lock);
+	iommufd_put_object(ucmd->ictx, &idev->obj);
+out_put_viommu:
+	iommufd_put_object(ucmd->ictx, &viommu->obj);
+	return rc;
+}