diff mbox series

[v3,04/11] iommufd/viommu: Add IOMMU_VIOMMU_ALLOC ioctl

Message ID 352e1701acdec6e038ccddf02227be3a1670706e.1728491453.git.nicolinc@nvidia.com (mailing list archive)
State New
Headers show
Series cover-letter: iommufd: Add vIOMMU infrastructure (Part-1) | expand

Commit Message

Nicolin Chen Oct. 9, 2024, 4:38 p.m. UTC
Add a new ioctl for user space to do a vIOMMU allocation. It must be based
on a nesting parent HWPT, so take its refcount.

As an initial version, define an IOMMU_VIOMMU_TYPE_DEFAULT type. Using it,
the object will be allocated by the iommufd core.

If an IOMMU driver supports a driver-managed vIOMMU object, it must define
its own IOMMU_VIOMMU_TYPE_ in the uAPI header and implement a viommu_alloc
op in its iommu_ops.

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/Makefile          |  3 +-
 drivers/iommu/iommufd/iommufd_private.h |  3 +
 include/uapi/linux/iommufd.h            | 40 +++++++++++
 drivers/iommu/iommufd/main.c            |  6 ++
 drivers/iommu/iommufd/viommu.c          | 91 +++++++++++++++++++++++++
 5 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 drivers/iommu/iommufd/viommu.c

Comments

Nicolin Chen Oct. 16, 2024, 6:38 a.m. UTC | #1
On Wed, Oct 09, 2024 at 09:38:04AM -0700, Nicolin Chen wrote:

> +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
[...]
> +       if (cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT) {
> +               viommu = __iommufd_viommu_alloc(ucmd->ictx, sizeof(*viommu),
> +                                               NULL);
> +       } else {
> +               if (!ops->viommu_alloc) {
> +                       rc = -EOPNOTSUPP;
> +                       goto out_put_hwpt;
> +               }
> +
> +               viommu = ops->viommu_alloc(idev->dev->iommu->iommu_dev,
> +                                          hwpt_paging->common.domain,
> +                                          ucmd->ictx, cmd->type);
> +       }
> +       if (IS_ERR(viommu)) {
> +               rc = PTR_ERR(viommu);
> +               goto out_put_hwpt;
> +       }

While reworking the vIRQ series, I found here we should verify
this driver-allocated viommu object if it is allocated properly
via the suggested API (or if it is properly init-ed as a legit
ictx object).

Likely it needs a helper doing a comparison between viommu->obj
and the returned obj of xa_load(&ictx->objects, viommu->obj.id).
And the following vDEVICE alloc needs it too.

Nicolin
Jason Gunthorpe Oct. 17, 2024, 4:40 p.m. UTC | #2
On Wed, Oct 09, 2024 at 09:38:04AM -0700, Nicolin Chen wrote:
> Add a new ioctl for user space to do a vIOMMU allocation. It must be based
> on a nesting parent HWPT, so take its refcount.
> 
> As an initial version, define an IOMMU_VIOMMU_TYPE_DEFAULT type. Using it,
> the object will be allocated by the iommufd core.
> 
> If an IOMMU driver supports a driver-managed vIOMMU object, it must define
> its own IOMMU_VIOMMU_TYPE_ in the uAPI header and implement a viommu_alloc
> op in its iommu_ops.
> 
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>  drivers/iommu/iommufd/Makefile          |  3 +-
>  drivers/iommu/iommufd/iommufd_private.h |  3 +
>  include/uapi/linux/iommufd.h            | 40 +++++++++++
>  drivers/iommu/iommufd/main.c            |  6 ++
>  drivers/iommu/iommufd/viommu.c          | 91 +++++++++++++++++++++++++
>  5 files changed, 142 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/iommu/iommufd/viommu.c

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Jason
Alexey Kardashevskiy Oct. 21, 2024, 8:11 a.m. UTC | #3
On 10/10/24 03:38, Nicolin Chen wrote:
> Add a new ioctl for user space to do a vIOMMU allocation. It must be based
> on a nesting parent HWPT, so take its refcount.
> 
> As an initial version, define an IOMMU_VIOMMU_TYPE_DEFAULT type. Using it,
> the object will be allocated by the iommufd core.
> 
> If an IOMMU driver supports a driver-managed vIOMMU object, it must define
> its own IOMMU_VIOMMU_TYPE_ in the uAPI header and implement a viommu_alloc
> op in its iommu_ops.
> 
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>   drivers/iommu/iommufd/Makefile          |  3 +-
>   drivers/iommu/iommufd/iommufd_private.h |  3 +
>   include/uapi/linux/iommufd.h            | 40 +++++++++++
>   drivers/iommu/iommufd/main.c            |  6 ++
>   drivers/iommu/iommufd/viommu.c          | 91 +++++++++++++++++++++++++
>   5 files changed, 142 insertions(+), 1 deletion(-)
>   create mode 100644 drivers/iommu/iommufd/viommu.c
> 
> diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
> index 93daedd7e5c8..288ef3e895e3 100644
> --- a/drivers/iommu/iommufd/Makefile
> +++ b/drivers/iommu/iommufd/Makefile
> @@ -7,7 +7,8 @@ iommufd-y := \
>   	ioas.o \
>   	main.o \
>   	pages.o \
> -	vfio_compat.o
> +	vfio_compat.o \
> +	viommu.o
>   
>   iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
>   
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 6a364073f699..4aefac6af23f 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -521,6 +521,9 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
>   	return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
>   }
>   
> +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
> +void iommufd_viommu_destroy(struct iommufd_object *obj);
> +
>   #ifdef CONFIG_IOMMUFD_TEST
>   int iommufd_test(struct iommufd_ucmd *ucmd);
>   void iommufd_selftest_destroy(struct iommufd_object *obj);
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index cd4920886ad0..db9008a4eeef 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -51,6 +51,7 @@ enum {
>   	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c,
>   	IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d,
>   	IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
> +	IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f,
>   };
>   
>   /**
> @@ -852,4 +853,43 @@ struct iommu_fault_alloc {
>   	__u32 out_fault_fd;
>   };
>   #define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC)
> +
> +/**
> + * enum iommu_viommu_type - Virtual IOMMU Type
> + * @IOMMU_VIOMMU_TYPE_DEFAULT: Core-managed virtual IOMMU type
> + */
> +enum iommu_viommu_type {
> +	IOMMU_VIOMMU_TYPE_DEFAULT = 0,
> +};
> +
> +/**
> + * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC)
> + * @size: sizeof(struct iommu_viommu_alloc)
> + * @flags: Must be 0
> + * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type
> + * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU
> + * @hwpt_id: ID of a nesting parent HWPT to associate to
> + * @out_viommu_id: Output virtual IOMMU ID for the allocated object
> + *
> + * Allocate a virtual IOMMU object that represents the underlying physical
> + * IOMMU's virtualization support. The vIOMMU object is a security-isolated
> + * slice of the physical IOMMU HW that is unique to a specific VM. Operations
> + * global to the IOMMU are connected to the vIOMMU, such as:
> + * - Security namespace for guest owned ID, e.g. guest-controlled cache tags
> + * - Access to a sharable nesting parent pagetable across physical IOMMUs
> + * - Virtualization of various platforms IDs, e.g. RIDs and others
> + * - Delivery of paravirtualized invalidation
> + * - Direct assigned invalidation queues
> + * - Direct assigned interrupts
> + * - Non-affiliated event reporting
> + */
> +struct iommu_viommu_alloc {
> +	__u32 size;
> +	__u32 flags;
> +	__u32 type;
> +	__u32 dev_id;
> +	__u32 hwpt_id;
> +	__u32 out_viommu_id;
> +};
> +#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
>   #endif
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 92bd075108e5..cbd0a80b2d67 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -301,6 +301,7 @@ union ucmd_buffer {
>   	struct iommu_ioas_unmap unmap;
>   	struct iommu_option option;
>   	struct iommu_vfio_ioas vfio_ioas;
> +	struct iommu_viommu_alloc viommu;
>   #ifdef CONFIG_IOMMUFD_TEST
>   	struct iommu_test_cmd test;
>   #endif
> @@ -352,6 +353,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
>   		 val64),
>   	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
>   		 __reserved),
> +	IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
> +		 struct iommu_viommu_alloc, out_viommu_id),
>   #ifdef CONFIG_IOMMUFD_TEST
>   	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
>   #endif
> @@ -487,6 +490,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
>   	[IOMMUFD_OBJ_FAULT] = {
>   		.destroy = iommufd_fault_destroy,
>   	},
> +	[IOMMUFD_OBJ_VIOMMU] = {
> +		.destroy = iommufd_viommu_destroy,
> +	},
>   #ifdef CONFIG_IOMMUFD_TEST
>   	[IOMMUFD_OBJ_SELFTEST] = {
>   		.destroy = iommufd_selftest_destroy,
> diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
> new file mode 100644
> index 000000000000..3a903baeee6a
> --- /dev/null
> +++ b/drivers/iommu/iommufd/viommu.c
> @@ -0,0 +1,91 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
> + */
> +
> +#include "iommufd_private.h"
> +
> +void iommufd_viommu_destroy(struct iommufd_object *obj)
> +{
> +	struct iommufd_viommu *viommu =
> +		container_of(obj, struct iommufd_viommu, obj);
> +
> +	if (viommu->ops && viommu->ops->free)
> +		viommu->ops->free(viommu);
> +	refcount_dec(&viommu->hwpt->common.obj.users);
> +}
> +
> +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
> +{
> +	struct iommu_viommu_alloc *cmd = ucmd->cmd;
> +	struct iommufd_hwpt_paging *hwpt_paging;
> +	struct iommufd_viommu *viommu;
> +	struct iommufd_device *idev;
> +	const struct iommu_ops *ops;
> +	int rc;
> +
> +	if (cmd->flags)
> +		return -EOPNOTSUPP;
> +
> +	idev = iommufd_get_device(ucmd, cmd->dev_id);
> +	if (IS_ERR(idev))
> +		return PTR_ERR(idev);
> +	ops = dev_iommu_ops(idev->dev);
> +
> +	hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
> +	if (IS_ERR(hwpt_paging)) {
> +		rc = PTR_ERR(hwpt_paging);


iommufd_get_hwpt_paging() is container_of() which does not test for the 
value and just does a simple math so the actual error value from 
iommufd_get_object() is ... lost?

Oh well, not really lost, as "obj" and "common.obj" seem to be forced to 
be at the beginning of iommufd object structs so container_of() is no-op 
or is not it?


> +		goto out_put_idev;
> +	}
> +
> +	if (!hwpt_paging->nest_parent) {
> +		rc = -EINVAL;
> +		goto out_put_hwpt;
> +	}
> +
> +	if (cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT) {
> +		viommu = __iommufd_viommu_alloc(ucmd->ictx, sizeof(*viommu),
> +						NULL);
> +	} else {
> +		if (!ops->viommu_alloc) {
> +			rc = -EOPNOTSUPP;
> +			goto out_put_hwpt;
> +		}
> +
> +		viommu = ops->viommu_alloc(idev->dev->iommu->iommu_dev,
> +					   hwpt_paging->common.domain,
> +					   ucmd->ictx, cmd->type);
> +	}
> +	if (IS_ERR(viommu)) {
> +		rc = PTR_ERR(viommu);
> +		goto out_put_hwpt;
> +	}
> +
> +	viommu->type = cmd->type;
> +	viommu->ictx = ucmd->ictx;
> +	viommu->hwpt = hwpt_paging;
> +
> +	/*
> +	 * A real physical IOMMU instance would unlikely get unplugged, so the
> +	 * life cycle of this iommu_dev is guaranteed to stay alive, mostly. A
> +	 * pluggable IOMMU instance (if exists) is responsible for refcounting
> +	 * on its own.


"Assume IOMMUs are unpluggable (the most likely case)" would do imho, 
all these "unlikely" and "mostly" and "if exists" confuse.

If something needs to be guaranteed to stay alive, may be just call 
device_get(viommu->dev), with the comment above? Or it is some different 
refcounting which is missing? Thanks,


> +	 */
> +	viommu->iommu_dev = idev->dev->iommu->iommu_dev;
> +
> +	refcount_inc(&viommu->hwpt->common.obj.users);
> +
> +	cmd->out_viommu_id = viommu->obj.id;
> +	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
> +	if (rc)
> +		goto out_abort;
> +	iommufd_object_finalize(ucmd->ictx, &viommu->obj);
> +	goto out_put_hwpt;
> +
> +out_abort:
> +	iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj);
> +out_put_hwpt:
> +	iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj);
> +out_put_idev:
> +	iommufd_put_object(ucmd->ictx, &idev->obj);
> +	return rc;
> +}
Jason Gunthorpe Oct. 21, 2024, 12:19 p.m. UTC | #4
On Mon, Oct 21, 2024 at 07:11:47PM +1100, Alexey Kardashevskiy wrote:
> > +	hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
> > +	if (IS_ERR(hwpt_paging)) {
> > +		rc = PTR_ERR(hwpt_paging);
> 
> 
> iommufd_get_hwpt_paging() is container_of() which does not test for the
> value and just does a simple math so the actual error value from
> iommufd_get_object() is ... lost?
> 
> Oh well, not really lost, as "obj" and "common.obj" seem to be forced to be
> at the beginning of iommufd object structs so container_of() is no-op or is
> not it?

Yes, it is just a type check, the revised code I suggested doesn't
rely on this. Jonathon remarked on the same for fwctl and we came up
with something different.

> If something needs to be guaranteed to stay alive, may be just call
> device_get(viommu->dev), with the comment above? Or it is some different
> refcounting which is missing? Thanks,

device_get is different, it just makes sure the struct device memory
is around, it doesn't prevent iommu unplug

Jason
Nicolin Chen Oct. 21, 2024, 11:41 p.m. UTC | #5
On Mon, Oct 21, 2024 at 07:11:47PM +1100, Alexey Kardashevskiy wrote:
> > +     /*
> > +      * A real physical IOMMU instance would unlikely get unplugged, so the
> > +      * life cycle of this iommu_dev is guaranteed to stay alive, mostly. A
> > +      * pluggable IOMMU instance (if exists) is responsible for refcounting
> > +      * on its own.
> 
> "Assume IOMMUs are unpluggable (the most likely case)" would do imho,
> all these "unlikely" and "mostly" and "if exists" confuse.

Done.

-----------------------------------------------------------------
@@ -63,13 +63,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
        viommu->type = cmd->type;
        viommu->ictx = ucmd->ictx;
        viommu->hwpt = hwpt_paging;
-
-       /*
-        * A real physical IOMMU instance would unlikely get unplugged, so the
-        * life cycle of this iommu_dev is guaranteed to stay alive, mostly. A
-        * pluggable IOMMU instance (if exists) is responsible for refcounting
-        * on its own.
-        */
+       /* Assume physical IOMMUs are unpluggable (the most likely case) */
        viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);

        refcount_inc(&viommu->hwpt->common.obj.users);
-----------------------------------------------------------------

Thanks
Nicolin
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 93daedd7e5c8..288ef3e895e3 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -7,7 +7,8 @@  iommufd-y := \
 	ioas.o \
 	main.o \
 	pages.o \
-	vfio_compat.o
+	vfio_compat.o \
+	viommu.o
 
 iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
 
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 6a364073f699..4aefac6af23f 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -521,6 +521,9 @@  static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
 	return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
 }
 
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_viommu_destroy(struct iommufd_object *obj);
+
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
 void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index cd4920886ad0..db9008a4eeef 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -51,6 +51,7 @@  enum {
 	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c,
 	IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d,
 	IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
+	IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f,
 };
 
 /**
@@ -852,4 +853,43 @@  struct iommu_fault_alloc {
 	__u32 out_fault_fd;
 };
 #define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC)
+
+/**
+ * enum iommu_viommu_type - Virtual IOMMU Type
+ * @IOMMU_VIOMMU_TYPE_DEFAULT: Core-managed virtual IOMMU type
+ */
+enum iommu_viommu_type {
+	IOMMU_VIOMMU_TYPE_DEFAULT = 0,
+};
+
+/**
+ * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC)
+ * @size: sizeof(struct iommu_viommu_alloc)
+ * @flags: Must be 0
+ * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type
+ * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU
+ * @hwpt_id: ID of a nesting parent HWPT to associate to
+ * @out_viommu_id: Output virtual IOMMU ID for the allocated object
+ *
+ * Allocate a virtual IOMMU object that represents the underlying physical
+ * IOMMU's virtualization support. The vIOMMU object is a security-isolated
+ * slice of the physical IOMMU HW that is unique to a specific VM. Operations
+ * global to the IOMMU are connected to the vIOMMU, such as:
+ * - Security namespace for guest owned ID, e.g. guest-controlled cache tags
+ * - Access to a sharable nesting parent pagetable across physical IOMMUs
+ * - Virtualization of various platforms IDs, e.g. RIDs and others
+ * - Delivery of paravirtualized invalidation
+ * - Direct assigned invalidation queues
+ * - Direct assigned interrupts
+ * - Non-affiliated event reporting
+ */
+struct iommu_viommu_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 type;
+	__u32 dev_id;
+	__u32 hwpt_id;
+	__u32 out_viommu_id;
+};
+#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
 #endif
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 92bd075108e5..cbd0a80b2d67 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -301,6 +301,7 @@  union ucmd_buffer {
 	struct iommu_ioas_unmap unmap;
 	struct iommu_option option;
 	struct iommu_vfio_ioas vfio_ioas;
+	struct iommu_viommu_alloc viommu;
 #ifdef CONFIG_IOMMUFD_TEST
 	struct iommu_test_cmd test;
 #endif
@@ -352,6 +353,8 @@  static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 val64),
 	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
 		 __reserved),
+	IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
+		 struct iommu_viommu_alloc, out_viommu_id),
 #ifdef CONFIG_IOMMUFD_TEST
 	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
 #endif
@@ -487,6 +490,9 @@  static const struct iommufd_object_ops iommufd_object_ops[] = {
 	[IOMMUFD_OBJ_FAULT] = {
 		.destroy = iommufd_fault_destroy,
 	},
+	[IOMMUFD_OBJ_VIOMMU] = {
+		.destroy = iommufd_viommu_destroy,
+	},
 #ifdef CONFIG_IOMMUFD_TEST
 	[IOMMUFD_OBJ_SELFTEST] = {
 		.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
new file mode 100644
index 000000000000..3a903baeee6a
--- /dev/null
+++ b/drivers/iommu/iommufd/viommu.c
@@ -0,0 +1,91 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+
+#include "iommufd_private.h"
+
+void iommufd_viommu_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_viommu *viommu =
+		container_of(obj, struct iommufd_viommu, obj);
+
+	if (viommu->ops && viommu->ops->free)
+		viommu->ops->free(viommu);
+	refcount_dec(&viommu->hwpt->common.obj.users);
+}
+
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_viommu_alloc *cmd = ucmd->cmd;
+	struct iommufd_hwpt_paging *hwpt_paging;
+	struct iommufd_viommu *viommu;
+	struct iommufd_device *idev;
+	const struct iommu_ops *ops;
+	int rc;
+
+	if (cmd->flags)
+		return -EOPNOTSUPP;
+
+	idev = iommufd_get_device(ucmd, cmd->dev_id);
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+	ops = dev_iommu_ops(idev->dev);
+
+	hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+	if (IS_ERR(hwpt_paging)) {
+		rc = PTR_ERR(hwpt_paging);
+		goto out_put_idev;
+	}
+
+	if (!hwpt_paging->nest_parent) {
+		rc = -EINVAL;
+		goto out_put_hwpt;
+	}
+
+	if (cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT) {
+		viommu = __iommufd_viommu_alloc(ucmd->ictx, sizeof(*viommu),
+						NULL);
+	} else {
+		if (!ops->viommu_alloc) {
+			rc = -EOPNOTSUPP;
+			goto out_put_hwpt;
+		}
+
+		viommu = ops->viommu_alloc(idev->dev->iommu->iommu_dev,
+					   hwpt_paging->common.domain,
+					   ucmd->ictx, cmd->type);
+	}
+	if (IS_ERR(viommu)) {
+		rc = PTR_ERR(viommu);
+		goto out_put_hwpt;
+	}
+
+	viommu->type = cmd->type;
+	viommu->ictx = ucmd->ictx;
+	viommu->hwpt = hwpt_paging;
+
+	/*
+	 * A real physical IOMMU instance would unlikely get unplugged, so the
+	 * life cycle of this iommu_dev is guaranteed to stay alive, mostly. A
+	 * pluggable IOMMU instance (if exists) is responsible for refcounting
+	 * on its own.
+	 */
+	viommu->iommu_dev = idev->dev->iommu->iommu_dev;
+
+	refcount_inc(&viommu->hwpt->common.obj.users);
+
+	cmd->out_viommu_id = viommu->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_abort;
+	iommufd_object_finalize(ucmd->ictx, &viommu->obj);
+	goto out_put_hwpt;
+
+out_abort:
+	iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj);
+out_put_hwpt:
+	iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj);
+out_put_idev:
+	iommufd_put_object(ucmd->ictx, &idev->obj);
+	return rc;
+}