
vfio: Remove VFIO_TYPE1_NESTING_IOMMU

Message ID 0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com (mailing list archive)
State New, archived
Series vfio: Remove VFIO_TYPE1_NESTING_IOMMU

Commit Message

Jason Gunthorpe May 10, 2022, 4:55 p.m. UTC
This control causes the ARM SMMU drivers to choose a stage 2
implementation for the IO pagetable (vs the usual stage 1 default);
however, this choice has no visible impact on the VFIO user. Further,
qemu never implemented this and no other userspace user is known.

The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
SMMU translation services to the guest operating system"; however, the
rest of the API to set the guest table pointer for the stage 1 was never
completed, or at least never upstreamed, rendering this part useless
dead code.

Since the current patches to enable nested translation, aka userspace page
tables, rely on iommufd and will not use the enable_nesting()
iommu_domain_op, remove this infrastructure. However, don't cut too deep
into the SMMU drivers for now, expecting the iommufd work to pick it up -
we still need to create S2 IO page tables.

Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
enable_nesting iommu_domain_op.

Just in case there is some userspace using this, continue to treat
requesting it as a NOP, but do not advertise support any more.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 16 ----------------
 drivers/iommu/arm/arm-smmu/arm-smmu.c       | 16 ----------------
 drivers/iommu/iommu.c                       | 10 ----------
 drivers/vfio/vfio_iommu_type1.c             | 12 +-----------
 include/linux/iommu.h                       |  3 ---
 include/uapi/linux/vfio.h                   |  2 +-
 6 files changed, 2 insertions(+), 57 deletions(-)

It would probably make sense for this to go through the VFIO tree with Robin's
ack for the SMMU changes.
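
For context, the retired uAPI was exercised through the standard
container setup ioctls. Below is a minimal sketch of the old flow,
written against the pre-patch constant name; 'container' is an open
/dev/vfio/vfio fd and error handling is omitted:

    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Sketch of the retired flow. After this patch the extension is no
     * longer advertised, so the check below returns 0, while passing the
     * old type value to VFIO_SET_IOMMU is still accepted and behaves
     * exactly like VFIO_TYPE1v2_IOMMU.
     */
    static void request_nesting(int container)
    {
            if (ioctl(container, VFIO_CHECK_EXTENSION,
                      VFIO_TYPE1_NESTING_IOMMU) > 0)
                    ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_NESTING_IOMMU);
    }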


base-commit: c5eb0a61238dd6faf37f58c9ce61c9980aaffd7a

Comments

Robin Murphy May 10, 2022, 5:52 p.m. UTC | #1
On 2022-05-10 17:55, Jason Gunthorpe via iommu wrote:
> This control causes the ARM SMMU drivers to choose a stage 2
> implementation for the IO pagetable (vs the stage 1 usual default),
> however this choice has no visible impact to the VFIO user. Further qemu
> never implemented this and no other userspace user is known.
> 
> The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
> new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
> SMMU translation services to the guest operating system" however the rest
> of the API to set the guest table pointer for the stage 1 was never
> completed, or at least never upstreamed, rendering this part useless dead
> code.
> 
> Since the current patches to enable nested translation, aka userspace page
> tables, rely on iommufd and will not use the enable_nesting()
> iommu_domain_op, remove this infrastructure. However, don't cut too deep
> into the SMMU drivers for now expecting the iommufd work to pick it up -
> we still need to create S2 IO page tables.
> 
> Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
> enable_nesting iommu_domain_op.
> 
> Just in-case there is some userspace using this continue to treat
> requesting it as a NOP, but do not advertise support any more.

I assume the nested translation/guest SVA patches that Eric and Vivek 
were working on pre-IOMMUFD made use of this, and given that they got 
quite far along, I wouldn't be too surprised if some eager cloud vendors 
might have even deployed something based on the patches off the list. I 
can't help feeling a little wary about removing this until IOMMUFD can 
actually offer a functional replacement - is it in the way of anything 
upcoming?

Robin.

Jason Gunthorpe May 10, 2022, 6:13 p.m. UTC | #2
On Tue, May 10, 2022 at 06:52:06PM +0100, Robin Murphy wrote:
> On 2022-05-10 17:55, Jason Gunthorpe via iommu wrote:
> > This control causes the ARM SMMU drivers to choose a stage 2
> > implementation for the IO pagetable (vs the stage 1 usual default),
> > however this choice has no visible impact to the VFIO user. Further qemu
> > never implemented this and no other userspace user is known.
> > 
> > The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
> > new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
> > SMMU translation services to the guest operating system" however the rest
> > of the API to set the guest table pointer for the stage 1 was never
> > completed, or at least never upstreamed, rendering this part useless dead
> > code.
> > 
> > Since the current patches to enable nested translation, aka userspace page
> > tables, rely on iommufd and will not use the enable_nesting()
> > iommu_domain_op, remove this infrastructure. However, don't cut too deep
> > into the SMMU drivers for now expecting the iommufd work to pick it up -
> > we still need to create S2 IO page tables.
> > 
> > Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
> > enable_nesting iommu_domain_op.
> > 
> > Just in-case there is some userspace using this continue to treat
> > requesting it as a NOP, but do not advertise support any more.
> 
> I assume the nested translation/guest SVA patches that Eric and Vivek were
> working on pre-IOMMUFD made use of this, and given that they got quite far
> along, I wouldn't be too surprised if some eager cloud vendors might have
> even deployed something based on the patches off the list. 

With upstream there is no way to make use of this flag; if someone is
using it they have other out-of-tree kernel, vfio, kvm and qemu
patches to make it all work.

You can see how much is still needed in Eric's tree:

https://github.com/eauger/linux/commits/v5.15-rc7-nested-v16

> I can't help feeling a little wary about removing this until IOMMUFD
> can actually offer a functional replacement - is it in the way of
> anything upcoming?

From an upstream perspective, if someone has a patched kernel to
complete the feature then they can patch this part in as well; we
should not carry dead code like this in the kernel and in the uapi.

It is not directly in the way, but this needs to get done at some
point, and I'd rather just get it out of the way.

Thanks,
Jason
Nicolin Chen May 10, 2022, 6:31 p.m. UTC | #3
On Tue, May 10, 2022 at 01:55:24PM -0300, Jason Gunthorpe wrote:
> This control causes the ARM SMMU drivers to choose a stage 2
> implementation for the IO pagetable (vs the stage 1 usual default),
> however this choice has no visible impact to the VFIO user. Further qemu
> never implemented this and no other userspace user is known.
> 
> The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
> new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
> SMMU translation services to the guest operating system" however the rest
> of the API to set the guest table pointer for the stage 1 was never
> completed, or at least never upstreamed, rendering this part useless dead
> code.
> 
> Since the current patches to enable nested translation, aka userspace page
> tables, rely on iommufd and will not use the enable_nesting()
> iommu_domain_op, remove this infrastructure. However, don't cut too deep
> into the SMMU drivers for now expecting the iommufd work to pick it up -
> we still need to create S2 IO page tables.
> 
> Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
> enable_nesting iommu_domain_op.
> 
> Just in-case there is some userspace using this continue to treat
> requesting it as a NOP, but do not advertise support any more.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

Sanity-tested with qemu-system-aarch64 using "iommu=nested-smmuv3"
(unmerged feature) and "iommu=none" strings on top of vfio next.
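
For reference, that corresponds to invocations along these lines (the
nested-smmuv3 value exists only in the unmerged feature branch, so this
is purely a sketch):

    qemu-system-aarch64 -machine virt,iommu=nested-smmuv3 ...
    qemu-system-aarch64 -machine virt,iommu=none ...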

Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Tian, Kevin May 12, 2022, 5:04 a.m. UTC | #4
> From: Jason Gunthorpe
> Sent: Wednesday, May 11, 2022 12:55 AM
> 
> This control causes the ARM SMMU drivers to choose a stage 2
> implementation for the IO pagetable (vs the stage 1 usual default),
> however this choice has no visible impact to the VFIO user. Further qemu
> never implemented this and no other userspace user is known.
> 
> The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
> new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
> SMMU translation services to the guest operating system" however the rest
> of the API to set the guest table pointer for the stage 1 was never
> completed, or at least never upstreamed, rendering this part useless dead
> code.
> 
> Since the current patches to enable nested translation, aka userspace page
> tables, rely on iommufd and will not use the enable_nesting()
> iommu_domain_op, remove this infrastructure. However, don't cut too deep
> into the SMMU drivers for now expecting the iommufd work to pick it up -
> we still need to create S2 IO page tables.
> 
> Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
> enable_nesting iommu_domain_op.
> 
> Just in-case there is some userspace using this continue to treat
> requesting it as a NOP, but do not advertise support any more.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

Reviewed-by: Kevin Tian <kevin.tian@intel.com>

Zhangfei Gao May 12, 2022, 7:07 a.m. UTC | #5
Hi, Jason

On 2022/5/11 2:13 AM, Jason Gunthorpe via iommu wrote:
> On Tue, May 10, 2022 at 06:52:06PM +0100, Robin Murphy wrote:
>> On 2022-05-10 17:55, Jason Gunthorpe via iommu wrote:
>>> This control causes the ARM SMMU drivers to choose a stage 2
>>> implementation for the IO pagetable (vs the stage 1 usual default),
>>> however this choice has no visible impact to the VFIO user. Further qemu
>>> never implemented this and no other userspace user is known.
>>>
>>> The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
>>> new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
>>> SMMU translation services to the guest operating system" however the rest
>>> of the API to set the guest table pointer for the stage 1 was never
>>> completed, or at least never upstreamed, rendering this part useless dead
>>> code.
>>>
>>> Since the current patches to enable nested translation, aka userspace page
>>> tables, rely on iommufd and will not use the enable_nesting()
>>> iommu_domain_op, remove this infrastructure. However, don't cut too deep
>>> into the SMMU drivers for now expecting the iommufd work to pick it up -
>>> we still need to create S2 IO page tables.
>>>
>>> Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
>>> enable_nesting iommu_domain_op.
>>>
>>> Just in-case there is some userspace using this continue to treat
>>> requesting it as a NOP, but do not advertise support any more.
>> I assume the nested translation/guest SVA patches that Eric and Vivek were
>> working on pre-IOMMUFD made use of this, and given that they got quite far
>> along, I wouldn't be too surprised if some eager cloud vendors might have
>> even deployed something based on the patches off the list.
> With upstream there is no way to make use of this flag, if someone is
> using it they have other out of tree kernel, vfio, kvm and qemu
> patches to make it all work.
>
> You can see how much is still needed in Eric's tree:
>
> https://github.com/eauger/linux/commits/v5.15-rc7-nested-v16
>
>> I can't help feeling a little wary about removing this until IOMMUFD
>> can actually offer a functional replacement - is it in the way of
>> anything upcoming?
>  From an upstream perspective if someone has a patched kernel to
> complete the feature, then they can patch this part in as well, we
> should not carry dead code like this in the kernel and in the uapi.
>
> It is not directly in the way, but this needs to get done at some
> point, I'd rather just get it out of the way.

We are using this interface for nested mode.

Is the replacement patch ready?
Can we hold off removing this until the new one is submitted?

Thanks
Jason Gunthorpe May 12, 2022, 11:32 a.m. UTC | #6
On Thu, May 12, 2022 at 03:07:09PM +0800, Zhangfei Gao wrote:
> > > I can't help feeling a little wary about removing this until IOMMUFD
> > > can actually offer a functional replacement - is it in the way of
> > > anything upcoming?
> >  From an upstream perspective if someone has a patched kernel to
> > complete the feature, then they can patch this part in as well, we
> > should not carry dead code like this in the kernel and in the uapi.
> > 
> > It is not directly in the way, but this needs to get done at some
> > point, I'd rather just get it out of the way.
> 
> We are using this interface for nested mode.

How are you using it? It doesn't do anything. Do you have out of tree
patches as well?

Jason
zhangfei.gao@foxmail.com May 12, 2022, 11:57 a.m. UTC | #7
On 2022/5/12 7:32 PM, Jason Gunthorpe via iommu wrote:
> On Thu, May 12, 2022 at 03:07:09PM +0800, Zhangfei Gao wrote:
>>>> I can't help feeling a little wary about removing this until IOMMUFD
>>>> can actually offer a functional replacement - is it in the way of
>>>> anything upcoming?
>>>   From an upstream perspective if someone has a patched kernel to
>>> complete the feature, then they can patch this part in as well, we
>>> should not carry dead code like this in the kernel and in the uapi.
>>>
>>> It is not directly in the way, but this needs to get done at some
>>> point, I'd rather just get it out of the way.
>> We are using this interface for nested mode.
> How are you using it? It doesn't do anything. Do you have out of tree
> patches as well?

Yes, there are some patches out of tree, since they have been pending
for almost a year.

By the way, I am trying to test nesting mode based on iommufd, which
still requires iommu_enable_nesting, and that is removed in this patch
as well.

So can we wait for the alternative patch before removing these?

Thanks
Jason Gunthorpe May 12, 2022, 11:59 a.m. UTC | #8
On Thu, May 12, 2022 at 07:57:26PM +0800, zhangfei.gao@foxmail.com wrote:
> 
> 
> On 2022/5/12 7:32 PM, Jason Gunthorpe via iommu wrote:
> > On Thu, May 12, 2022 at 03:07:09PM +0800, Zhangfei Gao wrote:
> > > > > I can't help feeling a little wary about removing this until IOMMUFD
> > > > > can actually offer a functional replacement - is it in the way of
> > > > > anything upcoming?
> > > >   From an upstream perspective if someone has a patched kernel to
> > > > complete the feature, then they can patch this part in as well, we
> > > > should not carry dead code like this in the kernel and in the uapi.
> > > > 
> > > > It is not directly in the way, but this needs to get done at some
> > > > point, I'd rather just get it out of the way.
> > > We are using this interface for nested mode.
> > How are you using it? It doesn't do anything. Do you have out of tree
> > patches as well?
> 
> Yes, there are some patches out of tree, since they are pending for almost
> one year.
> 
> By the way, I am trying to test nesting mode based on iommufd, still
> requires iommu_enable_nesting,
> which is removed in this patch as well.

This is temporary.

> So can we wait for alternative patch then remove them?

Do you see a problem with patching this along with all the other
patches you need?

Jason
zhangfei.gao@foxmail.com May 12, 2022, 12:07 p.m. UTC | #9
On 2022/5/12 7:59 PM, Jason Gunthorpe wrote:
> On Thu, May 12, 2022 at 07:57:26PM +0800, zhangfei.gao@foxmail.com wrote:
>>
>> On 2022/5/12 7:32 PM, Jason Gunthorpe via iommu wrote:
>>> On Thu, May 12, 2022 at 03:07:09PM +0800, Zhangfei Gao wrote:
>>>>>> I can't help feeling a little wary about removing this until IOMMUFD
>>>>>> can actually offer a functional replacement - is it in the way of
>>>>>> anything upcoming?
>>>>>    From an upstream perspective if someone has a patched kernel to
>>>>> complete the feature, then they can patch this part in as well, we
>>>>> should not carry dead code like this in the kernel and in the uapi.
>>>>>
>>>>> It is not directly in the way, but this needs to get done at some
>>>>> point, I'd rather just get it out of the way.
>>>> We are using this interface for nested mode.
>>> How are you using it? It doesn't do anything. Do you have out of tree
>>> patches as well?
>> Yes, there are some patches out of tree, since they are pending for almost
>> one year.
>>
>> By the way, I am trying to test nesting mode based on iommufd, still
>> requires iommu_enable_nesting,
>> which is removed in this patch as well.
> This is temporary.
>
>> So can we wait for alternative patch then remove them?
> Do you see a problem with patching this along with all the other
> patches you need?
Not yet.

Currently I am using two workarounds, and they basically work.

1. Hard-coding nesting mode on, in both the kernel and qemu.
Will revisit this later.

2. Adding a VFIOIOASHwpt *hwpt to VFIOIOMMUFDContainer,
and saving the hwpt in vfio_device_attach_container.

However, VFIOIOMMUFDContainer now has an hwpt_list;
I have asked Yi about this in another mail.
Thanks
Eric Auger May 12, 2022, 5:27 p.m. UTC | #10
Hi,

On 5/10/22 20:13, Jason Gunthorpe wrote:
> On Tue, May 10, 2022 at 06:52:06PM +0100, Robin Murphy wrote:
>> On 2022-05-10 17:55, Jason Gunthorpe via iommu wrote:
>>> This control causes the ARM SMMU drivers to choose a stage 2
>>> implementation for the IO pagetable (vs the stage 1 usual default),
>>> however this choice has no visible impact to the VFIO user. Further qemu
>>> never implemented this and no other userspace user is known.
>>>
>>> The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
>>> new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
>>> SMMU translation services to the guest operating system" however the rest
>>> of the API to set the guest table pointer for the stage 1 was never
>>> completed, or at least never upstreamed, rendering this part useless dead
>>> code.
>>>
>>> Since the current patches to enable nested translation, aka userspace page
>>> tables, rely on iommufd and will not use the enable_nesting()
>>> iommu_domain_op, remove this infrastructure. However, don't cut too deep
>>> into the SMMU drivers for now expecting the iommufd work to pick it up -
>>> we still need to create S2 IO page tables.
>>>
>>> Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
>>> enable_nesting iommu_domain_op.
>>>
>>> Just in-case there is some userspace using this continue to treat
>>> requesting it as a NOP, but do not advertise support any more.
>> I assume the nested translation/guest SVA patches that Eric and Vivek were
>> working on pre-IOMMUFD made use of this, and given that they got quite far
>> along, I wouldn't be too surprised if some eager cloud vendors might have
>> even deployed something based on the patches off the list. 

thank you Robin for the heads up.
> With upstream there is no way to make use of this flag, if someone is
> using it they have other out of tree kernel, vfio, kvm and qemu
> patches to make it all work.
>
> You can see how much is still needed in Eric's tree:
>
> https://github.com/eauger/linux/commits/v5.15-rc7-nested-v16
>
>> I can't help feeling a little wary about removing this until IOMMUFD
>> can actually offer a functional replacement - is it in the way of
>> anything upcoming?
> From an upstream perspective if someone has a patched kernel to
> complete the feature, then they can patch this part in as well, we
> should not carry dead code like this in the kernel and in the uapi.

On the other hand, the code has been in the kernel for 8 years now; I
think we could wait some additional weeks/months until the iommufd
nested integration arrives and gets tested.

Thanks

Eric
Christoph Hellwig May 16, 2022, 6:39 a.m. UTC | #11
Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

we really should not keep dead code like this around.
Alex Williamson May 17, 2022, 8:26 p.m. UTC | #12
On Tue, 10 May 2022 13:55:24 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> This control causes the ARM SMMU drivers to choose a stage 2
> implementation for the IO pagetable (vs the stage 1 usual default),
> however this choice has no visible impact to the VFIO user. Further qemu
> never implemented this and no other userspace user is known.
> 
> The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
> new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
> SMMU translation services to the guest operating system" however the rest
> of the API to set the guest table pointer for the stage 1 was never
> completed, or at least never upstreamed, rendering this part useless dead
> code.
> 
> Since the current patches to enable nested translation, aka userspace page
> tables, rely on iommufd and will not use the enable_nesting()
> iommu_domain_op, remove this infrastructure. However, don't cut too deep
> into the SMMU drivers for now expecting the iommufd work to pick it up -
> we still need to create S2 IO page tables.
> 
> Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
> enable_nesting iommu_domain_op.
> 
> Just in-case there is some userspace using this continue to treat
> requesting it as a NOP, but do not advertise support any more.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 16 ----------------
>  drivers/iommu/arm/arm-smmu/arm-smmu.c       | 16 ----------------
>  drivers/iommu/iommu.c                       | 10 ----------
>  drivers/vfio/vfio_iommu_type1.c             | 12 +-----------
>  include/linux/iommu.h                       |  3 ---
>  include/uapi/linux/vfio.h                   |  2 +-
>  6 files changed, 2 insertions(+), 57 deletions(-)
> 
> It would probably make sense for this to go through the VFIO tree with Robin's
> ack for the SMMU changes.

I'd be in favor of applying this, but it seems Robin and Eric are
looking for a stay of execution and I'd also be looking for an ack from
Joerg.  Thanks,

Alex
Joerg Roedel May 20, 2022, 7:38 a.m. UTC | #13
On Tue, May 17, 2022 at 02:26:56PM -0600, Alex Williamson wrote:
> I'd be in favor of applying this, but it seems Robin and Eric are
> looking for a stay of execution and I'd also be looking for an ack from
> Joerg.  Thanks,

This is mainly an ARM-SMMU thing, so I defer my ack to Will and Robin.

Regards,

	Joerg
Robin Murphy May 20, 2022, 11:02 a.m. UTC | #14
On 2022-05-10 17:55, Jason Gunthorpe via iommu wrote:
> This control causes the ARM SMMU drivers to choose a stage 2
> implementation for the IO pagetable (vs the stage 1 usual default),
> however this choice has no visible impact to the VFIO user.

Oh, I should have read more carefully... this isn't entirely true. Stage 
2 has a different permission model from stage 1, so although it's 
arguably undocumented behaviour, VFIO users that know enough about the 
underlying system could use this to get write-only mappings if they so wish.
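
For the curious, a minimal sketch of such a write-only request through
the type1 uAPI (buf/iova/size are placeholders and error handling is
omitted; stage 2 can honour the missing read permission via its S2AP
bits, whereas stage 1 always implies read):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Illustrative only: map 'buf' at 'iova' as write-only. Whether the
     * IOMMU can actually honour the absent read permission depends on
     * the pagetable format, which is the point being made above.
     */
    static int map_write_only(int container, void *buf, __u64 iova, __u64 size)
    {
            struct vfio_iommu_type1_dma_map map;

            memset(&map, 0, sizeof(map));
            map.argsz = sizeof(map);
            map.flags = VFIO_DMA_MAP_FLAG_WRITE;    /* no READ flag */
            map.vaddr = (__u64)(unsigned long)buf;
            map.iova = iova;
            map.size = size;

            return ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
    }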

There may potentially also be visible differences in translation 
performance between stages, although I imagine that's firmly over in the 
niche of things that users might look at for system validation purposes, 
rather than for practical usefulness.

> Further qemu
> never implemented this and no other userspace user is known.
> 
> The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
> new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
> SMMU translation services to the guest operating system" however the rest
> of the API to set the guest table pointer for the stage 1 was never
> completed, or at least never upstreamed, rendering this part useless dead
> code.
> 
> Since the current patches to enable nested translation, aka userspace page
> tables, rely on iommufd and will not use the enable_nesting()
> iommu_domain_op, remove this infrastructure. However, don't cut too deep
> into the SMMU drivers for now expecting the iommufd work to pick it up -
> we still need to create S2 IO page tables.
> 
> Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
> enable_nesting iommu_domain_op.
> 
> Just in-case there is some userspace using this continue to treat
> requesting it as a NOP, but do not advertise support any more.

This also seems a bit odd, especially given that it's not actually a 
no-op; surely either it's supported and functional or it isn't?

In all honesty, I'm not personally attached to this code either way. If 
this patch had come 5 years ago, when the interface already looked like 
a bit of a dead end, I'd probably have agreed more readily. But now, 
when we're possibly mere months away from implementing the functional 
equivalent for IOMMUFD, which if done right might be able to support a 
trivial compat layer for this anyway, I just don't see what we gain from 
not at least waiting to see where that ends up. The given justification 
reads as "get rid of this code that we already know we'll need to bring 
back in some form, and half-break an unpopular VFIO ABI because it 
doesn't do *everything* that its name might imply", which just isn't 
convincing me.

Thanks,
Robin.

Will Deacon May 20, 2022, noon UTC | #15
On Fri, May 20, 2022 at 12:02:23PM +0100, Robin Murphy wrote:
> On 2022-05-10 17:55, Jason Gunthorpe via iommu wrote:
> > This control causes the ARM SMMU drivers to choose a stage 2
> > implementation for the IO pagetable (vs the stage 1 usual default),
> > however this choice has no visible impact to the VFIO user.
> 
> Oh, I should have read more carefully... this isn't entirely true. Stage 2
> has a different permission model from stage 1, so although it's arguably
> undocumented behaviour, VFIO users that know enough about the underlying
> system could use this to get write-only mappings if they so wish.

There's also an impact on combining memory attributes, but it's not hugely
clear how that impacts userspace via things like VFIO_DMA_CC_IOMMU.

> There may potentially also be visible differences in translation performance
> between stages, although I imagine that's firmly over in the niche of things
> that users might look at for system validation purposes, rather than for
> practical usefulness.
> 
> > Further qemu
> > never implemented this and no other userspace user is known.
> > 
> > The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
> > new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
> > SMMU translation services to the guest operating system" however the rest
> > of the API to set the guest table pointer for the stage 1 was never
> > completed, or at least never upstreamed, rendering this part useless dead
> > code.
> > 
> > Since the current patches to enable nested translation, aka userspace page
> > tables, rely on iommufd and will not use the enable_nesting()
> > iommu_domain_op, remove this infrastructure. However, don't cut too deep
> > into the SMMU drivers for now expecting the iommufd work to pick it up -
> > we still need to create S2 IO page tables.
> > 
> > Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
> > enable_nesting iommu_domain_op.
> > 
> > Just in-case there is some userspace using this continue to treat
> > requesting it as a NOP, but do not advertise support any more.
> 
> This also seems a bit odd, especially given that it's not actually a no-op;
> surely either it's supported and functional or it isn't?
> 
> In all honesty, I'm not personally attached to this code either way. If this
> patch had come 5 years ago, when the interface already looked like a bit of
> a dead end, I'd probably have agreed more readily. But now, when we're
> possibly mere months away from implementing the functional equivalent for
> IOMMUFD, which if done right might be able to support a trivial compat layer
> for this anyway, I just don't see what we gain from not at least waiting to
> see where that ends up. The given justification reads as "get rid of this
> code that we already know we'll need to bring back in some form, and
> half-break an unpopular VFIO ABI because it doesn't do *everything* that its
> name might imply", which just isn't convincing me.

I'm inclined to agree, although we can very easily revert this patch
when we need to bring the stuff back; it would just be a bit churny.

Will
Jason Gunthorpe May 20, 2022, 12:18 p.m. UTC | #16
On Fri, May 20, 2022 at 12:02:23PM +0100, Robin Murphy wrote:
> On 2022-05-10 17:55, Jason Gunthorpe via iommu wrote:
> > This control causes the ARM SMMU drivers to choose a stage 2
> > implementation for the IO pagetable (vs the stage 1 usual default),
> > however this choice has no visible impact to the VFIO user.
> 
> Oh, I should have read more carefully... this isn't entirely true. Stage 2
> has a different permission model from stage 1, so although it's arguably
> undocumented behaviour, VFIO users that know enough about the underlying
> system could use this to get write-only mappings if they so wish.

I think this is getting very pedantic; the intention of this was to
enable nesting, not to expose subtle differences in the ARM
architecture. I'm very doubtful something exists here that uses this
without also using the other parts.

> > Just in-case there is some userspace using this continue to treat
> > requesting it as a NOP, but do not advertise support any more.
> 
> This also seems a bit odd, especially given that it's not actually a no-op;
> surely either it's supported and functional or it isn't?

Indeed, I was loath to completely break possibly existing broken
userspace, but I agree we could just fail here as well. Normal
userspace should see the missing capability and not call the API at
all. But maybe someone somehow hardwired their VMM or something, IDK.

> In all honesty, I'm not personally attached to this code either way. If this
> patch had come 5 years ago, when the interface already looked like a bit of
> a dead end, I'd probably have agreed more readily. But now, when we're
> possibly mere months away from implementing the functional equivalent for
> IOMMUFD, which if done right might be able to support a trivial compat layer
> for this anyway,

There won't be compat for this - the generic code is not going to know
about the driver specific details of how nesting works.  The plan is
to have some new op like:

   alloc_user_domain(struct device *dev, void *params, size_t params_len, ..)

And all the special driver-specific iommu domains will be allocated
opaquely this way. The stage 1 or stage 2 choice will be specified in
the params along with any other HW specific parameters required to
hook it up properly.
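
As a rough illustration of that shape (everything here is hypothetical;
the real iommufd op name, types and parameters may well differ):

    #include <stddef.h>

    struct device;
    struct iommu_domain;

    /* Hypothetical sketch of the op described above. Each driver decodes
     * its own opaque params blob (e.g. a stage 1 vs stage 2 selector plus
     * HW-specific fields) and returns a driver-owned domain, rather than
     * the core code interpreting anything.
     */
    struct iommu_user_domain_ops {
            struct iommu_domain *(*alloc_user_domain)(struct device *dev,
                                                      const void *params,
                                                      size_t params_len);
    };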

So, the core code can't just do what VFIO_TYPE1_NESTING_IOMMU is doing
now without also being hardwired to make it work with ARM.

This is why I want to remove it now to be clear we are not going to be
accommodating this API.

> I just don't see what we gain from not at least waiting to see where
> that ends up. The given justification reads as "get rid of this code
> that we already know we'll need to bring back in some form, and
> half-break an unpopular VFIO ABI because it doesn't do *everything*
> that its name might imply", which just isn't convincing me.

No, it won't come back. The code that we will need is the driver code
in SMMU, and that wasn't touched here.

We can defer this, as long as we all agree the uAPI is going away.

But I don't see a strong argument why we should wait either.

Jason
Jason Gunthorpe May 20, 2022, 12:28 p.m. UTC | #17
On Fri, May 20, 2022 at 01:00:32PM +0100, Will Deacon wrote:
> On Fri, May 20, 2022 at 12:02:23PM +0100, Robin Murphy wrote:
> > On 2022-05-10 17:55, Jason Gunthorpe via iommu wrote:
> > > This control causes the ARM SMMU drivers to choose a stage 2
> > > implementation for the IO pagetable (vs the stage 1 usual default),
> > > however this choice has no visible impact to the VFIO user.
> > 
> > Oh, I should have read more carefully... this isn't entirely true. Stage 2
> > has a different permission model from stage 1, so although it's arguably
> > undocumented behaviour, VFIO users that know enough about the underlying
> > system could use this to get write-only mappings if they so wish.
> 
> There's also an impact on combining memory attributes, but it's not hugely
> clear how that impacts userspace via things like VFIO_DMA_CC_IOMMU.

VFIO_DMA_CC_IOMMU has now been clarified to only be about the ability
to reject device-requested no-snoop transactions, i.e. ignore the
NoSnoop bit in PCIe TLPs.

AFAICT SMMU can't implement this currently, and maybe doesn't need to.

VFIO requires the IO page tables be set up for cache-coherent operation
for ordinary device DMA, otherwise users like DPDK will not work.
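
For reference, userspace probes for that guarantee along these lines (a
minimal sketch; 'container' is an open /dev/vfio/vfio fd):

    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Minimal sketch: returns nonzero when the container enforces
     * coherency, i.e. can reject device-requested no-snoop, per the
     * clarified meaning of VFIO_DMA_CC_IOMMU above.
     */
    static int cc_enforced(int container)
    {
            return ioctl(container, VFIO_CHECK_EXTENSION,
                         VFIO_DMA_CC_IOMMU) > 0;
    }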

It is surprising to hear you say that VFIO_TYPE1_NESTING_IOMMU might
also cause the IO to become non-coherent.

Jason

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 627a3ed5ee8fd1..b901e8973bb4ea 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2724,21 +2724,6 @@  static struct iommu_group *arm_smmu_device_group(struct device *dev)
 	return group;
 }
 
-static int arm_smmu_enable_nesting(struct iommu_domain *domain)
-{
-	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
-	int ret = 0;
-
-	mutex_lock(&smmu_domain->init_mutex);
-	if (smmu_domain->smmu)
-		ret = -EPERM;
-	else
-		smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
-	mutex_unlock(&smmu_domain->init_mutex);
-
-	return ret;
-}
-
 static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
 {
 	return iommu_fwspec_add_ids(dev, args->args, 1);
@@ -2865,7 +2850,6 @@  static struct iommu_ops arm_smmu_ops = {
 		.flush_iotlb_all	= arm_smmu_flush_iotlb_all,
 		.iotlb_sync		= arm_smmu_iotlb_sync,
 		.iova_to_phys		= arm_smmu_iova_to_phys,
-		.enable_nesting		= arm_smmu_enable_nesting,
 		.free			= arm_smmu_domain_free,
 	}
 };
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 568cce590ccc13..239e6f6585b48d 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1507,21 +1507,6 @@  static struct iommu_group *arm_smmu_device_group(struct device *dev)
 	return group;
 }
 
-static int arm_smmu_enable_nesting(struct iommu_domain *domain)
-{
-	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
-	int ret = 0;
-
-	mutex_lock(&smmu_domain->init_mutex);
-	if (smmu_domain->smmu)
-		ret = -EPERM;
-	else
-		smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
-	mutex_unlock(&smmu_domain->init_mutex);
-
-	return ret;
-}
-
 static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain,
 		unsigned long quirks)
 {
@@ -1600,7 +1585,6 @@  static struct iommu_ops arm_smmu_ops = {
 		.flush_iotlb_all	= arm_smmu_flush_iotlb_all,
 		.iotlb_sync		= arm_smmu_iotlb_sync,
 		.iova_to_phys		= arm_smmu_iova_to_phys,
-		.enable_nesting		= arm_smmu_enable_nesting,
 		.set_pgtable_quirks	= arm_smmu_set_pgtable_quirks,
 		.free			= arm_smmu_domain_free,
 	}
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 857d4c2fd1a206..f33c0d569a5d03 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2561,16 +2561,6 @@  static int __init iommu_init(void)
 }
 core_initcall(iommu_init);
 
-int iommu_enable_nesting(struct iommu_domain *domain)
-{
-	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
-		return -EINVAL;
-	if (!domain->ops->enable_nesting)
-		return -EINVAL;
-	return domain->ops->enable_nesting(domain);
-}
-EXPORT_SYMBOL_GPL(iommu_enable_nesting);
-
 int iommu_set_pgtable_quirks(struct iommu_domain *domain,
 		unsigned long quirk)
 {
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 9394aa9444c10c..ff669723b0488f 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -74,7 +74,6 @@  struct vfio_iommu {
 	uint64_t		num_non_pinned_groups;
 	wait_queue_head_t	vaddr_wait;
 	bool			v2;
-	bool			nesting;
 	bool			dirty_page_tracking;
 	bool			container_open;
 	struct list_head	emulated_iommu_groups;
@@ -2207,12 +2206,6 @@  static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (!domain->domain)
 		goto out_free_domain;
 
-	if (iommu->nesting) {
-		ret = iommu_enable_nesting(domain->domain);
-		if (ret)
-			goto out_domain;
-	}
-
 	ret = iommu_attach_group(domain->domain, group->iommu_group);
 	if (ret)
 		goto out_domain;
@@ -2546,9 +2539,7 @@  static void *vfio_iommu_type1_open(unsigned long arg)
 	switch (arg) {
 	case VFIO_TYPE1_IOMMU:
 		break;
-	case VFIO_TYPE1_NESTING_IOMMU:
-		iommu->nesting = true;
-		fallthrough;
+	case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
 	case VFIO_TYPE1v2_IOMMU:
 		iommu->v2 = true;
 		break;
@@ -2634,7 +2625,6 @@  static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
 	switch (arg) {
 	case VFIO_TYPE1_IOMMU:
 	case VFIO_TYPE1v2_IOMMU:
-	case VFIO_TYPE1_NESTING_IOMMU:
 	case VFIO_UNMAP_ALL:
 	case VFIO_UPDATE_VADDR:
 		return 1;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 9208eca4b0d1ac..51cb4d3eb0d391 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -272,7 +272,6 @@  struct iommu_ops {
  * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush
  *            queue
  * @iova_to_phys: translate iova to physical address
- * @enable_nesting: Enable nesting
  * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
  * @free: Release the domain after use.
  */
@@ -300,7 +299,6 @@  struct iommu_domain_ops {
 	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
 				    dma_addr_t iova);
 
-	int (*enable_nesting)(struct iommu_domain *domain);
 	int (*set_pgtable_quirks)(struct iommu_domain *domain,
 				  unsigned long quirks);
 
@@ -496,7 +494,6 @@  extern int iommu_page_response(struct device *dev,
 extern int iommu_group_id(struct iommu_group *group);
 extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
 
-int iommu_enable_nesting(struct iommu_domain *domain);
 int iommu_set_pgtable_quirks(struct iommu_domain *domain,
 		unsigned long quirks);
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index fea86061b44e65..6e0640f0a4cad7 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -35,7 +35,7 @@ 
 #define VFIO_EEH			5
 
 /* Two-stage IOMMU */
-#define VFIO_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
+#define __VFIO_RESERVED_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
 
 #define VFIO_SPAPR_TCE_v2_IOMMU		7