diff mbox series

[v2,06/17] iommu: Add iommu_fwspec_alloc/dealloc()

Message ID 6-v2-36a0088ecaa7+22c6e-iommu_fwspec_jgg@nvidia.com (mailing list archive)
State Handled Elsewhere
Headers show
Series Solve iommu probe races around iommu_fwspec | expand

Checks

Context Check Description
conchuod/vmtest-for-next-PR fail PR summary
conchuod/patch-6-test-1 success .github/scripts/patches/build_rv32_defconfig.sh
conchuod/patch-6-test-2 fail .github/scripts/patches/build_rv64_clang_allmodconfig.sh
conchuod/patch-6-test-3 fail .github/scripts/patches/build_rv64_gcc_allmodconfig.sh
conchuod/patch-6-test-4 success .github/scripts/patches/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-6-test-5 success .github/scripts/patches/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-6-test-6 success .github/scripts/patches/checkpatch.sh
conchuod/patch-6-test-7 success .github/scripts/patches/dtb_warn_rv64.sh
conchuod/patch-6-test-8 success .github/scripts/patches/header_inline.sh
conchuod/patch-6-test-9 success .github/scripts/patches/kdoc.sh
conchuod/patch-6-test-10 success .github/scripts/patches/module_param.sh
conchuod/patch-6-test-11 success .github/scripts/patches/verify_fixes.sh
conchuod/patch-6-test-12 success .github/scripts/patches/verify_signedoff.sh

Commit Message

Jason Gunthorpe Nov. 15, 2023, 2:05 p.m. UTC
Allow fwspec to exist independently from the dev->iommu by providing
functions to allow allocating and freeing the raw struct iommu_fwspec.

Reflow the existing paths to call the new alloc/dealloc functions.

Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommu.c | 82 ++++++++++++++++++++++++++++++++-----------
 include/linux/iommu.h | 11 +++++-
 2 files changed, 72 insertions(+), 21 deletions(-)

Comments

Hector Martin Nov. 19, 2023, 8:10 a.m. UTC | #1
On 2023/11/15 23:05, Jason Gunthorpe wrote:
> Allow fwspec to exist independently from the dev->iommu by providing
> functions to allow allocating and freeing the raw struct iommu_fwspec.
> 
> Reflow the existing paths to call the new alloc/dealloc functions.
> 
> Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/iommu.c | 82 ++++++++++++++++++++++++++++++++-----------
>  include/linux/iommu.h | 11 +++++-
>  2 files changed, 72 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 18a82a20934d53..86bbb9e75c7e03 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -361,10 +361,8 @@ static void dev_iommu_free(struct device *dev)
>  	struct dev_iommu *param = dev->iommu;
>  
>  	dev->iommu = NULL;
> -	if (param->fwspec) {
> -		fwnode_handle_put(param->fwspec->iommu_fwnode);
> -		kfree(param->fwspec);
> -	}
> +	if (param->fwspec)
> +		iommu_fwspec_dealloc(param->fwspec);
>  	kfree(param);
>  }
>  
> @@ -2920,10 +2918,61 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
>  	return ops;
>  }
>  
> +static int iommu_fwspec_assign_iommu(struct iommu_fwspec *fwspec,
> +				     struct device *dev,
> +				     struct fwnode_handle *iommu_fwnode)
> +{
> +	const struct iommu_ops *ops;
> +
> +	if (fwspec->iommu_fwnode) {
> +		/*
> +		 * fwspec->iommu_fwnode is the first iommu's fwnode. In the rare
> +		 * case of multiple iommus for one device they must point to the
> +		 * same driver, checked via same ops.
> +		 */
> +		ops = iommu_ops_from_fwnode(iommu_fwnode);

This carries over a related bug from the original code: If a device has
two IOMMUs and the first one probes but the second one defers, ops will
be NULL here and the check will fail with EINVAL.

Adding a check for that case here fixes it:

		if (!ops)
			return driver_deferred_probe_check_state(dev);

With that, for the whole series:

Tested-by: Hector Martin <marcan@marcan.st>

I can't specifically test for the probe races the series intends to fix
though, since that bug we only hit extremely rarely. I'm just testing
that nothing breaks.

> +		if (fwspec->ops != ops)
> +			return -EINVAL;
> +		return 0;
> +	}
> +
> +	if (!fwspec->ops) {
> +		ops = iommu_ops_from_fwnode(iommu_fwnode);
> +		if (!ops)
> +			return driver_deferred_probe_check_state(dev);
> +		fwspec->ops = ops;
> +	}
> +
> +	of_node_get(to_of_node(iommu_fwnode));
> +	fwspec->iommu_fwnode = iommu_fwnode;
> +	return 0;
> +}
> +
> +struct iommu_fwspec *iommu_fwspec_alloc(void)
> +{
> +	struct iommu_fwspec *fwspec;
> +
> +	fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL);
> +	if (!fwspec)
> +		return ERR_PTR(-ENOMEM);
> +	return fwspec;
> +}
> +
> +void iommu_fwspec_dealloc(struct iommu_fwspec *fwspec)
> +{
> +	if (!fwspec)
> +		return;
> +
> +	if (fwspec->iommu_fwnode)
> +		fwnode_handle_put(fwspec->iommu_fwnode);
> +	kfree(fwspec);
> +}
> +
>  int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
>  		      const struct iommu_ops *ops)
>  {
>  	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	int ret;
>  
>  	if (fwspec)
>  		return ops == fwspec->ops ? 0 : -EINVAL;
> @@ -2931,29 +2980,22 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
>  	if (!dev_iommu_get(dev))
>  		return -ENOMEM;
>  
> -	fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL);
> -	if (!fwspec)
> -		return -ENOMEM;
> +	fwspec = iommu_fwspec_alloc();
> +	if (IS_ERR(fwspec))
> +		return PTR_ERR(fwspec);
>  
> -	of_node_get(to_of_node(iommu_fwnode));
> -	fwspec->iommu_fwnode = iommu_fwnode;
>  	fwspec->ops = ops;
> +	ret = iommu_fwspec_assign_iommu(fwspec, dev, iommu_fwnode);
> +	if (ret) {
> +		iommu_fwspec_dealloc(fwspec);
> +		return ret;
> +	}
> +
>  	dev_iommu_fwspec_set(dev, fwspec);
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(iommu_fwspec_init);
>  
> -void iommu_fwspec_free(struct device *dev)
> -{
> -	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> -
> -	if (fwspec) {
> -		fwnode_handle_put(fwspec->iommu_fwnode);
> -		kfree(fwspec);
> -		dev_iommu_fwspec_set(dev, NULL);
> -	}
> -}
> -EXPORT_SYMBOL_GPL(iommu_fwspec_free);
>  
>  int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids)
>  {
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index e98a4ca8f536b7..c7c68cb59aa4dc 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -813,9 +813,18 @@ struct iommu_sva {
>  	struct iommu_domain		*domain;
>  };
>  
> +struct iommu_fwspec *iommu_fwspec_alloc(void);
> +void iommu_fwspec_dealloc(struct iommu_fwspec *fwspec);
> +
>  int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
>  		      const struct iommu_ops *ops);
> -void iommu_fwspec_free(struct device *dev);
> +static inline void iommu_fwspec_free(struct device *dev)
> +{
> +	if (!dev->iommu)
> +		return;
> +	iommu_fwspec_dealloc(dev->iommu->fwspec);
> +	dev->iommu->fwspec = NULL;
> +}
>  int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids);
>  const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode);
>  

- Hector
Hector Martin Nov. 19, 2023, 9:19 a.m. UTC | #2
On 2023/11/19 17:10, Hector Martin wrote:
> On 2023/11/15 23:05, Jason Gunthorpe wrote:
>> Allow fwspec to exist independently from the dev->iommu by providing
>> functions to allow allocating and freeing the raw struct iommu_fwspec.
>>
>> Reflow the existing paths to call the new alloc/dealloc functions.
>>
>> Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>> ---
>>  drivers/iommu/iommu.c | 82 ++++++++++++++++++++++++++++++++-----------
>>  include/linux/iommu.h | 11 +++++-
>>  2 files changed, 72 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index 18a82a20934d53..86bbb9e75c7e03 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -361,10 +361,8 @@ static void dev_iommu_free(struct device *dev)
>>  	struct dev_iommu *param = dev->iommu;
>>  
>>  	dev->iommu = NULL;
>> -	if (param->fwspec) {
>> -		fwnode_handle_put(param->fwspec->iommu_fwnode);
>> -		kfree(param->fwspec);
>> -	}
>> +	if (param->fwspec)
>> +		iommu_fwspec_dealloc(param->fwspec);
>>  	kfree(param);
>>  }
>>  
>> @@ -2920,10 +2918,61 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
>>  	return ops;
>>  }
>>  
>> +static int iommu_fwspec_assign_iommu(struct iommu_fwspec *fwspec,
>> +				     struct device *dev,
>> +				     struct fwnode_handle *iommu_fwnode)
>> +{
>> +	const struct iommu_ops *ops;
>> +
>> +	if (fwspec->iommu_fwnode) {
>> +		/*
>> +		 * fwspec->iommu_fwnode is the first iommu's fwnode. In the rare
>> +		 * case of multiple iommus for one device they must point to the
>> +		 * same driver, checked via same ops.
>> +		 */
>> +		ops = iommu_ops_from_fwnode(iommu_fwnode);
> 
> This carries over a related bug from the original code: If a device has
> two IOMMUs and the first one probes but the second one defers, ops will
> be NULL here and the check will fail with EINVAL.
> 
> Adding a check for that case here fixes it:
> 
> 		if (!ops)
> 			return driver_deferred_probe_check_state(dev);
> 
> With that, for the whole series:
> 
> Tested-by: Hector Martin <marcan@marcan.st>
> 
> I can't specifically test for the probe races the series intends to fix
> though, since that bug we only hit extremely rarely. I'm just testing
> that nothing breaks.

Actually no, this fix is not sufficient. If the first IOMMU is ready
then the xlate path allocates dev->iommu, which then
__iommu_probe_device takes as a sign that all IOMMUs are ready and does
the device init. Then when the xlate comes along again after suceeding
with the second IOMMU, __iommu_probe_device sees the device is already
in a group and never initializes the second IOMMU, leaving the device
with only one IOMMU.

This patch fixes it, but honestly, at this point I have no idea how to
"properly" fix this. There is *way* too much subtlety in this whole
codepath.

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 2477dec29740..2e4baf0572e7 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2935,6 +2935,12 @@ int iommu_fwspec_of_xlate(struct iommu_fwspec
*fwspec, struct device *dev,
 	int ret;

 	ret = iommu_fwspec_assign_iommu(fwspec, dev, iommu_fwnode);
+	if (ret == -EPROBE_DEFER) {
+		mutex_lock(&iommu_probe_device_lock);
+		if (dev->iommu)
+			dev_iommu_free(dev);
+		mutex_unlock(&iommu_probe_device_lock);
+	}
 	if (ret)
 		return ret;

> 
>> +		if (fwspec->ops != ops)
>> +			return -EINVAL;
>> +		return 0;
>> +	}
>> +
>> +	if (!fwspec->ops) {
>> +		ops = iommu_ops_from_fwnode(iommu_fwnode);
>> +		if (!ops)
>> +			return driver_deferred_probe_check_state(dev);
>> +		fwspec->ops = ops;
>> +	}
>> +
>> +	of_node_get(to_of_node(iommu_fwnode));
>> +	fwspec->iommu_fwnode = iommu_fwnode;
>> +	return 0;
>> +}
>> +
>> +struct iommu_fwspec *iommu_fwspec_alloc(void)
>> +{
>> +	struct iommu_fwspec *fwspec;
>> +
>> +	fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL);
>> +	if (!fwspec)
>> +		return ERR_PTR(-ENOMEM);
>> +	return fwspec;
>> +}
>> +
>> +void iommu_fwspec_dealloc(struct iommu_fwspec *fwspec)
>> +{
>> +	if (!fwspec)
>> +		return;
>> +
>> +	if (fwspec->iommu_fwnode)
>> +		fwnode_handle_put(fwspec->iommu_fwnode);
>> +	kfree(fwspec);
>> +}
>> +
>>  int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
>>  		      const struct iommu_ops *ops)
>>  {
>>  	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
>> +	int ret;
>>  
>>  	if (fwspec)
>>  		return ops == fwspec->ops ? 0 : -EINVAL;
>> @@ -2931,29 +2980,22 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
>>  	if (!dev_iommu_get(dev))
>>  		return -ENOMEM;
>>  
>> -	fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL);
>> -	if (!fwspec)
>> -		return -ENOMEM;
>> +	fwspec = iommu_fwspec_alloc();
>> +	if (IS_ERR(fwspec))
>> +		return PTR_ERR(fwspec);
>>  
>> -	of_node_get(to_of_node(iommu_fwnode));
>> -	fwspec->iommu_fwnode = iommu_fwnode;
>>  	fwspec->ops = ops;
>> +	ret = iommu_fwspec_assign_iommu(fwspec, dev, iommu_fwnode);
>> +	if (ret) {
>> +		iommu_fwspec_dealloc(fwspec);
>> +		return ret;
>> +	}
>> +
>>  	dev_iommu_fwspec_set(dev, fwspec);
>>  	return 0;
>>  }
>>  EXPORT_SYMBOL_GPL(iommu_fwspec_init);
>>  
>> -void iommu_fwspec_free(struct device *dev)
>> -{
>> -	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
>> -
>> -	if (fwspec) {
>> -		fwnode_handle_put(fwspec->iommu_fwnode);
>> -		kfree(fwspec);
>> -		dev_iommu_fwspec_set(dev, NULL);
>> -	}
>> -}
>> -EXPORT_SYMBOL_GPL(iommu_fwspec_free);
>>  
>>  int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids)
>>  {
>> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
>> index e98a4ca8f536b7..c7c68cb59aa4dc 100644
>> --- a/include/linux/iommu.h
>> +++ b/include/linux/iommu.h
>> @@ -813,9 +813,18 @@ struct iommu_sva {
>>  	struct iommu_domain		*domain;
>>  };
>>  
>> +struct iommu_fwspec *iommu_fwspec_alloc(void);
>> +void iommu_fwspec_dealloc(struct iommu_fwspec *fwspec);
>> +
>>  int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
>>  		      const struct iommu_ops *ops);
>> -void iommu_fwspec_free(struct device *dev);
>> +static inline void iommu_fwspec_free(struct device *dev)
>> +{
>> +	if (!dev->iommu)
>> +		return;
>> +	iommu_fwspec_dealloc(dev->iommu->fwspec);
>> +	dev->iommu->fwspec = NULL;
>> +}
>>  int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids);
>>  const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode);
>>  
> 
> - Hector
> 
> 

- Hector
Jason Gunthorpe Nov. 19, 2023, 2:13 p.m. UTC | #3
On Sun, Nov 19, 2023 at 06:19:43PM +0900, Hector Martin wrote:
> >> +static int iommu_fwspec_assign_iommu(struct iommu_fwspec *fwspec,
> >> +				     struct device *dev,
> >> +				     struct fwnode_handle *iommu_fwnode)
> >> +{
> >> +	const struct iommu_ops *ops;
> >> +
> >> +	if (fwspec->iommu_fwnode) {
> >> +		/*
> >> +		 * fwspec->iommu_fwnode is the first iommu's fwnode. In the rare
> >> +		 * case of multiple iommus for one device they must point to the
> >> +		 * same driver, checked via same ops.
> >> +		 */
> >> +		ops = iommu_ops_from_fwnode(iommu_fwnode);
> > 
> > This carries over a related bug from the original code: If a device has
> > two IOMMUs and the first one probes but the second one defers, ops will
> > be NULL here and the check will fail with EINVAL.
> > 
> > Adding a check for that case here fixes it:
> > 
> > 		if (!ops)
> > 			return driver_deferred_probe_check_state(dev);

Yes!

> > With that, for the whole series:
> > 
> > Tested-by: Hector Martin <marcan@marcan.st>
> > 
> > I can't specifically test for the probe races the series intends to fix
> > though, since that bug we only hit extremely rarely. I'm just testing
> > that nothing breaks.
> 
> Actually no, this fix is not sufficient. If the first IOMMU is ready
> then the xlate path allocates dev->iommu, which then
> __iommu_probe_device takes as a sign that all IOMMUs are ready and does
> the device init.

It doesn't.. The code there is:

	if (!fwspec && dev->iommu)
		fwspec = dev->iommu->fwspec;
	if (fwspec)
		ops = fwspec->ops;
	else
		ops = dev->bus->iommu_ops;
	if (!ops) {
		ret = -ENODEV;
		goto out_unlock;
	}

Which is sensitive only to !NULL fwspec, and if EPROBE_DEFER is
returned fwspec will be freed and dev->iommu->fwspec will be NULL
here.

In the NULL case it does a 'bus probe' with a NULL fwspec and all the
fwspec drivers return immediately from their probe functions.

Did I miss something?

> Then when the xlate comes along again after suceeding
> with the second IOMMU, __iommu_probe_device sees the device is already
> in a group and never initializes the second IOMMU, leaving the device
> with only one IOMMU.

This should be fixed by the first hunk to check every iommu and fail?

BTW, do you have a systems with same device attached to multiple
iommus?

I've noticed another bug here, many drivers don't actually support
differing iommu instances and nothing seems to check it..

Thanks,
Jason
Hector Martin Nov. 21, 2023, 6:47 a.m. UTC | #4
On 2023/11/19 23:13, Jason Gunthorpe wrote:
> On Sun, Nov 19, 2023 at 06:19:43PM +0900, Hector Martin wrote:
>>>> +static int iommu_fwspec_assign_iommu(struct iommu_fwspec *fwspec,
>>>> +				     struct device *dev,
>>>> +				     struct fwnode_handle *iommu_fwnode)
>>>> +{
>>>> +	const struct iommu_ops *ops;
>>>> +
>>>> +	if (fwspec->iommu_fwnode) {
>>>> +		/*
>>>> +		 * fwspec->iommu_fwnode is the first iommu's fwnode. In the rare
>>>> +		 * case of multiple iommus for one device they must point to the
>>>> +		 * same driver, checked via same ops.
>>>> +		 */
>>>> +		ops = iommu_ops_from_fwnode(iommu_fwnode);
>>>
>>> This carries over a related bug from the original code: If a device has
>>> two IOMMUs and the first one probes but the second one defers, ops will
>>> be NULL here and the check will fail with EINVAL.
>>>
>>> Adding a check for that case here fixes it:
>>>
>>> 		if (!ops)
>>> 			return driver_deferred_probe_check_state(dev);
> 
> Yes!
> 
>>> With that, for the whole series:
>>>
>>> Tested-by: Hector Martin <marcan@marcan.st>
>>>
>>> I can't specifically test for the probe races the series intends to fix
>>> though, since that bug we only hit extremely rarely. I'm just testing
>>> that nothing breaks.
>>
>> Actually no, this fix is not sufficient. If the first IOMMU is ready
>> then the xlate path allocates dev->iommu, which then
>> __iommu_probe_device takes as a sign that all IOMMUs are ready and does
>> the device init.
> 
> It doesn't.. The code there is:
> 
> 	if (!fwspec && dev->iommu)
> 		fwspec = dev->iommu->fwspec;
> 	if (fwspec)
> 		ops = fwspec->ops;
> 	else
> 		ops = dev->bus->iommu_ops;
> 	if (!ops) {
> 		ret = -ENODEV;
> 		goto out_unlock;
> 	}
> 
> Which is sensitive only to !NULL fwspec, and if EPROBE_DEFER is
> returned fwspec will be freed and dev->iommu->fwspec will be NULL
> here.
> 
> In the NULL case it does a 'bus probe' with a NULL fwspec and all the
> fwspec drivers return immediately from their probe functions.
> 
> Did I miss something?

apple_dart is not a fwspec driver and doesn't do that :-)

> 
>> Then when the xlate comes along again after suceeding
>> with the second IOMMU, __iommu_probe_device sees the device is already
>> in a group and never initializes the second IOMMU, leaving the device
>> with only one IOMMU.
> 
> This should be fixed by the first hunk to check every iommu and fail?
> 
> BTW, do you have a systems with same device attached to multiple
> iommus?

Yes, Apple ARM64 machines all have multiple ganged IOMMUs for certain
devices (USB and ISP). We also attach all display IOMMUs to the global
virtual display-subsystem device to handle framebuffer mappings, instead
of trying to dynamically map them to a bunch of individual display
controllers (which is a lot more painful). That last one is what
reliably reproduces this problem, display breaks without both previous
patches ever since we started supporting more than one display output.
The first one is not enough.

> I've noticed another bug here, many drivers don't actually support
> differing iommu instances and nothing seems to check it..

apple-dart does (as long as all the IOMMUs are using that driver).

> 
> Thanks,
> Jason
> 
> 

- Hector
Jason Gunthorpe Nov. 21, 2023, 4 p.m. UTC | #5
On Tue, Nov 21, 2023 at 03:47:48PM +0900, Hector Martin wrote:
> > Which is sensitive only to !NULL fwspec, and if EPROBE_DEFER is
> > returned fwspec will be freed and dev->iommu->fwspec will be NULL
> > here.
> > 
> > In the NULL case it does a 'bus probe' with a NULL fwspec and all the
> > fwspec drivers return immediately from their probe functions.
> > 
> > Did I miss something?
> 
> apple_dart is not a fwspec driver and doesn't do that :-)

It implements of_xlate that makes it a driver using the fwspec probe
path.

The issue is in apple-dart. Its logic for avoiding bus probe vs
fwspec probe is not correct.

It does:

static int apple_dart_of_xlate(struct device *dev, struct of_phandle_args *args)
{
 [..]
 	dev_iommu_priv_set(dev, cfg);


Then:

static struct iommu_device *apple_dart_probe_device(struct device *dev)
{
	struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
	struct apple_dart_stream_map *stream_map;
	int i;

	if (!cfg)
		return ERR_PTR(-ENODEV);

Which leaks the cfg memory on rare error cases and wrongly allows the
driver to probe without a fwspec, which I think is what you are
hitting.

It should be

       if (!dev_iommu_fwspec_get(dev) || !cfg)
		return ERR_PTR(-ENODEV);

To ensure the driver never probes on the bus path.

Clearing the dev->iommu in the core code has the side effect of
clearing (and leaking) the cfg which would hide this issue.

Jason
Hector Martin Nov. 23, 2023, 9:08 a.m. UTC | #6
On 2023/11/22 1:00, Jason Gunthorpe wrote:
> On Tue, Nov 21, 2023 at 03:47:48PM +0900, Hector Martin wrote:
>>> Which is sensitive only to !NULL fwspec, and if EPROBE_DEFER is
>>> returned fwspec will be freed and dev->iommu->fwspec will be NULL
>>> here.
>>>
>>> In the NULL case it does a 'bus probe' with a NULL fwspec and all the
>>> fwspec drivers return immediately from their probe functions.
>>>
>>> Did I miss something?
>>
>> apple_dart is not a fwspec driver and doesn't do that :-)
> 
> It implements of_xlate that makes it a driver using the fwspec probe
> path.
> 
> The issue is in apple-dart. Its logic for avoiding bus probe vs
> fwspec probe is not correct.
> 
> It does:
> 
> static int apple_dart_of_xlate(struct device *dev, struct of_phandle_args *args)
> {
>  [..]
>  	dev_iommu_priv_set(dev, cfg);
> 
> 
> Then:
> 
> static struct iommu_device *apple_dart_probe_device(struct device *dev)
> {
> 	struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
> 	struct apple_dart_stream_map *stream_map;
> 	int i;
> 
> 	if (!cfg)
> 		return ERR_PTR(-ENODEV);
> 
> Which leaks the cfg memory on rare error cases and wrongly allows the
> driver to probe without a fwspec, which I think is what you are
> hitting.
> 
> It should be
> 
>        if (!dev_iommu_fwspec_get(dev) || !cfg)
> 		return ERR_PTR(-ENODEV);
> 
> To ensure the driver never probes on the bus path.
> 
> Clearing the dev->iommu in the core code has the side effect of
> clearing (and leaking) the cfg which would hide this issue.

Aha! Yes, that makes it work with only the first change. I'll throw the
apple-dart fix into our tree (and submit it once I get to clearing out
the branch; the affected consumer driver isn't upstream yet so this
isn't particularly urgent).

- Hector
diff mbox series

Patch

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 18a82a20934d53..86bbb9e75c7e03 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -361,10 +361,8 @@  static void dev_iommu_free(struct device *dev)
 	struct dev_iommu *param = dev->iommu;
 
 	dev->iommu = NULL;
-	if (param->fwspec) {
-		fwnode_handle_put(param->fwspec->iommu_fwnode);
-		kfree(param->fwspec);
-	}
+	if (param->fwspec)
+		iommu_fwspec_dealloc(param->fwspec);
 	kfree(param);
 }
 
@@ -2920,10 +2918,61 @@  const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
 	return ops;
 }
 
+static int iommu_fwspec_assign_iommu(struct iommu_fwspec *fwspec,
+				     struct device *dev,
+				     struct fwnode_handle *iommu_fwnode)
+{
+	const struct iommu_ops *ops;
+
+	if (fwspec->iommu_fwnode) {
+		/*
+		 * fwspec->iommu_fwnode is the first iommu's fwnode. In the rare
+		 * case of multiple iommus for one device they must point to the
+		 * same driver, checked via same ops.
+		 */
+		ops = iommu_ops_from_fwnode(iommu_fwnode);
+		if (fwspec->ops != ops)
+			return -EINVAL;
+		return 0;
+	}
+
+	if (!fwspec->ops) {
+		ops = iommu_ops_from_fwnode(iommu_fwnode);
+		if (!ops)
+			return driver_deferred_probe_check_state(dev);
+		fwspec->ops = ops;
+	}
+
+	of_node_get(to_of_node(iommu_fwnode));
+	fwspec->iommu_fwnode = iommu_fwnode;
+	return 0;
+}
+
+struct iommu_fwspec *iommu_fwspec_alloc(void)
+{
+	struct iommu_fwspec *fwspec;
+
+	fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL);
+	if (!fwspec)
+		return ERR_PTR(-ENOMEM);
+	return fwspec;
+}
+
+void iommu_fwspec_dealloc(struct iommu_fwspec *fwspec)
+{
+	if (!fwspec)
+		return;
+
+	if (fwspec->iommu_fwnode)
+		fwnode_handle_put(fwspec->iommu_fwnode);
+	kfree(fwspec);
+}
+
 int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
 		      const struct iommu_ops *ops)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	int ret;
 
 	if (fwspec)
 		return ops == fwspec->ops ? 0 : -EINVAL;
@@ -2931,29 +2980,22 @@  int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
 	if (!dev_iommu_get(dev))
 		return -ENOMEM;
 
-	fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL);
-	if (!fwspec)
-		return -ENOMEM;
+	fwspec = iommu_fwspec_alloc();
+	if (IS_ERR(fwspec))
+		return PTR_ERR(fwspec);
 
-	of_node_get(to_of_node(iommu_fwnode));
-	fwspec->iommu_fwnode = iommu_fwnode;
 	fwspec->ops = ops;
+	ret = iommu_fwspec_assign_iommu(fwspec, dev, iommu_fwnode);
+	if (ret) {
+		iommu_fwspec_dealloc(fwspec);
+		return ret;
+	}
+
 	dev_iommu_fwspec_set(dev, fwspec);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_fwspec_init);
 
-void iommu_fwspec_free(struct device *dev)
-{
-	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-
-	if (fwspec) {
-		fwnode_handle_put(fwspec->iommu_fwnode);
-		kfree(fwspec);
-		dev_iommu_fwspec_set(dev, NULL);
-	}
-}
-EXPORT_SYMBOL_GPL(iommu_fwspec_free);
 
 int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index e98a4ca8f536b7..c7c68cb59aa4dc 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -813,9 +813,18 @@  struct iommu_sva {
 	struct iommu_domain		*domain;
 };
 
+struct iommu_fwspec *iommu_fwspec_alloc(void);
+void iommu_fwspec_dealloc(struct iommu_fwspec *fwspec);
+
 int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
 		      const struct iommu_ops *ops);
-void iommu_fwspec_free(struct device *dev);
+static inline void iommu_fwspec_free(struct device *dev)
+{
+	if (!dev->iommu)
+		return;
+	iommu_fwspec_dealloc(dev->iommu->fwspec);
+	dev->iommu->fwspec = NULL;
+}
 int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids);
 const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode);