diff mbox series

iommu/arm: Cleanup resources in case of probe error path

Message ID 20210608164559.204023-1-ameynarkhede03@gmail.com (mailing list archive)
State New, archived
Headers show
Series iommu/arm: Cleanup resources in case of probe error path | expand

Commit Message

ameynarkhede03 June 8, 2021, 4:45 p.m. UTC
If device registration fails, remove sysfs attribute
and if setting bus callbacks fails, unregister the device
and cleanup the sysfs attribute.

Signed-off-by: Amey Narkhede <ameynarkhede03@gmail.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 14 ++++++++++++--
 drivers/iommu/arm/arm-smmu/arm-smmu.c       | 15 ++++++++++++---
 drivers/iommu/arm/arm-smmu/qcom_iommu.c     | 13 +++++++++++--
 3 files changed, 35 insertions(+), 7 deletions(-)

--
2.31.1

Comments

Will Deacon June 8, 2021, 5:16 p.m. UTC | #1
On Tue, 8 Jun 2021 22:15:59 +0530, Amey Narkhede wrote:
> If device registration fails, remove sysfs attribute
> and if setting bus callbacks fails, unregister the device
> and cleanup the sysfs attribute.

Applied to will (for-joerg/arm-smmu/updates), thanks!

[1/1] iommu/arm: Cleanup resources in case of probe error path
      https://git.kernel.org/will/c/249c9dc6aa0d

Cheers,
Marek Szyprowski June 30, 2021, 12:48 p.m. UTC | #2
Hi,

On 08.06.2021 18:45, Amey Narkhede wrote:
> If device registration fails, remove sysfs attribute
> and if setting bus callbacks fails, unregister the device
> and cleanup the sysfs attribute.
>
> Signed-off-by: Amey Narkhede <ameynarkhede03@gmail.com>

This patch landed in linux-next some time ago as commit 249c9dc6aa0d 
("iommu/arm: Cleanup resources in case of probe error path"). After 
bisecting and some manual searching I finally found that it is 
responsible for breaking s2idle on DragonBoard 410c. Here is the log 
(captured with no_console_suspend):

# time rtcwake -s10 -mmem
rtcwake: wakeup from "mem" using /dev/rtc0 at Thu Jan  1 00:02:13 1970
PM: suspend entry (s2idle)
Filesystems sync: 0.002 seconds
Freezing user space processes ... (elapsed 0.006 seconds) done.
OOM killer disabled.
Freezing remaining freezable tasks ... (elapsed 0.004 seconds) done.
Unable to handle kernel NULL pointer dereference at virtual address 
0000000000000070
Mem abort info:
   ESR = 0x96000006
   EC = 0x25: DABT (current EL), IL = 32 bits
   SET = 0, FnV = 0
   EA = 0, S1PTW = 0
   FSC = 0x06: level 2 translation fault
Data abort info:
   ISV = 0, ISS = 0x00000006
   CM = 0, WnR = 0
user pgtable: 4k pages, 48-bit VAs, pgdp=000000008ad08000
[0000000000000070] pgd=0800000085c3c003, p4d=0800000085c3c003, 
pud=0800000088dcf003, pmd=0000000000000000
Internal error: Oops: 96000006 [#1] PREEMPT SMP
Modules linked in: bluetooth ecdh_generic ecc rfkill ipv6 ax88796b 
venus_enc venus_dec videobuf2_dma_contig asix crct10dif_ce adv7511 
snd_soc_msm8916_analog qcom_spmi_temp_alarm rtc_pm8xxx qcom_pon 
qcom_camss qcom_spmi_vadc videobuf2_dma_sg qcom_vadc_common msm 
venus_core v4l2_fwnode v4l2_async snd_soc_msm8916_digital 
videobuf2_memops snd_soc_lpass_apq8016 snd_soc_lpass_cpu v4l2_mem2mem 
snd_soc_lpass_platform snd_soc_apq8016_sbc videobuf2_v4l2 
snd_soc_qcom_common qcom_rng videobuf2_common i2c_qcom_cci qnoc_msm8916 
videodev mc icc_smd_rpm mdt_loader socinfo display_connector rmtfs_mem
CPU: 1 PID: 1522 Comm: rtcwake Not tainted 5.13.0-next-20210629 #3592
Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
pc : msm_runtime_suspend+0x1c/0x60 [msm]
lr : msm_pm_suspend+0x18/0x38 [msm]
...
Call trace:
  msm_runtime_suspend+0x1c/0x60 [msm]
  msm_pm_suspend+0x18/0x38 [msm]
  dpm_run_callback+0x84/0x378
  __device_suspend+0x118/0x680
  dpm_suspend+0x150/0x4f0
  dpm_suspend_start+0x98/0xa0
  suspend_devices_and_enter+0xfc/0xaf0
  pm_suspend+0x2b0/0x3d0
  state_store+0x84/0x108
  kobj_attr_store+0x14/0x28
  sysfs_kf_write+0x60/0x70
  kernfs_fop_write_iter+0x124/0x1a8
  new_sync_write+0xe8/0x1b0
  vfs_write+0x1e8/0x450
  ksys_write+0x64/0xf0
  __arm64_sys_write+0x14/0x20
  invoke_syscall+0x40/0xf8
  el0_svc_common+0x60/0x100
  do_el0_svc_compat+0x1c/0x48
  el0_svc_compat+0x20/0x30
  el0t_32_sync_handler+0xec/0x140
  el0t_32_sync+0x168/0x16c
Code: 910003fd f9000bf3 f9403c02 52800040 (f9403842)
---[ end trace 215b72fcd7026947 ]---

Reverting it on top of linux-next fixes s2idle oepration on that board.

> ---
>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 14 ++++++++++++--
>   drivers/iommu/arm/arm-smmu/arm-smmu.c       | 15 ++++++++++++---
>   drivers/iommu/arm/arm-smmu/qcom_iommu.c     | 13 +++++++++++--
>   3 files changed, 35 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 54b2f27b81d4..de2499754025 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -3669,10 +3669,20 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
>   	ret = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev);
>   	if (ret) {
>   		dev_err(dev, "Failed to register iommu\n");
> -		return ret;
> +		goto err_sysfs_remove;
>   	}
>
> -	return arm_smmu_set_bus_ops(&arm_smmu_ops);
> +	ret = arm_smmu_set_bus_ops(&arm_smmu_ops);
> +	if (ret)
> +		goto err_unregister_device;
> +
> +	return 0;
> +
> +err_unregister_device:
> +	iommu_device_unregister(&smmu->iommu);
> +err_sysfs_remove:
> +	iommu_device_sysfs_remove(&smmu->iommu);
> +	return ret;
>   }
>
>   static int arm_smmu_device_remove(struct platform_device *pdev)
> diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> index 6f72c4d208ca..88a3023676ce 100644
> --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
> +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> @@ -2164,7 +2164,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
>   	err = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev);
>   	if (err) {
>   		dev_err(dev, "Failed to register iommu\n");
> -		return err;
> +		goto err_sysfs_remove;
>   	}
>
>   	platform_set_drvdata(pdev, smmu);
> @@ -2187,10 +2187,19 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
>   	 * any device which might need it, so we want the bus ops in place
>   	 * ready to handle default domain setup as soon as any SMMU exists.
>   	 */
> -	if (!using_legacy_binding)
> -		return arm_smmu_bus_init(&arm_smmu_ops);
> +	if (!using_legacy_binding) {
> +		err = arm_smmu_bus_init(&arm_smmu_ops);
> +		if (err)
> +			goto err_unregister_device;
> +	}
>
>   	return 0;
> +
> +err_unregister_device:
> +	iommu_device_unregister(&smmu->iommu);
> +err_sysfs_remove:
> +	iommu_device_sysfs_remove(&smmu->iommu);
> +	return err;
>   }
>
>   static int arm_smmu_device_remove(struct platform_device *pdev)
> diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
> index 4294abe389b2..b785d9fb7602 100644
> --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c
> +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
> @@ -850,10 +850,12 @@ static int qcom_iommu_device_probe(struct platform_device *pdev)
>   	ret = iommu_device_register(&qcom_iommu->iommu, &qcom_iommu_ops, dev);
>   	if (ret) {
>   		dev_err(dev, "Failed to register iommu\n");
> -		return ret;
> +		goto err_sysfs_remove;
>   	}
>
> -	bus_set_iommu(&platform_bus_type, &qcom_iommu_ops);
> +	ret = bus_set_iommu(&platform_bus_type, &qcom_iommu_ops);
> +	if (ret)
> +		goto err_unregister_device;
>
>   	if (qcom_iommu->local_base) {
>   		pm_runtime_get_sync(dev);
> @@ -862,6 +864,13 @@ static int qcom_iommu_device_probe(struct platform_device *pdev)
>   	}
>
>   	return 0;
> +
> +err_unregister_device:
> +	iommu_device_unregister(&qcom_iommu->iommu);
> +
> +err_sysfs_remove:
> +	iommu_device_sysfs_remove(&qcom_iommu->iommu);
> +	return ret;
>   }
>
>   static int qcom_iommu_device_remove(struct platform_device *pdev)
> --
> 2.31.1
>
Best regards
Will Deacon June 30, 2021, 12:59 p.m. UTC | #3
On Wed, Jun 30, 2021 at 02:48:15PM +0200, Marek Szyprowski wrote:
> On 08.06.2021 18:45, Amey Narkhede wrote:
> > If device registration fails, remove sysfs attribute
> > and if setting bus callbacks fails, unregister the device
> > and cleanup the sysfs attribute.
> >
> > Signed-off-by: Amey Narkhede <ameynarkhede03@gmail.com>
> 
> This patch landed in linux-next some time ago as commit 249c9dc6aa0d 
> ("iommu/arm: Cleanup resources in case of probe error path"). After 
> bisecting and some manual searching I finally found that it is 
> responsible for breaking s2idle on DragonBoard 410c. Here is the log 
> (captured with no_console_suspend):
> 
> # time rtcwake -s10 -mmem
> rtcwake: wakeup from "mem" using /dev/rtc0 at Thu Jan  1 00:02:13 1970
> PM: suspend entry (s2idle)
> Filesystems sync: 0.002 seconds
> Freezing user space processes ... (elapsed 0.006 seconds) done.
> OOM killer disabled.
> Freezing remaining freezable tasks ... (elapsed 0.004 seconds) done.
> Unable to handle kernel NULL pointer dereference at virtual address 
> 0000000000000070
> Mem abort info:
>    ESR = 0x96000006
>    EC = 0x25: DABT (current EL), IL = 32 bits
>    SET = 0, FnV = 0
>    EA = 0, S1PTW = 0
>    FSC = 0x06: level 2 translation fault
> Data abort info:
>    ISV = 0, ISS = 0x00000006
>    CM = 0, WnR = 0
> user pgtable: 4k pages, 48-bit VAs, pgdp=000000008ad08000
> [0000000000000070] pgd=0800000085c3c003, p4d=0800000085c3c003, 
> pud=0800000088dcf003, pmd=0000000000000000
> Internal error: Oops: 96000006 [#1] PREEMPT SMP
> Modules linked in: bluetooth ecdh_generic ecc rfkill ipv6 ax88796b 
> venus_enc venus_dec videobuf2_dma_contig asix crct10dif_ce adv7511 
> snd_soc_msm8916_analog qcom_spmi_temp_alarm rtc_pm8xxx qcom_pon 
> qcom_camss qcom_spmi_vadc videobuf2_dma_sg qcom_vadc_common msm 
> venus_core v4l2_fwnode v4l2_async snd_soc_msm8916_digital 
> videobuf2_memops snd_soc_lpass_apq8016 snd_soc_lpass_cpu v4l2_mem2mem 
> snd_soc_lpass_platform snd_soc_apq8016_sbc videobuf2_v4l2 
> snd_soc_qcom_common qcom_rng videobuf2_common i2c_qcom_cci qnoc_msm8916 
> videodev mc icc_smd_rpm mdt_loader socinfo display_connector rmtfs_mem
> CPU: 1 PID: 1522 Comm: rtcwake Not tainted 5.13.0-next-20210629 #3592
> Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
> pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
> pc : msm_runtime_suspend+0x1c/0x60 [msm]
> lr : msm_pm_suspend+0x18/0x38 [msm]
> ...
> Call trace:
>   msm_runtime_suspend+0x1c/0x60 [msm]
>   msm_pm_suspend+0x18/0x38 [msm]
>   dpm_run_callback+0x84/0x378

I wonder if we're missing a pm_runtime_disable() call on the failure path?
i.e. something like the diff below...

Will

--->8

diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
index 25ed444ff94d..ce8f354755d0 100644
--- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c
+++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
@@ -836,14 +836,14 @@ static int qcom_iommu_device_probe(struct platform_device *pdev)
        ret = devm_of_platform_populate(dev);
        if (ret) {
                dev_err(dev, "Failed to populate iommu contexts\n");
-               return ret;
+               goto err_pm_disable;
        }
 
        ret = iommu_device_sysfs_add(&qcom_iommu->iommu, dev, NULL,
                                     dev_name(dev));
        if (ret) {
                dev_err(dev, "Failed to register iommu in sysfs\n");
-               return ret;
+               goto err_pm_disable;
        }
 
        ret = iommu_device_register(&qcom_iommu->iommu, &qcom_iommu_ops, dev);
@@ -869,6 +869,9 @@ static int qcom_iommu_device_probe(struct platform_device *pdev)
 
 err_sysfs_remove:
        iommu_device_sysfs_remove(&qcom_iommu->iommu);
+
+err_pm_disable:
+       pm_runtime_disable(dev);
        return ret;
 }
Marek Szyprowski June 30, 2021, 1:48 p.m. UTC | #4
On 30.06.2021 14:59, Will Deacon wrote:
> On Wed, Jun 30, 2021 at 02:48:15PM +0200, Marek Szyprowski wrote:
>> On 08.06.2021 18:45, Amey Narkhede wrote:
>>> If device registration fails, remove sysfs attribute
>>> and if setting bus callbacks fails, unregister the device
>>> and cleanup the sysfs attribute.
>>>
>>> Signed-off-by: Amey Narkhede <ameynarkhede03@gmail.com>
>> This patch landed in linux-next some time ago as commit 249c9dc6aa0d
>> ("iommu/arm: Cleanup resources in case of probe error path"). After
>> bisecting and some manual searching I finally found that it is
>> responsible for breaking s2idle on DragonBoard 410c. Here is the log
>> (captured with no_console_suspend):
>>
>> # time rtcwake -s10 -mmem
>> rtcwake: wakeup from "mem" using /dev/rtc0 at Thu Jan  1 00:02:13 1970
>> PM: suspend entry (s2idle)
>> Filesystems sync: 0.002 seconds
>> Freezing user space processes ... (elapsed 0.006 seconds) done.
>> OOM killer disabled.
>> Freezing remaining freezable tasks ... (elapsed 0.004 seconds) done.
>> Unable to handle kernel NULL pointer dereference at virtual address
>> 0000000000000070
>> Mem abort info:
>>     ESR = 0x96000006
>>     EC = 0x25: DABT (current EL), IL = 32 bits
>>     SET = 0, FnV = 0
>>     EA = 0, S1PTW = 0
>>     FSC = 0x06: level 2 translation fault
>> Data abort info:
>>     ISV = 0, ISS = 0x00000006
>>     CM = 0, WnR = 0
>> user pgtable: 4k pages, 48-bit VAs, pgdp=000000008ad08000
>> [0000000000000070] pgd=0800000085c3c003, p4d=0800000085c3c003,
>> pud=0800000088dcf003, pmd=0000000000000000
>> Internal error: Oops: 96000006 [#1] PREEMPT SMP
>> Modules linked in: bluetooth ecdh_generic ecc rfkill ipv6 ax88796b
>> venus_enc venus_dec videobuf2_dma_contig asix crct10dif_ce adv7511
>> snd_soc_msm8916_analog qcom_spmi_temp_alarm rtc_pm8xxx qcom_pon
>> qcom_camss qcom_spmi_vadc videobuf2_dma_sg qcom_vadc_common msm
>> venus_core v4l2_fwnode v4l2_async snd_soc_msm8916_digital
>> videobuf2_memops snd_soc_lpass_apq8016 snd_soc_lpass_cpu v4l2_mem2mem
>> snd_soc_lpass_platform snd_soc_apq8016_sbc videobuf2_v4l2
>> snd_soc_qcom_common qcom_rng videobuf2_common i2c_qcom_cci qnoc_msm8916
>> videodev mc icc_smd_rpm mdt_loader socinfo display_connector rmtfs_mem
>> CPU: 1 PID: 1522 Comm: rtcwake Not tainted 5.13.0-next-20210629 #3592
>> Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
>> pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
>> pc : msm_runtime_suspend+0x1c/0x60 [msm]
>> lr : msm_pm_suspend+0x18/0x38 [msm]
>> ...
>> Call trace:
>>    msm_runtime_suspend+0x1c/0x60 [msm]
>>    msm_pm_suspend+0x18/0x38 [msm]
>>    dpm_run_callback+0x84/0x378
> I wonder if we're missing a pm_runtime_disable() call on the failure path?
> i.e. something like the diff below...

I've checked and it doesn't fix anything.

> Will
>
> --->8
>
> diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
> index 25ed444ff94d..ce8f354755d0 100644
> --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c
> +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
> @@ -836,14 +836,14 @@ static int qcom_iommu_device_probe(struct platform_device *pdev)
>          ret = devm_of_platform_populate(dev);
>          if (ret) {
>                  dev_err(dev, "Failed to populate iommu contexts\n");
> -               return ret;
> +               goto err_pm_disable;
>          }
>   
>          ret = iommu_device_sysfs_add(&qcom_iommu->iommu, dev, NULL,
>                                       dev_name(dev));
>          if (ret) {
>                  dev_err(dev, "Failed to register iommu in sysfs\n");
> -               return ret;
> +               goto err_pm_disable;
>          }
>   
>          ret = iommu_device_register(&qcom_iommu->iommu, &qcom_iommu_ops, dev);
> @@ -869,6 +869,9 @@ static int qcom_iommu_device_probe(struct platform_device *pdev)
>   
>   err_sysfs_remove:
>          iommu_device_sysfs_remove(&qcom_iommu->iommu);
> +
> +err_pm_disable:
> +       pm_runtime_disable(dev);
>          return ret;
>   }
>
Best regards
Robin Murphy June 30, 2021, 2:01 p.m. UTC | #5
On 2021-06-30 14:48, Marek Szyprowski wrote:
> On 30.06.2021 14:59, Will Deacon wrote:
>> On Wed, Jun 30, 2021 at 02:48:15PM +0200, Marek Szyprowski wrote:
>>> On 08.06.2021 18:45, Amey Narkhede wrote:
>>>> If device registration fails, remove sysfs attribute
>>>> and if setting bus callbacks fails, unregister the device
>>>> and cleanup the sysfs attribute.
>>>>
>>>> Signed-off-by: Amey Narkhede <ameynarkhede03@gmail.com>
>>> This patch landed in linux-next some time ago as commit 249c9dc6aa0d
>>> ("iommu/arm: Cleanup resources in case of probe error path"). After
>>> bisecting and some manual searching I finally found that it is
>>> responsible for breaking s2idle on DragonBoard 410c. Here is the log
>>> (captured with no_console_suspend):
>>>
>>> # time rtcwake -s10 -mmem
>>> rtcwake: wakeup from "mem" using /dev/rtc0 at Thu Jan  1 00:02:13 1970
>>> PM: suspend entry (s2idle)
>>> Filesystems sync: 0.002 seconds
>>> Freezing user space processes ... (elapsed 0.006 seconds) done.
>>> OOM killer disabled.
>>> Freezing remaining freezable tasks ... (elapsed 0.004 seconds) done.
>>> Unable to handle kernel NULL pointer dereference at virtual address
>>> 0000000000000070
>>> Mem abort info:
>>>      ESR = 0x96000006
>>>      EC = 0x25: DABT (current EL), IL = 32 bits
>>>      SET = 0, FnV = 0
>>>      EA = 0, S1PTW = 0
>>>      FSC = 0x06: level 2 translation fault
>>> Data abort info:
>>>      ISV = 0, ISS = 0x00000006
>>>      CM = 0, WnR = 0
>>> user pgtable: 4k pages, 48-bit VAs, pgdp=000000008ad08000
>>> [0000000000000070] pgd=0800000085c3c003, p4d=0800000085c3c003,
>>> pud=0800000088dcf003, pmd=0000000000000000
>>> Internal error: Oops: 96000006 [#1] PREEMPT SMP
>>> Modules linked in: bluetooth ecdh_generic ecc rfkill ipv6 ax88796b
>>> venus_enc venus_dec videobuf2_dma_contig asix crct10dif_ce adv7511
>>> snd_soc_msm8916_analog qcom_spmi_temp_alarm rtc_pm8xxx qcom_pon
>>> qcom_camss qcom_spmi_vadc videobuf2_dma_sg qcom_vadc_common msm
>>> venus_core v4l2_fwnode v4l2_async snd_soc_msm8916_digital
>>> videobuf2_memops snd_soc_lpass_apq8016 snd_soc_lpass_cpu v4l2_mem2mem
>>> snd_soc_lpass_platform snd_soc_apq8016_sbc videobuf2_v4l2
>>> snd_soc_qcom_common qcom_rng videobuf2_common i2c_qcom_cci qnoc_msm8916
>>> videodev mc icc_smd_rpm mdt_loader socinfo display_connector rmtfs_mem
>>> CPU: 1 PID: 1522 Comm: rtcwake Not tainted 5.13.0-next-20210629 #3592
>>> Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
>>> pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
>>> pc : msm_runtime_suspend+0x1c/0x60 [msm]
>>> lr : msm_pm_suspend+0x18/0x38 [msm]
>>> ...
>>> Call trace:
>>>     msm_runtime_suspend+0x1c/0x60 [msm]
>>>     msm_pm_suspend+0x18/0x38 [msm]
>>>     dpm_run_callback+0x84/0x378
>> I wonder if we're missing a pm_runtime_disable() call on the failure path?
>> i.e. something like the diff below...
> 
> I've checked and it doesn't fix anything.

What's happened previously? Has an IOMMU actually failed to probe, or is 
this a fiddly "code movement unveils latent bug elsewhere" kind of 
thing? There doesn't look to be much capable of going wrong in 
msm_runtime_suspend() itself, so is the DRM driver also in a broken 
half-probed state where it's left its pm_runtime_ops behind without its 
drvdata being valid?

Robin.

> 
>> Will
>>
>> --->8
>>
>> diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
>> index 25ed444ff94d..ce8f354755d0 100644
>> --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c
>> +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
>> @@ -836,14 +836,14 @@ static int qcom_iommu_device_probe(struct platform_device *pdev)
>>           ret = devm_of_platform_populate(dev);
>>           if (ret) {
>>                   dev_err(dev, "Failed to populate iommu contexts\n");
>> -               return ret;
>> +               goto err_pm_disable;
>>           }
>>    
>>           ret = iommu_device_sysfs_add(&qcom_iommu->iommu, dev, NULL,
>>                                        dev_name(dev));
>>           if (ret) {
>>                   dev_err(dev, "Failed to register iommu in sysfs\n");
>> -               return ret;
>> +               goto err_pm_disable;
>>           }
>>    
>>           ret = iommu_device_register(&qcom_iommu->iommu, &qcom_iommu_ops, dev);
>> @@ -869,6 +869,9 @@ static int qcom_iommu_device_probe(struct platform_device *pdev)
>>    
>>    err_sysfs_remove:
>>           iommu_device_sysfs_remove(&qcom_iommu->iommu);
>> +
>> +err_pm_disable:
>> +       pm_runtime_disable(dev);
>>           return ret;
>>    }
>>
> Best regards
>
Krishna Reddy July 1, 2021, 12:13 a.m. UTC | #6
> +
> +err_pm_disable:
> +       pm_runtime_disable(dev);
>          return ret;
>   }

Should it be pm_runtime_force_suspend()?  qcom_iommu_device_remove() doesn't use pm_runtime_disable(dev).

875 static int qcom_iommu_device_remove(struct platform_device *pdev)
876 {
...
881 >-------pm_runtime_force_suspend(&pdev->dev);
882 >-------platform_set_drvdata(pdev, NULL);
883 >-------iommu_device_sysfs_remove(&qcom_iommu->iommu);
884 >-------iommu_device_unregister(&qcom_iommu->iommu);
...
887 }

-KR
Marek Szyprowski July 1, 2021, 8:29 a.m. UTC | #7
Hi Robin,

On 30.06.2021 16:01, Robin Murphy wrote:
> On 2021-06-30 14:48, Marek Szyprowski wrote:
>> On 30.06.2021 14:59, Will Deacon wrote:
>>> On Wed, Jun 30, 2021 at 02:48:15PM +0200, Marek Szyprowski wrote:
>>>> On 08.06.2021 18:45, Amey Narkhede wrote:
>>>>> If device registration fails, remove sysfs attribute
>>>>> and if setting bus callbacks fails, unregister the device
>>>>> and cleanup the sysfs attribute.
>>>>>
>>>>> Signed-off-by: Amey Narkhede <ameynarkhede03@gmail.com>
>>>> This patch landed in linux-next some time ago as commit 249c9dc6aa0d
>>>> ("iommu/arm: Cleanup resources in case of probe error path"). After
>>>> bisecting and some manual searching I finally found that it is
>>>> responsible for breaking s2idle on DragonBoard 410c. Here is the log
>>>> (captured with no_console_suspend):
>>>>
>>>> # time rtcwake -s10 -mmem
>>>> rtcwake: wakeup from "mem" using /dev/rtc0 at Thu Jan  1 00:02:13 1970
>>>> PM: suspend entry (s2idle)
>>>> Filesystems sync: 0.002 seconds
>>>> Freezing user space processes ... (elapsed 0.006 seconds) done.
>>>> OOM killer disabled.
>>>> Freezing remaining freezable tasks ... (elapsed 0.004 seconds) done.
>>>> Unable to handle kernel NULL pointer dereference at virtual address
>>>> 0000000000000070
>>>> Mem abort info:
>>>>      ESR = 0x96000006
>>>>      EC = 0x25: DABT (current EL), IL = 32 bits
>>>>      SET = 0, FnV = 0
>>>>      EA = 0, S1PTW = 0
>>>>      FSC = 0x06: level 2 translation fault
>>>> Data abort info:
>>>>      ISV = 0, ISS = 0x00000006
>>>>      CM = 0, WnR = 0
>>>> user pgtable: 4k pages, 48-bit VAs, pgdp=000000008ad08000
>>>> [0000000000000070] pgd=0800000085c3c003, p4d=0800000085c3c003,
>>>> pud=0800000088dcf003, pmd=0000000000000000
>>>> Internal error: Oops: 96000006 [#1] PREEMPT SMP
>>>> Modules linked in: bluetooth ecdh_generic ecc rfkill ipv6 ax88796b
>>>> venus_enc venus_dec videobuf2_dma_contig asix crct10dif_ce adv7511
>>>> snd_soc_msm8916_analog qcom_spmi_temp_alarm rtc_pm8xxx qcom_pon
>>>> qcom_camss qcom_spmi_vadc videobuf2_dma_sg qcom_vadc_common msm
>>>> venus_core v4l2_fwnode v4l2_async snd_soc_msm8916_digital
>>>> videobuf2_memops snd_soc_lpass_apq8016 snd_soc_lpass_cpu v4l2_mem2mem
>>>> snd_soc_lpass_platform snd_soc_apq8016_sbc videobuf2_v4l2
>>>> snd_soc_qcom_common qcom_rng videobuf2_common i2c_qcom_cci 
>>>> qnoc_msm8916
>>>> videodev mc icc_smd_rpm mdt_loader socinfo display_connector rmtfs_mem
>>>> CPU: 1 PID: 1522 Comm: rtcwake Not tainted 5.13.0-next-20210629 #3592
>>>> Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
>>>> pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
>>>> pc : msm_runtime_suspend+0x1c/0x60 [msm]
>>>> lr : msm_pm_suspend+0x18/0x38 [msm]
>>>> ...
>>>> Call trace:
>>>>     msm_runtime_suspend+0x1c/0x60 [msm]
>>>>     msm_pm_suspend+0x18/0x38 [msm]
>>>>     dpm_run_callback+0x84/0x378
>>> I wonder if we're missing a pm_runtime_disable() call on the failure 
>>> path?
>>> i.e. something like the diff below...
>>
>> I've checked and it doesn't fix anything.
>
> What's happened previously? Has an IOMMU actually failed to probe, or 
> is this a fiddly "code movement unveils latent bug elsewhere" kind of 
> thing? There doesn't look to be much capable of going wrong in 
> msm_runtime_suspend() itself, so is the DRM driver also in a broken 
> half-probed state where it's left its pm_runtime_ops behind without 
> its drvdata being valid?
>
I finally had some time to analyze this issue. It turned out that with 
this patch, iommu fails to probe for soc:iommu@1f08000 device, while it 
worked fine before. This happens because this patch adds a check for the 
return value of the bus_set_iommu() in 
drivers/iommu/arm/arm-smmu/qcom_iommu.c. When I removed that check, it 
probes successfully again. It looks that there are already iommu ops 
registered for platform bus, before qcom_iommu probes. On the other 
hand, if I remember correctly they are not used during the device 
registration, but they are needed for some legacy stuff. I can send a 
patch restoring old code flow if you think that this is a right solution.

Best regards
Will Deacon July 1, 2021, 9:01 a.m. UTC | #8
On Thu, Jul 01, 2021 at 10:29:29AM +0200, Marek Szyprowski wrote:
> Hi Robin,
> 
> On 30.06.2021 16:01, Robin Murphy wrote:
> > On 2021-06-30 14:48, Marek Szyprowski wrote:
> >> On 30.06.2021 14:59, Will Deacon wrote:
> >>> On Wed, Jun 30, 2021 at 02:48:15PM +0200, Marek Szyprowski wrote:
> >>>> On 08.06.2021 18:45, Amey Narkhede wrote:
> >>>>> If device registration fails, remove sysfs attribute
> >>>>> and if setting bus callbacks fails, unregister the device
> >>>>> and cleanup the sysfs attribute.
> >>>>>
> >>>>> Signed-off-by: Amey Narkhede <ameynarkhede03@gmail.com>
> >>>> This patch landed in linux-next some time ago as commit 249c9dc6aa0d
> >>>> ("iommu/arm: Cleanup resources in case of probe error path"). After
> >>>> bisecting and some manual searching I finally found that it is
> >>>> responsible for breaking s2idle on DragonBoard 410c. Here is the log
> >>>> (captured with no_console_suspend):
> >>>>
> >>>> # time rtcwake -s10 -mmem
> >>>> rtcwake: wakeup from "mem" using /dev/rtc0 at Thu Jan  1 00:02:13 1970
> >>>> PM: suspend entry (s2idle)
> >>>> Filesystems sync: 0.002 seconds
> >>>> Freezing user space processes ... (elapsed 0.006 seconds) done.
> >>>> OOM killer disabled.
> >>>> Freezing remaining freezable tasks ... (elapsed 0.004 seconds) done.
> >>>> Unable to handle kernel NULL pointer dereference at virtual address
> >>>> 0000000000000070
> >>>> Mem abort info:
> >>>>      ESR = 0x96000006
> >>>>      EC = 0x25: DABT (current EL), IL = 32 bits
> >>>>      SET = 0, FnV = 0
> >>>>      EA = 0, S1PTW = 0
> >>>>      FSC = 0x06: level 2 translation fault
> >>>> Data abort info:
> >>>>      ISV = 0, ISS = 0x00000006
> >>>>      CM = 0, WnR = 0
> >>>> user pgtable: 4k pages, 48-bit VAs, pgdp=000000008ad08000
> >>>> [0000000000000070] pgd=0800000085c3c003, p4d=0800000085c3c003,
> >>>> pud=0800000088dcf003, pmd=0000000000000000
> >>>> Internal error: Oops: 96000006 [#1] PREEMPT SMP
> >>>> Modules linked in: bluetooth ecdh_generic ecc rfkill ipv6 ax88796b
> >>>> venus_enc venus_dec videobuf2_dma_contig asix crct10dif_ce adv7511
> >>>> snd_soc_msm8916_analog qcom_spmi_temp_alarm rtc_pm8xxx qcom_pon
> >>>> qcom_camss qcom_spmi_vadc videobuf2_dma_sg qcom_vadc_common msm
> >>>> venus_core v4l2_fwnode v4l2_async snd_soc_msm8916_digital
> >>>> videobuf2_memops snd_soc_lpass_apq8016 snd_soc_lpass_cpu v4l2_mem2mem
> >>>> snd_soc_lpass_platform snd_soc_apq8016_sbc videobuf2_v4l2
> >>>> snd_soc_qcom_common qcom_rng videobuf2_common i2c_qcom_cci 
> >>>> qnoc_msm8916
> >>>> videodev mc icc_smd_rpm mdt_loader socinfo display_connector rmtfs_mem
> >>>> CPU: 1 PID: 1522 Comm: rtcwake Not tainted 5.13.0-next-20210629 #3592
> >>>> Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
> >>>> pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
> >>>> pc : msm_runtime_suspend+0x1c/0x60 [msm]
> >>>> lr : msm_pm_suspend+0x18/0x38 [msm]
> >>>> ...
> >>>> Call trace:
> >>>>     msm_runtime_suspend+0x1c/0x60 [msm]
> >>>>     msm_pm_suspend+0x18/0x38 [msm]
> >>>>     dpm_run_callback+0x84/0x378
> >>> I wonder if we're missing a pm_runtime_disable() call on the failure 
> >>> path?
> >>> i.e. something like the diff below...
> >>
> >> I've checked and it doesn't fix anything.
> >
> > What's happened previously? Has an IOMMU actually failed to probe, or 
> > is this a fiddly "code movement unveils latent bug elsewhere" kind of 
> > thing? There doesn't look to be much capable of going wrong in 
> > msm_runtime_suspend() itself, so is the DRM driver also in a broken 
> > half-probed state where it's left its pm_runtime_ops behind without 
> > its drvdata being valid?
> >
> I finally had some time to analyze this issue. It turned out that with 
> this patch, iommu fails to probe for soc:iommu@1f08000 device, while it 
> worked fine before. This happens because this patch adds a check for the 
> return value of the bus_set_iommu() in 
> drivers/iommu/arm/arm-smmu/qcom_iommu.c. When I removed that check, it 
> probes successfully again. It looks that there are already iommu ops 
> registered for platform bus, before qcom_iommu probes. On the other 
> hand, if I remember correctly they are not used during the device 
> registration, but they are needed for some legacy stuff. I can send a 
> patch restoring old code flow if you think that this is a right solution.

Yes, let's just revert the qcom_iommu.c changes from that patch for now.
The pm runtime stuff looks dodgy anyway so I think this needs more thought.

Will
Robin Murphy July 1, 2021, 9:11 a.m. UTC | #9
On 2021-07-01 10:01, Will Deacon wrote:
> On Thu, Jul 01, 2021 at 10:29:29AM +0200, Marek Szyprowski wrote:
>> Hi Robin,
>>
>> On 30.06.2021 16:01, Robin Murphy wrote:
>>> On 2021-06-30 14:48, Marek Szyprowski wrote:
>>>> On 30.06.2021 14:59, Will Deacon wrote:
>>>>> On Wed, Jun 30, 2021 at 02:48:15PM +0200, Marek Szyprowski wrote:
>>>>>> On 08.06.2021 18:45, Amey Narkhede wrote:
>>>>>>> If device registration fails, remove sysfs attribute
>>>>>>> and if setting bus callbacks fails, unregister the device
>>>>>>> and cleanup the sysfs attribute.
>>>>>>>
>>>>>>> Signed-off-by: Amey Narkhede <ameynarkhede03@gmail.com>
>>>>>> This patch landed in linux-next some time ago as commit 249c9dc6aa0d
>>>>>> ("iommu/arm: Cleanup resources in case of probe error path"). After
>>>>>> bisecting and some manual searching I finally found that it is
>>>>>> responsible for breaking s2idle on DragonBoard 410c. Here is the log
>>>>>> (captured with no_console_suspend):
>>>>>>
>>>>>> # time rtcwake -s10 -mmem
>>>>>> rtcwake: wakeup from "mem" using /dev/rtc0 at Thu Jan� 1 00:02:13 1970
>>>>>> PM: suspend entry (s2idle)
>>>>>> Filesystems sync: 0.002 seconds
>>>>>> Freezing user space processes ... (elapsed 0.006 seconds) done.
>>>>>> OOM killer disabled.
>>>>>> Freezing remaining freezable tasks ... (elapsed 0.004 seconds) done.
>>>>>> Unable to handle kernel NULL pointer dereference at virtual address
>>>>>> 0000000000000070
>>>>>> Mem abort info:
>>>>>> �� � ESR = 0x96000006
>>>>>> �� � EC = 0x25: DABT (current EL), IL = 32 bits
>>>>>> �� � SET = 0, FnV = 0
>>>>>> �� � EA = 0, S1PTW = 0
>>>>>> �� � FSC = 0x06: level 2 translation fault
>>>>>> Data abort info:
>>>>>> �� � ISV = 0, ISS = 0x00000006
>>>>>> �� � CM = 0, WnR = 0
>>>>>> user pgtable: 4k pages, 48-bit VAs, pgdp=000000008ad08000
>>>>>> [0000000000000070] pgd=0800000085c3c003, p4d=0800000085c3c003,
>>>>>> pud=0800000088dcf003, pmd=0000000000000000
>>>>>> Internal error: Oops: 96000006 [#1] PREEMPT SMP
>>>>>> Modules linked in: bluetooth ecdh_generic ecc rfkill ipv6 ax88796b
>>>>>> venus_enc venus_dec videobuf2_dma_contig asix crct10dif_ce adv7511
>>>>>> snd_soc_msm8916_analog qcom_spmi_temp_alarm rtc_pm8xxx qcom_pon
>>>>>> qcom_camss qcom_spmi_vadc videobuf2_dma_sg qcom_vadc_common msm
>>>>>> venus_core v4l2_fwnode v4l2_async snd_soc_msm8916_digital
>>>>>> videobuf2_memops snd_soc_lpass_apq8016 snd_soc_lpass_cpu v4l2_mem2mem
>>>>>> snd_soc_lpass_platform snd_soc_apq8016_sbc videobuf2_v4l2
>>>>>> snd_soc_qcom_common qcom_rng videobuf2_common i2c_qcom_cci
>>>>>> qnoc_msm8916
>>>>>> videodev mc icc_smd_rpm mdt_loader socinfo display_connector rmtfs_mem
>>>>>> CPU: 1 PID: 1522 Comm: rtcwake Not tainted 5.13.0-next-20210629 #3592
>>>>>> Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
>>>>>> pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
>>>>>> pc : msm_runtime_suspend+0x1c/0x60 [msm]
>>>>>> lr : msm_pm_suspend+0x18/0x38 [msm]
>>>>>> ...
>>>>>> Call trace:
>>>>>> �� �msm_runtime_suspend+0x1c/0x60 [msm]
>>>>>> �� �msm_pm_suspend+0x18/0x38 [msm]
>>>>>> �� �dpm_run_callback+0x84/0x378
>>>>> I wonder if we're missing a pm_runtime_disable() call on the failure
>>>>> path?
>>>>> i.e. something like the diff below...
>>>>
>>>> I've checked and it doesn't fix anything.
>>>
>>> What's happened previously? Has an IOMMU actually failed to probe, or
>>> is this a fiddly "code movement unveils latent bug elsewhere" kind of
>>> thing? There doesn't look to be much capable of going wrong in
>>> msm_runtime_suspend() itself, so is the DRM driver also in a broken
>>> half-probed state where it's left its pm_runtime_ops behind without
>>> its drvdata being valid?
>>>
>> I finally had some time to analyze this issue. It turned out that with
>> this patch, iommu fails to probe for soc:iommu@1f08000 device, while it
>> worked fine before. This happens because this patch adds a check for the
>> return value of the bus_set_iommu() in
>> drivers/iommu/arm/arm-smmu/qcom_iommu.c. When I removed that check, it
>> probes successfully again. It looks that there are already iommu ops
>> registered for platform bus, before qcom_iommu probes. On the other
>> hand, if I remember correctly they are not used during the device
>> registration, but they are needed for some legacy stuff. I can send a
>> patch restoring old code flow if you think that this is a right solution.
> 
> Yes, let's just revert the qcom_iommu.c changes from that patch for now.
> The pm runtime stuff looks dodgy anyway so I think this needs more thought.

Oh, right, blindly returning the -EBUSY from bus_set_iommu() because 
we're not the first instance to probe is definitely the wrong thing to 
do as well. It's still not clear why failing makes the DRM driver fall 
over, but +1 to qcom-iommu needing some deeper consideration.

Robin.
Marek Szyprowski July 1, 2021, 9:26 a.m. UTC | #10
On 01.07.2021 11:11, Robin Murphy wrote:
> On 2021-07-01 10:01, Will Deacon wrote:
>> On Thu, Jul 01, 2021 at 10:29:29AM +0200, Marek Szyprowski wrote:
>>> Hi Robin,
>>>
>>> On 30.06.2021 16:01, Robin Murphy wrote:
>>>> On 2021-06-30 14:48, Marek Szyprowski wrote:
>>>>> On 30.06.2021 14:59, Will Deacon wrote:
>>>>>> On Wed, Jun 30, 2021 at 02:48:15PM +0200, Marek Szyprowski wrote:
>>>>>>> On 08.06.2021 18:45, Amey Narkhede wrote:
>>>>>>>> If device registration fails, remove sysfs attribute
>>>>>>>> and if setting bus callbacks fails, unregister the device
>>>>>>>> and cleanup the sysfs attribute.
>>>>>>>>
>>>>>>>> Signed-off-by: Amey Narkhede <ameynarkhede03@gmail.com>
>>>>>>> This patch landed in linux-next some time ago as commit 
>>>>>>> 249c9dc6aa0d
>>>>>>> ("iommu/arm: Cleanup resources in case of probe error path"). After
>>>>>>> bisecting and some manual searching I finally found that it is
>>>>>>> responsible for breaking s2idle on DragonBoard 410c. Here is the 
>>>>>>> log
>>>>>>> (captured with no_console_suspend):
>>>>>>>
>>>>>>> # time rtcwake -s10 -mmem
>>>>>>> rtcwake: wakeup from "mem" using /dev/rtc0 at Thu Jan� 1 
>>>>>>> 00:02:13 1970
>>>>>>> PM: suspend entry (s2idle)
>>>>>>> Filesystems sync: 0.002 seconds
>>>>>>> Freezing user space processes ... (elapsed 0.006 seconds) done.
>>>>>>> OOM killer disabled.
>>>>>>> Freezing remaining freezable tasks ... (elapsed 0.004 seconds) 
>>>>>>> done.
>>>>>>> Unable to handle kernel NULL pointer dereference at virtual address
>>>>>>> 0000000000000070
>>>>>>> Mem abort info:
>>>>>>> �� � ESR = 0x96000006
>>>>>>> �� � EC = 0x25: DABT (current EL), IL = 32 bits
>>>>>>> �� � SET = 0, FnV = 0
>>>>>>> �� � EA = 0, S1PTW = 0
>>>>>>> �� � FSC = 0x06: level 2 translation fault
>>>>>>> Data abort info:
>>>>>>> �� � ISV = 0, ISS = 0x00000006
>>>>>>> �� � CM = 0, WnR = 0
>>>>>>> user pgtable: 4k pages, 48-bit VAs, pgdp=000000008ad08000
>>>>>>> [0000000000000070] pgd=0800000085c3c003, p4d=0800000085c3c003,
>>>>>>> pud=0800000088dcf003, pmd=0000000000000000
>>>>>>> Internal error: Oops: 96000006 [#1] PREEMPT SMP
>>>>>>> Modules linked in: bluetooth ecdh_generic ecc rfkill ipv6 ax88796b
>>>>>>> venus_enc venus_dec videobuf2_dma_contig asix crct10dif_ce adv7511
>>>>>>> snd_soc_msm8916_analog qcom_spmi_temp_alarm rtc_pm8xxx qcom_pon
>>>>>>> qcom_camss qcom_spmi_vadc videobuf2_dma_sg qcom_vadc_common msm
>>>>>>> venus_core v4l2_fwnode v4l2_async snd_soc_msm8916_digital
>>>>>>> videobuf2_memops snd_soc_lpass_apq8016 snd_soc_lpass_cpu 
>>>>>>> v4l2_mem2mem
>>>>>>> snd_soc_lpass_platform snd_soc_apq8016_sbc videobuf2_v4l2
>>>>>>> snd_soc_qcom_common qcom_rng videobuf2_common i2c_qcom_cci
>>>>>>> qnoc_msm8916
>>>>>>> videodev mc icc_smd_rpm mdt_loader socinfo display_connector 
>>>>>>> rmtfs_mem
>>>>>>> CPU: 1 PID: 1522 Comm: rtcwake Not tainted 5.13.0-next-20210629 
>>>>>>> #3592
>>>>>>> Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
>>>>>>> pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
>>>>>>> pc : msm_runtime_suspend+0x1c/0x60 [msm]
>>>>>>> lr : msm_pm_suspend+0x18/0x38 [msm]
>>>>>>> ...
>>>>>>> Call trace:
>>>>>>> �� �msm_runtime_suspend+0x1c/0x60 [msm]
>>>>>>> �� �msm_pm_suspend+0x18/0x38 [msm]
>>>>>>> �� �dpm_run_callback+0x84/0x378
>>>>>> I wonder if we're missing a pm_runtime_disable() call on the failure
>>>>>> path?
>>>>>> i.e. something like the diff below...
>>>>>
>>>>> I've checked and it doesn't fix anything.
>>>>
>>>> What's happened previously? Has an IOMMU actually failed to probe, or
>>>> is this a fiddly "code movement unveils latent bug elsewhere" kind of
>>>> thing? There doesn't look to be much capable of going wrong in
>>>> msm_runtime_suspend() itself, so is the DRM driver also in a broken
>>>> half-probed state where it's left its pm_runtime_ops behind without
>>>> its drvdata being valid?
>>>>
>>> I finally had some time to analyze this issue. It turned out that with
>>> this patch, iommu fails to probe for soc:iommu@1f08000 device, while it
>>> worked fine before. This happens because this patch adds a check for 
>>> the
>>> return value of the bus_set_iommu() in
>>> drivers/iommu/arm/arm-smmu/qcom_iommu.c. When I removed that check, it
>>> probes successfully again. It looks that there are already iommu ops
>>> registered for platform bus, before qcom_iommu probes. On the other
>>> hand, if I remember correctly they are not used during the device
>>> registration, but they are needed for some legacy stuff. I can send a
>>> patch restoring old code flow if you think that this is a right 
>>> solution.
>>
>> Yes, let's just revert the qcom_iommu.c changes from that patch for now.
>> The pm runtime stuff looks dodgy anyway so I think this needs more 
>> thought.
>
> Oh, right, blindly returning the -EBUSY from bus_set_iommu() because 
> we're not the first instance to probe is definitely the wrong thing to 
> do as well. It's still not clear why failing makes the DRM driver fall 
> over, but +1 to qcom-iommu needing some deeper consideration.

I've just checked and bus_set_iommu() is called for every 
'qcom,msm-iommu-v1' device in the system, thus it fails for the second 
and next devices.

Best regards
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 54b2f27b81d4..de2499754025 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3669,10 +3669,20 @@  static int arm_smmu_device_probe(struct platform_device *pdev)
 	ret = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev);
 	if (ret) {
 		dev_err(dev, "Failed to register iommu\n");
-		return ret;
+		goto err_sysfs_remove;
 	}

-	return arm_smmu_set_bus_ops(&arm_smmu_ops);
+	ret = arm_smmu_set_bus_ops(&arm_smmu_ops);
+	if (ret)
+		goto err_unregister_device;
+
+	return 0;
+
+err_unregister_device:
+	iommu_device_unregister(&smmu->iommu);
+err_sysfs_remove:
+	iommu_device_sysfs_remove(&smmu->iommu);
+	return ret;
 }

 static int arm_smmu_device_remove(struct platform_device *pdev)
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 6f72c4d208ca..88a3023676ce 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -2164,7 +2164,7 @@  static int arm_smmu_device_probe(struct platform_device *pdev)
 	err = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev);
 	if (err) {
 		dev_err(dev, "Failed to register iommu\n");
-		return err;
+		goto err_sysfs_remove;
 	}

 	platform_set_drvdata(pdev, smmu);
@@ -2187,10 +2187,19 @@  static int arm_smmu_device_probe(struct platform_device *pdev)
 	 * any device which might need it, so we want the bus ops in place
 	 * ready to handle default domain setup as soon as any SMMU exists.
 	 */
-	if (!using_legacy_binding)
-		return arm_smmu_bus_init(&arm_smmu_ops);
+	if (!using_legacy_binding) {
+		err = arm_smmu_bus_init(&arm_smmu_ops);
+		if (err)
+			goto err_unregister_device;
+	}

 	return 0;
+
+err_unregister_device:
+	iommu_device_unregister(&smmu->iommu);
+err_sysfs_remove:
+	iommu_device_sysfs_remove(&smmu->iommu);
+	return err;
 }

 static int arm_smmu_device_remove(struct platform_device *pdev)
diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
index 4294abe389b2..b785d9fb7602 100644
--- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c
+++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
@@ -850,10 +850,12 @@  static int qcom_iommu_device_probe(struct platform_device *pdev)
 	ret = iommu_device_register(&qcom_iommu->iommu, &qcom_iommu_ops, dev);
 	if (ret) {
 		dev_err(dev, "Failed to register iommu\n");
-		return ret;
+		goto err_sysfs_remove;
 	}

-	bus_set_iommu(&platform_bus_type, &qcom_iommu_ops);
+	ret = bus_set_iommu(&platform_bus_type, &qcom_iommu_ops);
+	if (ret)
+		goto err_unregister_device;

 	if (qcom_iommu->local_base) {
 		pm_runtime_get_sync(dev);
@@ -862,6 +864,13 @@  static int qcom_iommu_device_probe(struct platform_device *pdev)
 	}

 	return 0;
+
+err_unregister_device:
+	iommu_device_unregister(&qcom_iommu->iommu);
+
+err_sysfs_remove:
+	iommu_device_sysfs_remove(&qcom_iommu->iommu);
+	return ret;
 }

 static int qcom_iommu_device_remove(struct platform_device *pdev)