diff mbox series

coresight: cti: write regsiters directly in cti_enable_hw()

Message ID 20200818111057.19755-3-tingwei@codeaurora.org (mailing list archive)
State New, archived
Headers show
Series coresight: cti: write regsiters directly in cti_enable_hw() | expand

Commit Message

Tingwei Zhang Aug. 18, 2020, 11:10 a.m. UTC
Deadlock as below is triggered by one CPU holds drvdata->spinlock
and calls cti_enable_hw(). Smp_call_function_single() is called
in cti_enable_hw() and tries to let another CPU write CTI registers.
That CPU is trying to get drvdata->spinlock in cti_cpu_pm_notify()
and doesn't response to IPI from smp_call_function_single().

[  988.335937] CPU: 6 PID: 10258 Comm: sh Tainted: G        W    L
5.8.0-rc6-mainline-16783-gc38daa79b26b-dirty #1
[  988.346364] Hardware name: Thundercomm Dragonboard 845c (DT)
[  988.352073] pstate: 20400005 (nzCv daif +PAN -UAO BTYPE=--)
[  988.357689] pc : smp_call_function_single+0x158/0x1b8
[  988.362782] lr : smp_call_function_single+0x124/0x1b8
...
[  988.451638] Call trace:
[  988.454119]  smp_call_function_single+0x158/0x1b8
[  988.458866]  cti_enable+0xb4/0xf8 [coresight_cti]
[  988.463618]  coresight_control_assoc_ectdev+0x6c/0x128 [coresight]
[  988.469855]  coresight_enable+0x1f0/0x364 [coresight]
[  988.474957]  enable_source_store+0x5c/0x9c [coresight]
[  988.480140]  dev_attr_store+0x14/0x28
[  988.483839]  sysfs_kf_write+0x38/0x4c
[  988.487532]  kernfs_fop_write+0x1c0/0x2b0
[  988.491585]  vfs_write+0xfc/0x300
[  988.494931]  ksys_write+0x78/0xe0
[  988.498283]  __arm64_sys_write+0x18/0x20
[  988.502240]  el0_svc_common+0x98/0x160
[  988.506024]  do_el0_svc+0x78/0x80
[  988.509377]  el0_sync_handler+0xd4/0x270
[  988.513337]  el0_sync+0x164/0x180

This change write CTI registers directly in cti_enable_hw().
Config->hw_powered has been checked to be true with spinlock holded.
CTI is powered and can be programmed until spinlock is released.

Fixes: 6a0953ce7de9 ("coresight: cti: Add CPU idle pm notifer to CTI devices")
Signed-off-by: Tingwei Zhang <tingwei@codeaurora.org>
---
 drivers/hwtracing/coresight/coresight-cti.c | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

Comments

Mike Leach Aug. 20, 2020, 8:06 a.m. UTC | #1
Hi Tingwei

On Tue, 18 Aug 2020 at 12:11, Tingwei Zhang <tingwei@codeaurora.org> wrote:
>
> Deadlock as below is triggered by one CPU holds drvdata->spinlock
> and calls cti_enable_hw(). Smp_call_function_single() is called
> in cti_enable_hw() and tries to let another CPU write CTI registers.
> That CPU is trying to get drvdata->spinlock in cti_cpu_pm_notify()
> and doesn't response to IPI from smp_call_function_single().
>
> [  988.335937] CPU: 6 PID: 10258 Comm: sh Tainted: G        W    L
> 5.8.0-rc6-mainline-16783-gc38daa79b26b-dirty #1
> [  988.346364] Hardware name: Thundercomm Dragonboard 845c (DT)
> [  988.352073] pstate: 20400005 (nzCv daif +PAN -UAO BTYPE=--)
> [  988.357689] pc : smp_call_function_single+0x158/0x1b8
> [  988.362782] lr : smp_call_function_single+0x124/0x1b8
> ...
> [  988.451638] Call trace:
> [  988.454119]  smp_call_function_single+0x158/0x1b8
> [  988.458866]  cti_enable+0xb4/0xf8 [coresight_cti]
> [  988.463618]  coresight_control_assoc_ectdev+0x6c/0x128 [coresight]
> [  988.469855]  coresight_enable+0x1f0/0x364 [coresight]
> [  988.474957]  enable_source_store+0x5c/0x9c [coresight]
> [  988.480140]  dev_attr_store+0x14/0x28
> [  988.483839]  sysfs_kf_write+0x38/0x4c
> [  988.487532]  kernfs_fop_write+0x1c0/0x2b0
> [  988.491585]  vfs_write+0xfc/0x300
> [  988.494931]  ksys_write+0x78/0xe0
> [  988.498283]  __arm64_sys_write+0x18/0x20
> [  988.502240]  el0_svc_common+0x98/0x160
> [  988.506024]  do_el0_svc+0x78/0x80
> [  988.509377]  el0_sync_handler+0xd4/0x270
> [  988.513337]  el0_sync+0x164/0x180
>
> This change write CTI registers directly in cti_enable_hw().
> Config->hw_powered has been checked to be true with spinlock holded.
> CTI is powered and can be programmed until spinlock is released.
>

Agreed.

Reviewed-by Mike Leach <mike.leach@linaro.org>

> Fixes: 6a0953ce7de9 ("coresight: cti: Add CPU idle pm notifer to CTI devices")
> Signed-off-by: Tingwei Zhang <tingwei@codeaurora.org>
> ---
>  drivers/hwtracing/coresight/coresight-cti.c | 17 +----------------
>  1 file changed, 1 insertion(+), 16 deletions(-)
>
> diff --git a/drivers/hwtracing/coresight/coresight-cti.c b/drivers/hwtracing/coresight/coresight-cti.c
> index 3ccc703dc940..869569eb8c7f 100644
> --- a/drivers/hwtracing/coresight/coresight-cti.c
> +++ b/drivers/hwtracing/coresight/coresight-cti.c
> @@ -86,13 +86,6 @@ void cti_write_all_hw_regs(struct cti_drvdata *drvdata)
>         CS_LOCK(drvdata->base);
>  }
>
> -static void cti_enable_hw_smp_call(void *info)
> -{
> -       struct cti_drvdata *drvdata = info;
> -
> -       cti_write_all_hw_regs(drvdata);
> -}
> -
>  /* write regs to hardware and enable */
>  static int cti_enable_hw(struct cti_drvdata *drvdata)
>  {
> @@ -112,15 +105,7 @@ static int cti_enable_hw(struct cti_drvdata *drvdata)
>         if (rc)
>                 goto cti_err_not_enabled;
>
> -       if (drvdata->ctidev.cpu >= 0) {
> -               rc = smp_call_function_single(drvdata->ctidev.cpu,
> -                                             cti_enable_hw_smp_call,
> -                                             drvdata, 1);
> -               if (rc)
> -                       goto cti_err_not_enabled;
> -       } else {
> -               cti_write_all_hw_regs(drvdata);
> -       }
> +       cti_write_all_hw_regs(drvdata);
>
>         config->hw_enabled = true;
>         atomic_inc(&drvdata->config.enable_req_count);
> --
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
>
> _______________________________________________
> CoreSight mailing list
> CoreSight@lists.linaro.org
> https://lists.linaro.org/mailman/listinfo/coresight
Mathieu Poirier Aug. 27, 2020, 6:12 p.m. UTC | #2
Hi Tingwei,

On Tue, Aug 18, 2020 at 07:10:57PM +0800, Tingwei Zhang wrote:
> Deadlock as below is triggered by one CPU holds drvdata->spinlock
> and calls cti_enable_hw(). Smp_call_function_single() is called
> in cti_enable_hw() and tries to let another CPU write CTI registers.
> That CPU is trying to get drvdata->spinlock in cti_cpu_pm_notify()
> and doesn't response to IPI from smp_call_function_single().
> 
> [  988.335937] CPU: 6 PID: 10258 Comm: sh Tainted: G        W    L
> 5.8.0-rc6-mainline-16783-gc38daa79b26b-dirty #1
> [  988.346364] Hardware name: Thundercomm Dragonboard 845c (DT)
> [  988.352073] pstate: 20400005 (nzCv daif +PAN -UAO BTYPE=--)
> [  988.357689] pc : smp_call_function_single+0x158/0x1b8
> [  988.362782] lr : smp_call_function_single+0x124/0x1b8
> ...
> [  988.451638] Call trace:
> [  988.454119]  smp_call_function_single+0x158/0x1b8
> [  988.458866]  cti_enable+0xb4/0xf8 [coresight_cti]
> [  988.463618]  coresight_control_assoc_ectdev+0x6c/0x128 [coresight]
> [  988.469855]  coresight_enable+0x1f0/0x364 [coresight]
> [  988.474957]  enable_source_store+0x5c/0x9c [coresight]
> [  988.480140]  dev_attr_store+0x14/0x28
> [  988.483839]  sysfs_kf_write+0x38/0x4c
> [  988.487532]  kernfs_fop_write+0x1c0/0x2b0
> [  988.491585]  vfs_write+0xfc/0x300
> [  988.494931]  ksys_write+0x78/0xe0
> [  988.498283]  __arm64_sys_write+0x18/0x20
> [  988.502240]  el0_svc_common+0x98/0x160
> [  988.506024]  do_el0_svc+0x78/0x80
> [  988.509377]  el0_sync_handler+0xd4/0x270
> [  988.513337]  el0_sync+0x164/0x180
> 

Was this the full log or you did cut some of it?

> This change write CTI registers directly in cti_enable_hw().
> Config->hw_powered has been checked to be true with spinlock holded.
> CTI is powered and can be programmed until spinlock is released.
> 

From your explanation above it seems that cti_enable_hw() was called from, say
CPUy, to enable the CTI associated to CPUx.  CTIx's drvdata->spinlock was taken
and smp_call_function_single() called right after.  That woke up CPUx and
cti_cpu_pm_notify() was executed on CPUx in interrupt context, trying to take
CTIx's drvdata->spinlock.  That hung CPUx and the kernel got angry.  Is my
assessment correct?

If so I don't think the fix suggested in this patch will work.  The same
condition will happen whenever cti_enable_hw() is called on a CPU to enable a
CTI that belongs to another CPU and that cti_cpu_pm_notify() is called on latter
CPU at the same time.

I think a better solution is to grab the lock in cti_enable_hw() and check the
value of ->ctidev.cpu.  If not a global CPU, i.e >= 0, then release the lock and
call smp_call_function_single().  In cti_enable_hw_smp_call() take the lock
again and move forward from there. 

I have applied the other two patches in this set so no need to send them again.

Thanks,
Mathieu

> Fixes: 6a0953ce7de9 ("coresight: cti: Add CPU idle pm notifer to CTI devices")
> Signed-off-by: Tingwei Zhang <tingwei@codeaurora.org>
> ---
>  drivers/hwtracing/coresight/coresight-cti.c | 17 +----------------
>  1 file changed, 1 insertion(+), 16 deletions(-)
> 
> diff --git a/drivers/hwtracing/coresight/coresight-cti.c b/drivers/hwtracing/coresight/coresight-cti.c
> index 3ccc703dc940..869569eb8c7f 100644
> --- a/drivers/hwtracing/coresight/coresight-cti.c
> +++ b/drivers/hwtracing/coresight/coresight-cti.c
> @@ -86,13 +86,6 @@ void cti_write_all_hw_regs(struct cti_drvdata *drvdata)
>  	CS_LOCK(drvdata->base);
>  }
>  
> -static void cti_enable_hw_smp_call(void *info)
> -{
> -	struct cti_drvdata *drvdata = info;
> -
> -	cti_write_all_hw_regs(drvdata);
> -}
> -
>  /* write regs to hardware and enable */
>  static int cti_enable_hw(struct cti_drvdata *drvdata)
>  {
> @@ -112,15 +105,7 @@ static int cti_enable_hw(struct cti_drvdata *drvdata)
>  	if (rc)
>  		goto cti_err_not_enabled;
>  
> -	if (drvdata->ctidev.cpu >= 0) {
> -		rc = smp_call_function_single(drvdata->ctidev.cpu,
> -					      cti_enable_hw_smp_call,
> -					      drvdata, 1);
> -		if (rc)
> -			goto cti_err_not_enabled;
> -	} else {
> -		cti_write_all_hw_regs(drvdata);
> -	}
> +	cti_write_all_hw_regs(drvdata);
>  
>  	config->hw_enabled = true;
>  	atomic_inc(&drvdata->config.enable_req_count);
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
>
Mathieu Poirier Aug. 27, 2020, 7:17 p.m. UTC | #3
On Tue, Aug 18, 2020 at 07:10:57PM +0800, Tingwei Zhang wrote:
> Deadlock as below is triggered by one CPU holds drvdata->spinlock
> and calls cti_enable_hw(). Smp_call_function_single() is called
> in cti_enable_hw() and tries to let another CPU write CTI registers.
> That CPU is trying to get drvdata->spinlock in cti_cpu_pm_notify()
> and doesn't response to IPI from smp_call_function_single().
> 
> [  988.335937] CPU: 6 PID: 10258 Comm: sh Tainted: G        W    L
> 5.8.0-rc6-mainline-16783-gc38daa79b26b-dirty #1
> [  988.346364] Hardware name: Thundercomm Dragonboard 845c (DT)
> [  988.352073] pstate: 20400005 (nzCv daif +PAN -UAO BTYPE=--)
> [  988.357689] pc : smp_call_function_single+0x158/0x1b8
> [  988.362782] lr : smp_call_function_single+0x124/0x1b8
> ...
> [  988.451638] Call trace:
> [  988.454119]  smp_call_function_single+0x158/0x1b8
> [  988.458866]  cti_enable+0xb4/0xf8 [coresight_cti]
> [  988.463618]  coresight_control_assoc_ectdev+0x6c/0x128 [coresight]
> [  988.469855]  coresight_enable+0x1f0/0x364 [coresight]
> [  988.474957]  enable_source_store+0x5c/0x9c [coresight]
> [  988.480140]  dev_attr_store+0x14/0x28
> [  988.483839]  sysfs_kf_write+0x38/0x4c
> [  988.487532]  kernfs_fop_write+0x1c0/0x2b0
> [  988.491585]  vfs_write+0xfc/0x300
> [  988.494931]  ksys_write+0x78/0xe0
> [  988.498283]  __arm64_sys_write+0x18/0x20
> [  988.502240]  el0_svc_common+0x98/0x160
> [  988.506024]  do_el0_svc+0x78/0x80
> [  988.509377]  el0_sync_handler+0xd4/0x270
> [  988.513337]  el0_sync+0x164/0x180
> 
> This change write CTI registers directly in cti_enable_hw().
> Config->hw_powered has been checked to be true with spinlock holded.
> CTI is powered and can be programmed until spinlock is released.
> 
> Fixes: 6a0953ce7de9 ("coresight: cti: Add CPU idle pm notifer to CTI devices")
> Signed-off-by: Tingwei Zhang <tingwei@codeaurora.org>
> ---
>  drivers/hwtracing/coresight/coresight-cti.c | 17 +----------------
>  1 file changed, 1 insertion(+), 16 deletions(-)
> 
> diff --git a/drivers/hwtracing/coresight/coresight-cti.c b/drivers/hwtracing/coresight/coresight-cti.c
> index 3ccc703dc940..869569eb8c7f 100644
> --- a/drivers/hwtracing/coresight/coresight-cti.c
> +++ b/drivers/hwtracing/coresight/coresight-cti.c
> @@ -86,13 +86,6 @@ void cti_write_all_hw_regs(struct cti_drvdata *drvdata)
>  	CS_LOCK(drvdata->base);
>  }
>  
> -static void cti_enable_hw_smp_call(void *info)
> -{
> -	struct cti_drvdata *drvdata = info;
> -
> -	cti_write_all_hw_regs(drvdata);
> -}
> -
>  /* write regs to hardware and enable */
>  static int cti_enable_hw(struct cti_drvdata *drvdata)
>  {
> @@ -112,15 +105,7 @@ static int cti_enable_hw(struct cti_drvdata *drvdata)
>  	if (rc)
>  		goto cti_err_not_enabled;
>  
> -	if (drvdata->ctidev.cpu >= 0) {
> -		rc = smp_call_function_single(drvdata->ctidev.cpu,
> -					      cti_enable_hw_smp_call,
> -					      drvdata, 1);
> -		if (rc)
> -			goto cti_err_not_enabled;
> -	} else {
> -		cti_write_all_hw_regs(drvdata);
> -	}
> +	cti_write_all_hw_regs(drvdata);

I forgot...  Looking at the code in cti_enable_hw() the claim tags are not
released in the event that smp_call_function_single() fails.  Please address
that as part of refactoring the function. 

Thanks,
Mathieu

>  
>  	config->hw_enabled = true;
>  	atomic_inc(&drvdata->config.enable_req_count);
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
>
Tingwei Zhang Aug. 28, 2020, 3:18 a.m. UTC | #4
On Fri, Aug 28, 2020 at 02:12:53AM +0800, Mathieu Poirier wrote:
> Hi Tingwei,
> 
> On Tue, Aug 18, 2020 at 07:10:57PM +0800, Tingwei Zhang wrote:
> > Deadlock as below is triggered by one CPU holds drvdata->spinlock
> > and calls cti_enable_hw(). Smp_call_function_single() is called
> > in cti_enable_hw() and tries to let another CPU write CTI registers.
> > That CPU is trying to get drvdata->spinlock in cti_cpu_pm_notify()
> > and doesn't response to IPI from smp_call_function_single().
> > 
> > [  988.335937] CPU: 6 PID: 10258 Comm: sh Tainted: G        W    L
> > 5.8.0-rc6-mainline-16783-gc38daa79b26b-dirty #1
> > [  988.346364] Hardware name: Thundercomm Dragonboard 845c (DT)
> > [  988.352073] pstate: 20400005 (nzCv daif +PAN -UAO BTYPE=--)
> > [  988.357689] pc : smp_call_function_single+0x158/0x1b8
> > [  988.362782] lr : smp_call_function_single+0x124/0x1b8
> > ...
> > [  988.451638] Call trace:
> > [  988.454119]  smp_call_function_single+0x158/0x1b8
> > [  988.458866]  cti_enable+0xb4/0xf8 [coresight_cti]
> > [  988.463618]  coresight_control_assoc_ectdev+0x6c/0x128 [coresight]
> > [  988.469855]  coresight_enable+0x1f0/0x364 [coresight]
> > [  988.474957]  enable_source_store+0x5c/0x9c [coresight]
> > [  988.480140]  dev_attr_store+0x14/0x28
> > [  988.483839]  sysfs_kf_write+0x38/0x4c
> > [  988.487532]  kernfs_fop_write+0x1c0/0x2b0
> > [  988.491585]  vfs_write+0xfc/0x300
> > [  988.494931]  ksys_write+0x78/0xe0
> > [  988.498283]  __arm64_sys_write+0x18/0x20
> > [  988.502240]  el0_svc_common+0x98/0x160
> > [  988.506024]  do_el0_svc+0x78/0x80
> > [  988.509377]  el0_sync_handler+0xd4/0x270
> > [  988.513337]  el0_sync+0x164/0x180
> > 
> 
> Was this the full log or you did cut some of it?
> 

I cut some CPU registers' value since it's too long and not relevant.
The Call trace is full.

> > This change write CTI registers directly in cti_enable_hw().
> > Config->hw_powered has been checked to be true with spinlock holded.
> > CTI is powered and can be programmed until spinlock is released.
> > 
> 
> From your explanation above it seems that cti_enable_hw() was called from,
> say
> CPUy, to enable the CTI associated to CPUx.  CTIx's drvdata->spinlock was
> taken
> and smp_call_function_single() called right after.  That woke up CPUx and
> cti_cpu_pm_notify() was executed on CPUx in interrupt context, trying to
> take
> CTIx's drvdata->spinlock.  That hung CPUx and the kernel got angry.  Is my
> assessment correct?
> 

Most of them is correct. The only difference is CPUx is power on when
cti_enable_hw() is called.  Otherwise it will goto cti_state_unchanged:
and won't call cti_enable_hw_smp_call(). cti_cpu_pm_notify() is called
when CPUx tries to suspend instead of resume.

> If so I don't think the fix suggested in this patch will work.  The same
> condition will happen whenever cti_enable_hw() is called on a CPU to
> enable a
> CTI that belongs to another CPU and that cti_cpu_pm_notify() is called on
> latter
> CPU at the same time.
> 

I'm not sure I understand this correctly.  Let me clarify it a little bit.
It's a deadlock since cti_enable_hw() holds the spinlock and calls
cti_enable_hw_smp_call() from CPUx to enable CTI associated to CPUy. It
waits for cti_enable_hw_smp_call() to return. IPI is sent to CPUy while
CPUy is in cti_cpu_pm_notify() and waits for spinlock. In this patch,
I remove cti_enable_hw_smp_call() and write CTI CPU directly on CPUx.
It won't wait for CPUy and release spinlock after program registers of
CTI. After cti_enable_hw() releases spinlock, cti_cpu_pm_notify() will
continue to run. Since spinlock is held and config->hw_powered is true,
we don't need to worry about CPUy power down when we program CTI on CPUx.

> I think a better solution is to grab the lock in cti_enable_hw() and check
> the
> value of ->ctidev.cpu.  If not a global CPU, i.e >= 0, then release the
> lock and
> call smp_call_function_single().  In cti_enable_hw_smp_call() take the
> lock
> again and move forward from there. 
> 

After cti_enable_hw() releases the lock, it's possible that CPU is offline
by user, cti_enable_hw_smp_call() will fail in this case.



> I have applied the other two patches in this set so no need to send them
> again.
>
Thanks,
Tingwei 
> Thanks,
> Mathieu
> 
> > Fixes: 6a0953ce7de9 ("coresight: cti: Add CPU idle pm notifer to CTI
> devices")
> > Signed-off-by: Tingwei Zhang <tingwei@codeaurora.org>
> > ---
> >  drivers/hwtracing/coresight/coresight-cti.c | 17 +----------------
> >  1 file changed, 1 insertion(+), 16 deletions(-)
> > 
> > diff --git a/drivers/hwtracing/coresight/coresight-cti.c
> b/drivers/hwtracing/coresight/coresight-cti.c
> > index 3ccc703dc940..869569eb8c7f 100644
> > --- a/drivers/hwtracing/coresight/coresight-cti.c
> > +++ b/drivers/hwtracing/coresight/coresight-cti.c
> > @@ -86,13 +86,6 @@ void cti_write_all_hw_regs(struct cti_drvdata
> *drvdata)
> >  	CS_LOCK(drvdata->base);
> >  }
> >  
> > -static void cti_enable_hw_smp_call(void *info)
> > -{
> > -	struct cti_drvdata *drvdata = info;
> > -
> > -	cti_write_all_hw_regs(drvdata);
> > -}
> > -
> >  /* write regs to hardware and enable */
> >  static int cti_enable_hw(struct cti_drvdata *drvdata)
> >  {
> > @@ -112,15 +105,7 @@ static int cti_enable_hw(struct cti_drvdata
> *drvdata)
> >  	if (rc)
> >  		goto cti_err_not_enabled;
> >  
> > -	if (drvdata->ctidev.cpu >= 0) {
> > -		rc = smp_call_function_single(drvdata->ctidev.cpu,
> > -					      cti_enable_hw_smp_call,
> > -					      drvdata, 1);
> > -		if (rc)
> > -			goto cti_err_not_enabled;
> > -	} else {
> > -		cti_write_all_hw_regs(drvdata);
> > -	}
> > +	cti_write_all_hw_regs(drvdata);
> >  
> >  	config->hw_enabled = true;
> >  	atomic_inc(&drvdata->config.enable_req_count);
> > -- 
> > The Qualcomm Innovation Center, Inc. is a member of the Code Aurora
> Forum,
> > a Linux Foundation Collaborative Project
> > 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Mathieu Poirier Aug. 31, 2020, 8:28 p.m. UTC | #5
On Fri, Aug 28, 2020 at 11:18:18AM +0800, Tingwei Zhang wrote:
> On Fri, Aug 28, 2020 at 02:12:53AM +0800, Mathieu Poirier wrote:
> > Hi Tingwei,
> > 
> > On Tue, Aug 18, 2020 at 07:10:57PM +0800, Tingwei Zhang wrote:
> > > Deadlock as below is triggered by one CPU holds drvdata->spinlock
> > > and calls cti_enable_hw(). Smp_call_function_single() is called
> > > in cti_enable_hw() and tries to let another CPU write CTI registers.
> > > That CPU is trying to get drvdata->spinlock in cti_cpu_pm_notify()
> > > and doesn't response to IPI from smp_call_function_single().
> > > 
> > > [  988.335937] CPU: 6 PID: 10258 Comm: sh Tainted: G        W    L
> > > 5.8.0-rc6-mainline-16783-gc38daa79b26b-dirty #1
> > > [  988.346364] Hardware name: Thundercomm Dragonboard 845c (DT)
> > > [  988.352073] pstate: 20400005 (nzCv daif +PAN -UAO BTYPE=--)
> > > [  988.357689] pc : smp_call_function_single+0x158/0x1b8
> > > [  988.362782] lr : smp_call_function_single+0x124/0x1b8
> > > ...
> > > [  988.451638] Call trace:
> > > [  988.454119]  smp_call_function_single+0x158/0x1b8
> > > [  988.458866]  cti_enable+0xb4/0xf8 [coresight_cti]
> > > [  988.463618]  coresight_control_assoc_ectdev+0x6c/0x128 [coresight]
> > > [  988.469855]  coresight_enable+0x1f0/0x364 [coresight]
> > > [  988.474957]  enable_source_store+0x5c/0x9c [coresight]
> > > [  988.480140]  dev_attr_store+0x14/0x28
> > > [  988.483839]  sysfs_kf_write+0x38/0x4c
> > > [  988.487532]  kernfs_fop_write+0x1c0/0x2b0
> > > [  988.491585]  vfs_write+0xfc/0x300
> > > [  988.494931]  ksys_write+0x78/0xe0
> > > [  988.498283]  __arm64_sys_write+0x18/0x20
> > > [  988.502240]  el0_svc_common+0x98/0x160
> > > [  988.506024]  do_el0_svc+0x78/0x80
> > > [  988.509377]  el0_sync_handler+0xd4/0x270
> > > [  988.513337]  el0_sync+0x164/0x180
> > > 
> > 
> > Was this the full log or you did cut some of it?
> > 
> 
> I cut some CPU registers' value since it's too long and not relevant.
> The Call trace is full.
> 
> > > This change write CTI registers directly in cti_enable_hw().
> > > Config->hw_powered has been checked to be true with spinlock holded.
> > > CTI is powered and can be programmed until spinlock is released.
> > > 
> > 
> > From your explanation above it seems that cti_enable_hw() was called from,
> > say
> > CPUy, to enable the CTI associated to CPUx.  CTIx's drvdata->spinlock was
> > taken
> > and smp_call_function_single() called right after.  That woke up CPUx and
> > cti_cpu_pm_notify() was executed on CPUx in interrupt context, trying to
> > take
> > CTIx's drvdata->spinlock.  That hung CPUx and the kernel got angry.  Is my
> > assessment correct?
> > 
> 
> Most of them is correct. The only difference is CPUx is power on when
> cti_enable_hw() is called.  Otherwise it will goto cti_state_unchanged:
> and won't call cti_enable_hw_smp_call(). cti_cpu_pm_notify() is called
> when CPUx tries to suspend instead of resume.
> 
> > If so I don't think the fix suggested in this patch will work.  The same
> > condition will happen whenever cti_enable_hw() is called on a CPU to
> > enable a
> > CTI that belongs to another CPU and that cti_cpu_pm_notify() is called on
> > latter
> > CPU at the same time.
> > 
> 
> I'm not sure I understand this correctly.  Let me clarify it a little bit.
> It's a deadlock since cti_enable_hw() holds the spinlock and calls
> cti_enable_hw_smp_call() from CPUx to enable CTI associated to CPUy. It
> waits for cti_enable_hw_smp_call() to return. IPI is sent to CPUy while
> CPUy is in cti_cpu_pm_notify() and waits for spinlock. In this patch,
> I remove cti_enable_hw_smp_call() and write CTI CPU directly on CPUx.
> It won't wait for CPUy and release spinlock after program registers of
> CTI. After cti_enable_hw() releases spinlock, cti_cpu_pm_notify() will
> continue to run. Since spinlock is held and config->hw_powered is true,
> we don't need to worry about CPUy power down when we program CTI on CPUx.
> 
> > I think a better solution is to grab the lock in cti_enable_hw() and check
> > the
> > value of ->ctidev.cpu.  If not a global CPU, i.e >= 0, then release the
> > lock and
> > call smp_call_function_single().  In cti_enable_hw_smp_call() take the
> > lock
> > again and move forward from there. 
> > 
> 
> After cti_enable_hw() releases the lock, it's possible that CPU is offline
> by user, cti_enable_hw_smp_call() will fail in this case.
> 
>

I took another look at your solution and you are correct, CPUy won't lock but
rather spin waiting for cti_enable_hw() to finish on CPUx.  To make sure CPUx is
not interrupted while executing cti_enable_hw() I suggest to use
spin_lock_irq().

Thanks,
Mathieu
 
> 
> > I have applied the other two patches in this set so no need to send them
> > again.
> >
> Thanks,
> Tingwei 
> > Thanks,
> > Mathieu
> > 
> > > Fixes: 6a0953ce7de9 ("coresight: cti: Add CPU idle pm notifer to CTI
> > devices")
> > > Signed-off-by: Tingwei Zhang <tingwei@codeaurora.org>
> > > ---
> > >  drivers/hwtracing/coresight/coresight-cti.c | 17 +----------------
> > >  1 file changed, 1 insertion(+), 16 deletions(-)
> > > 
> > > diff --git a/drivers/hwtracing/coresight/coresight-cti.c
> > b/drivers/hwtracing/coresight/coresight-cti.c
> > > index 3ccc703dc940..869569eb8c7f 100644
> > > --- a/drivers/hwtracing/coresight/coresight-cti.c
> > > +++ b/drivers/hwtracing/coresight/coresight-cti.c
> > > @@ -86,13 +86,6 @@ void cti_write_all_hw_regs(struct cti_drvdata
> > *drvdata)
> > >  	CS_LOCK(drvdata->base);
> > >  }
> > >  
> > > -static void cti_enable_hw_smp_call(void *info)
> > > -{
> > > -	struct cti_drvdata *drvdata = info;
> > > -
> > > -	cti_write_all_hw_regs(drvdata);
> > > -}
> > > -
> > >  /* write regs to hardware and enable */
> > >  static int cti_enable_hw(struct cti_drvdata *drvdata)
> > >  {
> > > @@ -112,15 +105,7 @@ static int cti_enable_hw(struct cti_drvdata
> > *drvdata)
> > >  	if (rc)
> > >  		goto cti_err_not_enabled;
> > >  
> > > -	if (drvdata->ctidev.cpu >= 0) {
> > > -		rc = smp_call_function_single(drvdata->ctidev.cpu,
> > > -					      cti_enable_hw_smp_call,
> > > -					      drvdata, 1);
> > > -		if (rc)
> > > -			goto cti_err_not_enabled;
> > > -	} else {
> > > -		cti_write_all_hw_regs(drvdata);
> > > -	}
> > > +	cti_write_all_hw_regs(drvdata);
> > >  
> > >  	config->hw_enabled = true;
> > >  	atomic_inc(&drvdata->config.enable_req_count);
> > > -- 
> > > The Qualcomm Innovation Center, Inc. is a member of the Code Aurora
> > Forum,
> > > a Linux Foundation Collaborative Project
> > > 
> > 
> > _______________________________________________
> > linux-arm-kernel mailing list
> > linux-arm-kernel@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Tingwei Zhang Sept. 1, 2020, 6:35 a.m. UTC | #6
On Tue, Sep 01, 2020 at 04:28:19AM +0800, Mathieu Poirier wrote:
> On Fri, Aug 28, 2020 at 11:18:18AM +0800, Tingwei Zhang wrote:
> > On Fri, Aug 28, 2020 at 02:12:53AM +0800, Mathieu Poirier wrote:
> > > Hi Tingwei,
> > > 
> > > On Tue, Aug 18, 2020 at 07:10:57PM +0800, Tingwei Zhang wrote:
> > > > Deadlock as below is triggered by one CPU holds drvdata->spinlock
> > > > and calls cti_enable_hw(). Smp_call_function_single() is called
> > > > in cti_enable_hw() and tries to let another CPU write CTI registers.
> > > > That CPU is trying to get drvdata->spinlock in cti_cpu_pm_notify()
> > > > and doesn't response to IPI from smp_call_function_single().
> > > > 
> > > > [  988.335937] CPU: 6 PID: 10258 Comm: sh Tainted: G        W    L
> > > > 5.8.0-rc6-mainline-16783-gc38daa79b26b-dirty #1
> > > > [  988.346364] Hardware name: Thundercomm Dragonboard 845c (DT)
> > > > [  988.352073] pstate: 20400005 (nzCv daif +PAN -UAO BTYPE=--)
> > > > [  988.357689] pc : smp_call_function_single+0x158/0x1b8
> > > > [  988.362782] lr : smp_call_function_single+0x124/0x1b8
> > > > ...
> > > > [  988.451638] Call trace:
> > > > [  988.454119]  smp_call_function_single+0x158/0x1b8
> > > > [  988.458866]  cti_enable+0xb4/0xf8 [coresight_cti]
> > > > [  988.463618]  coresight_control_assoc_ectdev+0x6c/0x128
> [coresight]
> > > > [  988.469855]  coresight_enable+0x1f0/0x364 [coresight]
> > > > [  988.474957]  enable_source_store+0x5c/0x9c [coresight]
> > > > [  988.480140]  dev_attr_store+0x14/0x28
> > > > [  988.483839]  sysfs_kf_write+0x38/0x4c
> > > > [  988.487532]  kernfs_fop_write+0x1c0/0x2b0
> > > > [  988.491585]  vfs_write+0xfc/0x300
> > > > [  988.494931]  ksys_write+0x78/0xe0
> > > > [  988.498283]  __arm64_sys_write+0x18/0x20
> > > > [  988.502240]  el0_svc_common+0x98/0x160
> > > > [  988.506024]  do_el0_svc+0x78/0x80
> > > > [  988.509377]  el0_sync_handler+0xd4/0x270
> > > > [  988.513337]  el0_sync+0x164/0x180
> > > > 
> > > 
> > > Was this the full log or you did cut some of it?
> > > 
> > 
> > I cut some CPU registers' value since it's too long and not relevant.
> > The Call trace is full.
> > 
> > > > This change write CTI registers directly in cti_enable_hw().
> > > > Config->hw_powered has been checked to be true with spinlock holded.
> > > > CTI is powered and can be programmed until spinlock is released.
> > > > 
> > > 
> > > From your explanation above it seems that cti_enable_hw() was called
> from,
> > > say
> > > CPUy, to enable the CTI associated to CPUx.  CTIx's drvdata->spinlock
> was
> > > taken
> > > and smp_call_function_single() called right after.  That woke up CPUx
> and
> > > cti_cpu_pm_notify() was executed on CPUx in interrupt context, trying
> to
> > > take
> > > CTIx's drvdata->spinlock.  That hung CPUx and the kernel got angry.
> Is my
> > > assessment correct?
> > > 
> > 
> > Most of them is correct. The only difference is CPUx is power on when
> > cti_enable_hw() is called.  Otherwise it will goto cti_state_unchanged:
> > and won't call cti_enable_hw_smp_call(). cti_cpu_pm_notify() is called
> > when CPUx tries to suspend instead of resume.
> > 
> > > If so I don't think the fix suggested in this patch will work.  The
> same
> > > condition will happen whenever cti_enable_hw() is called on a CPU to
> > > enable a
> > > CTI that belongs to another CPU and that cti_cpu_pm_notify() is called
> on
> > > latter
> > > CPU at the same time.
> > > 
> > 
> > I'm not sure I understand this correctly.  Let me clarify it a little
> bit.
> > It's a deadlock since cti_enable_hw() holds the spinlock and calls
> > cti_enable_hw_smp_call() from CPUx to enable CTI associated to CPUy. It
> > waits for cti_enable_hw_smp_call() to return. IPI is sent to CPUy while
> > CPUy is in cti_cpu_pm_notify() and waits for spinlock. In this patch,
> > I remove cti_enable_hw_smp_call() and write CTI CPU directly on CPUx.
> > It won't wait for CPUy and release spinlock after program registers of
> > CTI. After cti_enable_hw() releases spinlock, cti_cpu_pm_notify() will
> > continue to run. Since spinlock is held and config->hw_powered is true,
> > we don't need to worry about CPUy power down when we program CTI on
> CPUx.
> > 
> > > I think a better solution is to grab the lock in cti_enable_hw() and
> check
> > > the
> > > value of ->ctidev.cpu.  If not a global CPU, i.e >= 0, then release
> the
> > > lock and
> > > call smp_call_function_single().  In cti_enable_hw_smp_call() take the
> > > lock
> > > again and move forward from there. 
> > > 
> > 
> > After cti_enable_hw() releases the lock, it's possible that CPU is
> offline
> > by user, cti_enable_hw_smp_call() will fail in this case.
> > 
> >
> 
> I took another look at your solution and you are correct, CPUy won't lock
> but
> rather spin waiting for cti_enable_hw() to finish on CPUx.  To make sure
> CPUx is
> not interrupted while executing cti_enable_hw() I suggest to use
> spin_lock_irq().
> 
> Thanks,
> Mathieu
>  
Sure. I've sent v2 according to your comments.
> > 
> > > I have applied the other two patches in this set so no need to send
> them
> > > again.
> > >
> > Thanks,
> > Tingwei 
> > > Thanks,
> > > Mathieu
> > > 
> > > > Fixes: 6a0953ce7de9 ("coresight: cti: Add CPU idle pm notifer to CTI
> > > devices")
> > > > Signed-off-by: Tingwei Zhang <tingwei@codeaurora.org>
> > > > ---
> > > >  drivers/hwtracing/coresight/coresight-cti.c | 17 +----------------
> > > >  1 file changed, 1 insertion(+), 16 deletions(-)
> > > > 
> > > > diff --git a/drivers/hwtracing/coresight/coresight-cti.c
> > > b/drivers/hwtracing/coresight/coresight-cti.c
> > > > index 3ccc703dc940..869569eb8c7f 100644
> > > > --- a/drivers/hwtracing/coresight/coresight-cti.c
> > > > +++ b/drivers/hwtracing/coresight/coresight-cti.c
> > > > @@ -86,13 +86,6 @@ void cti_write_all_hw_regs(struct cti_drvdata
> > > *drvdata)
> > > >  	CS_LOCK(drvdata->base);
> > > >  }
> > > >  
> > > > -static void cti_enable_hw_smp_call(void *info)
> > > > -{
> > > > -	struct cti_drvdata *drvdata = info;
> > > > -
> > > > -	cti_write_all_hw_regs(drvdata);
> > > > -}
> > > > -
> > > >  /* write regs to hardware and enable */
> > > >  static int cti_enable_hw(struct cti_drvdata *drvdata)
> > > >  {
> > > > @@ -112,15 +105,7 @@ static int cti_enable_hw(struct cti_drvdata
> > > *drvdata)
> > > >  	if (rc)
> > > >  		goto cti_err_not_enabled;
> > > >  
> > > > -	if (drvdata->ctidev.cpu >= 0) {
> > > > -		rc = smp_call_function_single(drvdata->ctidev.cpu,
> > > > -					      cti_enable_hw_smp_call,
> > > > -					      drvdata, 1);
> > > > -		if (rc)
> > > > -			goto cti_err_not_enabled;
> > > > -	} else {
> > > > -		cti_write_all_hw_regs(drvdata);
> > > > -	}
> > > > +	cti_write_all_hw_regs(drvdata);
> > > >  
> > > >  	config->hw_enabled = true;
> > > >  	atomic_inc(&drvdata->config.enable_req_count);
> > > > -- 
> > > > The Qualcomm Innovation Center, Inc. is a member of the Code Aurora
> > > Forum,
> > > > a Linux Foundation Collaborative Project
> > > > 
> > > 
> > > _______________________________________________
> > > linux-arm-kernel mailing list
> > > linux-arm-kernel@lists.infradead.org
> > > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
diff mbox series

Patch

diff --git a/drivers/hwtracing/coresight/coresight-cti.c b/drivers/hwtracing/coresight/coresight-cti.c
index 3ccc703dc940..869569eb8c7f 100644
--- a/drivers/hwtracing/coresight/coresight-cti.c
+++ b/drivers/hwtracing/coresight/coresight-cti.c
@@ -86,13 +86,6 @@  void cti_write_all_hw_regs(struct cti_drvdata *drvdata)
 	CS_LOCK(drvdata->base);
 }
 
-static void cti_enable_hw_smp_call(void *info)
-{
-	struct cti_drvdata *drvdata = info;
-
-	cti_write_all_hw_regs(drvdata);
-}
-
 /* write regs to hardware and enable */
 static int cti_enable_hw(struct cti_drvdata *drvdata)
 {
@@ -112,15 +105,7 @@  static int cti_enable_hw(struct cti_drvdata *drvdata)
 	if (rc)
 		goto cti_err_not_enabled;
 
-	if (drvdata->ctidev.cpu >= 0) {
-		rc = smp_call_function_single(drvdata->ctidev.cpu,
-					      cti_enable_hw_smp_call,
-					      drvdata, 1);
-		if (rc)
-			goto cti_err_not_enabled;
-	} else {
-		cti_write_all_hw_regs(drvdata);
-	}
+	cti_write_all_hw_regs(drvdata);
 
 	config->hw_enabled = true;
 	atomic_inc(&drvdata->config.enable_req_count);