bus: mhi: host: Add MHI_PM_SYS_ERR_FAIL state

Message ID 20231208221353.1560177-1-quic_jhugo@quicinc.com
State Superseded
Series bus: mhi: host: Add MHI_PM_SYS_ERR_FAIL state

Commit Message

Jeffrey Hugo Dec. 8, 2023, 10:13 p.m. UTC
When processing a SYSERR, if the device does not respond to the MHI_RESET
from the host, the host will be stuck in a difficult to recover state.
The host will remain in MHI_PM_SYS_ERR_PROCESS and not clean up the host
channels.  Clients will not be notified of the SYSERR via the destruction
of their channel devices, which means clients may think that the device is
still up.  Subsequent SYSERR events such as a device fatal error will not
be processed as the state machine cannot transition from PROCESS back to
DETECT.  The only way to recover from this is to unload the mhi module
(wipe the state machine state) or for the mhi controller to initiate
SHUTDOWN.

In this failure case, to recover the device, we need a state similar to
PROCESS, but one that can transition to DETECT.  There is no viable existing
state to use.  POR has the needed transitions, but assumes the device is
in a good state and could allow the host to attempt to use the device.
Allowing PROCESS to transition to DETECT invites the possibility of
parallel SYSERR processing which could get the host and device out of
sync.

Thus, invent a new state - MHI_PM_SYS_ERR_FAIL.

This is essentially a holding state.  It allows us to clean up the host
elements that are based on the old state of the device (channels), but
does not allow us to directly advance back to an operational state.  It
does allow the detection and processing of another SYSERR, which may
recover the device, and it allows the controller to do a clean shutdown.

Signed-off-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Reviewed-by: Carl Vanderlip <quic_carlv@quicinc.com>
---
 drivers/bus/mhi/host/init.c     |  1 +
 drivers/bus/mhi/host/internal.h |  9 ++++++---
 drivers/bus/mhi/host/pm.c       | 20 +++++++++++++++++---
 3 files changed, 24 insertions(+), 6 deletions(-)
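
For readers following the state machine change, the sketch below models how the
host core's transition table gates pm_state moves and why SYS_ERR_FAIL acts as a
holding state. This is an illustration only, not the in-tree code: the helper
name pm_transition_allowed() and the standalone defines are invented here, while
the bit assignments and the two L1 table rows mirror the hunks further down (the
MHI_PM_POR bit value is assumed, as it is not shown in this patch).

/* Illustrative model only -- not drivers/bus/mhi/host code */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define BIT(n)				(1U << (n))
#define MHI_PM_POR			BIT(1)	/* bit value assumed */
#define MHI_PM_SYS_ERR_DETECT		BIT(8)
#define MHI_PM_SYS_ERR_PROCESS		BIT(9)
#define MHI_PM_SYS_ERR_FAIL		BIT(10)	/* new in this patch */
#define MHI_PM_SHUTDOWN_PROCESS		BIT(11)
#define MHI_PM_LD_ERR_FATAL_DETECT	BIT(12)

struct pm_transition {
	uint32_t from_state;
	uint32_t to_states;	/* bitmask of states reachable from from_state */
};

/* Excerpt of the L1 rows as they look after this patch */
static const struct pm_transition transitions[] = {
	{ MHI_PM_SYS_ERR_PROCESS,
	  MHI_PM_POR | MHI_PM_SYS_ERR_FAIL | MHI_PM_SHUTDOWN_PROCESS |
	  MHI_PM_LD_ERR_FATAL_DETECT },
	{ MHI_PM_SYS_ERR_FAIL,
	  MHI_PM_SYS_ERR_DETECT | MHI_PM_SHUTDOWN_PROCESS |
	  MHI_PM_LD_ERR_FATAL_DETECT },
};

/* Hypothetical helper: is cur -> next a listed edge? */
static bool pm_transition_allowed(uint32_t cur, uint32_t next)
{
	size_t i;

	for (i = 0; i < sizeof(transitions) / sizeof(transitions[0]); i++)
		if (transitions[i].from_state == cur)
			return (transitions[i].to_states & next) != 0;

	return false;
}

With this table, pm_transition_allowed(MHI_PM_SYS_ERR_FAIL, MHI_PM_SYS_ERR_DETECT)
returns true, while any jump straight back to an operational M-state is rejected,
which is exactly the holding-state behaviour the commit message describes.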

Comments

Manivannan Sadhasivam Jan. 4, 2024, 11:26 a.m. UTC | #1
On Fri, Dec 08, 2023 at 03:13:53PM -0700, Jeffrey Hugo wrote:
> When processing a SYSERR, if the device does not respond to the MHI_RESET
> from the host, the host will be stuck in a difficult to recover state.

Under what scenario can this happen? Is the device not honoring the MHI_RESET
state, or has it crashed completely?

- Mani

> [...]
Jeffrey Hugo Jan. 4, 2024, 3:53 p.m. UTC | #2
On 1/4/2024 4:26 AM, Manivannan Sadhasivam wrote:
> On Fri, Dec 08, 2023 at 03:13:53PM -0700, Jeffrey Hugo wrote:
>> When processing a SYSERR, if the device does not respond to the MHI_RESET
>> from the host, the host will be stuck in a difficult to recover state.
> 
> Under what scenario can this happen? Is the device not honoring the MHI_RESET
> state, or has it crashed completely?

Digging up my notes from this patch, it was originally discovered via 
soc_reset stress testing.

On AIC100 (and I assume other MHI devices because the hardware 
implementation is common), soc_reset is processed entirely in hardware. 
  When the register write hits the endpoint, it causes the soc to reset 
without firmware involvement.

If you stress test soc_reset and hit the timing just right, you can have 
PBL signal syserr (fatal error) for soc_reset N, and then before PBL can 
process the MHI_RESET request from the host, soc_reset N+1 hits the 
endpoint causing the soc to reset, and re-run PBL from the beginning 
which causes PBL to lose all state.  This is how we discovered this 
issue, although the reproduction rate was rather low.

I was able to hack the AMSS EE firmware (QSM) to synthetically reproduce 
the issue as well.  Send a trigger to QSM via an unused MHI register to 
invoke syserr (non-fatal error), and then have QSM ignore the MHI_RESET 
request which would simulate some kind of FW hang.  soc_reset would not 
recover the device.
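
For reference, the host-side trigger that this stress test hammers is tiny. A
rough sketch follows, assuming the soc_reset sysfs store does nothing more than
call the existing mhi_soc_reset() helper; the attribute plumbing shown here is
simplified for illustration rather than copied from the tree.

/* Simplified sketch of the sysfs trigger path (not verbatim kernel code) */
#include <linux/device.h>
#include <linux/mhi.h>

static ssize_t soc_reset_store(struct device *dev,
			       struct device_attribute *attr,
			       const char *buf, size_t count)
{
	struct mhi_device *mhi_dev = to_mhi_device(dev);

	/*
	 * Fire-and-forget: the register write done by mhi_soc_reset() resets
	 * the SoC in hardware with no firmware handshake, which is why
	 * back-to-back writes can race PBL's SYSERR/MHI_RESET handling.
	 */
	mhi_soc_reset(mhi_dev->mhi_cntrl);

	return count;
}
static DEVICE_ATTR_WO(soc_reset);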

> [...]
Manivannan Sadhasivam Jan. 6, 2024, 2:11 p.m. UTC | #3
On Thu, Jan 04, 2024 at 08:53:47AM -0700, Jeffrey Hugo wrote:
> On 1/4/2024 4:26 AM, Manivannan Sadhasivam wrote:
> > On Fri, Dec 08, 2023 at 03:13:53PM -0700, Jeffrey Hugo wrote:
> > > When processing a SYSERR, if the device does not respond to the MHI_RESET
> > > from the host, the host will be stuck in a difficult to recover state.
> > 
> > Under what scenario can this happen? Is the device not honoring the MHI_RESET
> > state, or has it crashed completely?
> 
> Digging up my notes from this patch, it was originally discovered via
> soc_reset stress testing.
> 

Through sysfs I believe...

> On AIC100 (and I assume other MHI devices because the hardware
> implementation is common), soc_reset is processed entirely in hardware.
> When the register write hits the endpoint, it causes the soc to reset
> without firmware involvement.
> 
> If you stress test soc_reset and hit the timing just right, you can have PBL
> signal syserr (fatal error) for soc_reset N, and then before PBL can process
> the MHI_RESET request from the host, soc_reset N+1 hits the endpoint causing
> the soc to reset, and re-run PBL from the beginning which causes PBL to lose
> all state.  This is how we discovered this issue, although the reproduction
> rate was rather low.
> 

Thanks for the info. Could you please add the same to the commit message and send
the next version?

Otherwise, the patch looks good to me.

> I was able to hack the AMSS EE firmware (QSM) to synthetically reproduce the
> issue as well.  Send a trigger to QSM via an unused MHI register to invoke
> syserr (non-fatal error), and then have QSM ignore the MHI_RESET request
> which would simulate some kind of FW hang.  soc_reset would not recover the
> device.
> 

Ok. Even though this is not an issue that would commonly impact users, I'm
inclined to have this change, just in case...

- Mani

> [...]
Jeffrey Hugo Jan. 8, 2024, 3:22 p.m. UTC | #4
On 1/6/2024 7:11 AM, Manivannan Sadhasivam wrote:
> On Thu, Jan 04, 2024 at 08:53:47AM -0700, Jeffrey Hugo wrote:
>> On 1/4/2024 4:26 AM, Manivannan Sadhasivam wrote:
>>> On Fri, Dec 08, 2023 at 03:13:53PM -0700, Jeffrey Hugo wrote:
>>>> When processing a SYSERR, if the device does not respond to the MHI_RESET
>>>> from the host, the host will be stuck in a difficult to recover state.
>>>
>>> Under what scenario can this happen? Is the device not honoring the MHI_RESET
>>> state, or has it crashed completely?
>>
>> Digging up my notes from this patch, it was originally discovered via
>> soc_reset stress testing.
>>
> 
> Through sysfs I believe...

Yes, through the sysfs node.

> 
>> On AIC100 (and I assume other MHI devices because the hardware
>> implementation is common), soc_reset is processed entirely in hardware.
>> When the register write hits the endpoint, it causes the soc to reset
>> without firmware involvement.
>>
>> If you stress test soc_reset and hit the timing just right, you can have PBL
>> signal syserr (fatal error) for soc_reset N, and then before PBL can process
>> the MHI_RESET request from the host, soc_reset N+1 hits the endpoint causing
>> the soc to reset, and re-run PBL from the beginning which causes PBL to lose
>> all state.  This is how we discovered this issue, although the reproduction
>> rate was rather low.
>>
> 
> Thanks for the info. Could you please add the same to the commit message and send
> the next version?

Will do.

> [...]

Patch

diff --git a/drivers/bus/mhi/host/init.c b/drivers/bus/mhi/host/init.c
index e2c2f510b04f..d3f919277cf9 100644
--- a/drivers/bus/mhi/host/init.c
+++ b/drivers/bus/mhi/host/init.c
@@ -62,6 +62,7 @@  static const char * const mhi_pm_state_str[] = {
 	[MHI_PM_STATE_FW_DL_ERR] = "Firmware Download Error",
 	[MHI_PM_STATE_SYS_ERR_DETECT] = "SYS ERROR Detect",
 	[MHI_PM_STATE_SYS_ERR_PROCESS] = "SYS ERROR Process",
+	[MHI_PM_STATE_SYS_ERR_FAIL] = "SYS ERROR Failure",
 	[MHI_PM_STATE_SHUTDOWN_PROCESS] = "SHUTDOWN Process",
 	[MHI_PM_STATE_LD_ERR_FATAL_DETECT] = "Linkdown or Error Fatal Detect",
 };
diff --git a/drivers/bus/mhi/host/internal.h b/drivers/bus/mhi/host/internal.h
index 30ac415a3000..4b6deea17bcd 100644
--- a/drivers/bus/mhi/host/internal.h
+++ b/drivers/bus/mhi/host/internal.h
@@ -88,6 +88,7 @@  enum mhi_pm_state {
 	MHI_PM_STATE_FW_DL_ERR,
 	MHI_PM_STATE_SYS_ERR_DETECT,
 	MHI_PM_STATE_SYS_ERR_PROCESS,
+	MHI_PM_STATE_SYS_ERR_FAIL,
 	MHI_PM_STATE_SHUTDOWN_PROCESS,
 	MHI_PM_STATE_LD_ERR_FATAL_DETECT,
 	MHI_PM_STATE_MAX
@@ -104,14 +105,16 @@  enum mhi_pm_state {
 #define MHI_PM_FW_DL_ERR				BIT(7)
 #define MHI_PM_SYS_ERR_DETECT				BIT(8)
 #define MHI_PM_SYS_ERR_PROCESS				BIT(9)
-#define MHI_PM_SHUTDOWN_PROCESS				BIT(10)
+#define MHI_PM_SYS_ERR_FAIL				BIT(10)
+#define MHI_PM_SHUTDOWN_PROCESS				BIT(11)
 /* link not accessible */
-#define MHI_PM_LD_ERR_FATAL_DETECT			BIT(11)
+#define MHI_PM_LD_ERR_FATAL_DETECT			BIT(12)
 
 #define MHI_REG_ACCESS_VALID(pm_state)			((pm_state & (MHI_PM_POR | MHI_PM_M0 | \
 						MHI_PM_M2 | MHI_PM_M3_ENTER | MHI_PM_M3_EXIT | \
 						MHI_PM_SYS_ERR_DETECT | MHI_PM_SYS_ERR_PROCESS | \
-						MHI_PM_SHUTDOWN_PROCESS | MHI_PM_FW_DL_ERR)))
+						MHI_PM_SYS_ERR_FAIL | MHI_PM_SHUTDOWN_PROCESS |  \
+						MHI_PM_FW_DL_ERR)))
 #define MHI_PM_IN_ERROR_STATE(pm_state)			(pm_state >= MHI_PM_FW_DL_ERR)
 #define MHI_PM_IN_FATAL_STATE(pm_state)			(pm_state == MHI_PM_LD_ERR_FATAL_DETECT)
 #define MHI_DB_ACCESS_VALID(mhi_cntrl)			(mhi_cntrl->pm_state & mhi_cntrl->db_access)
diff --git a/drivers/bus/mhi/host/pm.c b/drivers/bus/mhi/host/pm.c
index a2f2feef1476..d0d033ce9984 100644
--- a/drivers/bus/mhi/host/pm.c
+++ b/drivers/bus/mhi/host/pm.c
@@ -36,7 +36,10 @@ 
  *     M0 <--> M0
  *     M0 -> FW_DL_ERR
  *     M0 -> M3_ENTER -> M3 -> M3_EXIT --> M0
- * L1: SYS_ERR_DETECT -> SYS_ERR_PROCESS --> POR
+ * L1: SYS_ERR_DETECT -> SYS_ERR_PROCESS
+ *     SYS_ERR_PROCESS -> SYS_ERR_FAIL
+ *     SYS_ERR_FAIL -> SYS_ERR_DETECT
+ *     SYS_ERR_PROCESS --> POR
  * L2: SHUTDOWN_PROCESS -> LD_ERR_FATAL_DETECT
  *     SHUTDOWN_PROCESS -> DISABLE
  * L3: LD_ERR_FATAL_DETECT <--> LD_ERR_FATAL_DETECT
@@ -93,7 +96,12 @@  static const struct mhi_pm_transitions dev_state_transitions[] = {
 	},
 	{
 		MHI_PM_SYS_ERR_PROCESS,
-		MHI_PM_POR | MHI_PM_SHUTDOWN_PROCESS |
+		MHI_PM_POR | MHI_PM_SYS_ERR_FAIL | MHI_PM_SHUTDOWN_PROCESS |
+		MHI_PM_LD_ERR_FATAL_DETECT
+	},
+	{
+		MHI_PM_SYS_ERR_FAIL,
+		MHI_PM_SYS_ERR_DETECT | MHI_PM_SHUTDOWN_PROCESS |
 		MHI_PM_LD_ERR_FATAL_DETECT
 	},
 	/* L2 States */
@@ -629,7 +637,13 @@  static void mhi_pm_sys_error_transition(struct mhi_controller *mhi_cntrl)
 					!in_reset, timeout);
 		if (!ret || in_reset) {
 			dev_err(dev, "Device failed to exit MHI Reset state\n");
-			goto exit_sys_error_transition;
+			write_lock_irq(&mhi_cntrl->pm_lock);
+			cur_state = mhi_tryset_pm_state(mhi_cntrl,
+							MHI_PM_SYS_ERR_FAIL);
+			write_unlock_irq(&mhi_cntrl->pm_lock);
+			/* Shutdown may have occurred, otherwise cleanup now */
+			if (cur_state != MHI_PM_SYS_ERR_FAIL)
+				goto exit_sys_error_transition;
 		}
 
 		/*
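
As a closing usage note (not part of the patch): because SYS_ERR_FAIL still
permits SHUTDOWN_PROCESS, a controller driver that finds the device parked in
the failure state can recover it with a full power cycle. Below is a minimal
sketch assuming the standard mhi_power_down()/mhi_sync_power_up() controller
API; the my_dev structure and the recovery worker are hypothetical driver names,
and any device-specific link reset is left as a placeholder.

/* Hypothetical controller-side recovery path (illustration only) */
#include <linux/device.h>
#include <linux/mhi.h>
#include <linux/workqueue.h>

struct my_dev {
	struct mhi_controller mhi_cntrl;
	struct work_struct recover_work;
};

static void my_dev_recover_work(struct work_struct *work)
{
	struct my_dev *mdev = container_of(work, struct my_dev, recover_work);
	struct mhi_controller *mhi_cntrl = &mdev->mhi_cntrl;
	int ret;

	/* SYS_ERR_FAIL -> SHUTDOWN_PROCESS is a permitted transition */
	mhi_power_down(mhi_cntrl, false);

	/* ... device-specific link reset / reinit would go here ... */

	ret = mhi_sync_power_up(mhi_cntrl);
	if (ret)
		dev_err(mhi_cntrl->cntrl_dev,
			"failed to bring the device back up: %d\n", ret);
}

The other way out, as the commit message notes, is for the device itself to
raise a fresh SYSERR, which the new SYS_ERR_FAIL -> SYS_ERR_DETECT edge lets the
host detect and process normally.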