diff mbox series

[v1] ufs: core: wlun resume SSU(Active) fail recovery

Message ID 20221221123537.30148-1-peter.wang@mediatek.com (mailing list archive)
State Superseded
Headers show
Series [v1] ufs: core: wlun resume SSU(Active) fail recovery | expand

Commit Message

Peter Wang (王信友) Dec. 21, 2022, 12:35 p.m. UTC
From: Peter Wang <peter.wang@mediatek.com>

When the SSU(Active) command sent during wlun resume times out, the SCSI
layer tries eh_host_reset_handler. However, ufshcd_eh_host_reset_handler
hangs waiting in flush_work(&hba->eh_work), and ufshcd_err_handler in turn
hangs waiting for the runtime PM resume to finish.
Do link recovery only in this case. Below are the stack dumps of the hung tasks.

<ffffffdd78e02b34> schedule+0x110/0x204
<ffffffdd78e0be60> schedule_timeout+0x98/0x138
<ffffffdd78e040e8> wait_for_common_io+0x130/0x2d0
<ffffffdd77d6a000> blk_execute_rq+0x10c/0x16c
<ffffffdd78126d90> __scsi_execute+0xfc/0x278
<ffffffdd7813891c> ufshcd_set_dev_pwr_mode+0x1c8/0x40c
<ffffffdd78137d1c> __ufshcd_wl_resume+0xf0/0x5cc
<ffffffdd78137ae0> ufshcd_wl_runtime_resume+0x40/0x18c
<ffffffdd78136108> scsi_runtime_resume+0x88/0x104
<ffffffdd7809a4f8> __rpm_callback+0x1a0/0xaec
<ffffffdd7809b624> rpm_resume+0x7e0/0xcd0
<ffffffdd7809a788> __rpm_callback+0x430/0xaec
<ffffffdd7809b644> rpm_resume+0x800/0xcd0
<ffffffdd780a0778> pm_runtime_work+0x148/0x198

<ffffffdd78e02b34> schedule+0x110/0x204
<ffffffdd78e0be10> schedule_timeout+0x48/0x138
<ffffffdd78e03d9c> wait_for_common+0x144/0x2dc
<ffffffdd7758bba4> __flush_work+0x3d0/0x508
<ffffffdd7815572c> ufshcd_eh_host_reset_handler+0x134/0x3a8
<ffffffdd781216f4> scsi_try_host_reset+0x54/0x204
<ffffffdd78120594> scsi_eh_ready_devs+0xb30/0xd48
<ffffffdd7812373c> scsi_error_handler+0x260/0x874

<ffffffdd78e02b34> schedule+0x110/0x204
<ffffffdd7809af64> rpm_resume+0x120/0xcd0
<ffffffdd7809fde8> __pm_runtime_resume+0xa0/0x17c
<ffffffdd7815193c> ufshcd_err_handling_prepare+0x40/0x430
<ffffffdd7814cce8> ufshcd_err_handler+0x1c4/0xd4c

Signed-off-by: Peter Wang <peter.wang@mediatek.com>
---
 drivers/ufs/core/ufshcd.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

Comments

kernel test robot Dec. 21, 2022, 4:40 p.m. UTC | #1
Hi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on jejb-scsi/for-next]
[also build test ERROR on mkp-scsi/for-next linus/master v6.1 next-20221220]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/peter-wang-mediatek-com/ufs-core-wlun-resume-SSU-Acitve-fail-recovery/20221221-203634
base:   https://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git for-next
patch link:    https://lore.kernel.org/r/20221221123537.30148-1-peter.wang%40mediatek.com
patch subject: [PATCH v1] ufs: core: wlun resume SSU(Acitve) fail recovery
config: m68k-allmodconfig
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/97f75d9637531953204ec88d6550a6b2f04f2a89
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review peter-wang-mediatek-com/ufs-core-wlun-resume-SSU-Acitve-fail-recovery/20221221-203634
        git checkout 97f75d9637531953204ec88d6550a6b2f04f2a89
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash drivers/ufs/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   drivers/ufs/core/ufshcd.c: In function 'ufshcd_eh_host_reset_handler':
>> drivers/ufs/core/ufshcd.c:7376:19: error: 'struct ufs_hba' has no member named 'sdev_ufs_device'
    7376 |         dev = &hba->sdev_ufs_device->sdev_gendev;
         |                   ^~
>> drivers/ufs/core/ufshcd.c:7377:23: error: 'struct dev_pm_info' has no member named 'runtime_status'
    7377 |         if (dev->power.runtime_status == RPM_RESUMING) {
         |                       ^
   In file included from include/linux/device.h:15,
                    from include/linux/async.h:14,
                    from drivers/ufs/core/ufshcd.c:12:
   drivers/ufs/core/ufshcd.c:7381:43: error: 'struct dev_pm_info' has no member named 'runtime_status'
    7381 |                                 dev->power.runtime_status,
         |                                           ^
   include/linux/dev_printk.h:110:37: note: in definition of macro 'dev_printk_index_wrap'
     110 |                 _p_func(dev, fmt, ##__VA_ARGS__);                       \
         |                                     ^~~~~~~~~~~
   drivers/ufs/core/ufshcd.c:7380:25: note: in expansion of macro 'dev_err'
    7380 |                         dev_err(hba->dev, "WL Device PM: status:%d, err:%d\n",
         |                         ^~~~~~~
>> drivers/ufs/core/ufshcd.c:7382:43: error: 'struct dev_pm_info' has no member named 'runtime_error'
    7382 |                                 dev->power.runtime_error);
         |                                           ^
   include/linux/dev_printk.h:110:37: note: in definition of macro 'dev_printk_index_wrap'
     110 |                 _p_func(dev, fmt, ##__VA_ARGS__);                       \
         |                                     ^~~~~~~~~~~
   drivers/ufs/core/ufshcd.c:7380:25: note: in expansion of macro 'dev_err'
    7380 |                         dev_err(hba->dev, "WL Device PM: status:%d, err:%d\n",
         |                         ^~~~~~~


vim +7376 drivers/ufs/core/ufshcd.c

  7354	
  7355	/**
  7356	 * ufshcd_eh_host_reset_handler - host reset handler registered to scsi layer
  7357	 * @cmd: SCSI command pointer
  7358	 *
  7359	 * Returns SUCCESS/FAILED
  7360	 */
  7361	static int ufshcd_eh_host_reset_handler(struct scsi_cmnd *cmd)
  7362	{
  7363		int err = SUCCESS;
  7364		unsigned long flags;
  7365		struct ufs_hba *hba;
  7366		struct device *dev;
  7367	
  7368		hba = shost_priv(cmd->device->host);
  7369	
  7370		/*
  7371		 * If __ufshcd_wl_suspend get fail and runtime_status = RPM_RESUMING,
  7372		 * do link recovery only. Because schedule eh work will get dead lock
  7373		 * in ufshcd_rpm_get_sync to wait wlun resume, but wlun resume get
  7374		 * error and wait eh work finish.
  7375		 */
> 7376		dev = &hba->sdev_ufs_device->sdev_gendev;
> 7377		if (dev->power.runtime_status == RPM_RESUMING) {
  7378			err = ufshcd_link_recovery(hba);
  7379			if (err) {
  7380				dev_err(hba->dev, "WL Device PM: status:%d, err:%d\n",
  7381					dev->power.runtime_status,
> 7382					dev->power.runtime_error);
  7383			}
  7384			return err;
  7385		}
  7386	
  7387		spin_lock_irqsave(hba->host->host_lock, flags);
  7388		hba->force_reset = true;
  7389		ufshcd_schedule_eh_work(hba);
  7390		dev_err(hba->dev, "%s: reset in progress - 1\n", __func__);
  7391		spin_unlock_irqrestore(hba->host->host_lock, flags);
  7392	
  7393		flush_work(&hba->eh_work);
  7394	
  7395		spin_lock_irqsave(hba->host->host_lock, flags);
  7396		if (hba->ufshcd_state == UFSHCD_STATE_ERROR)
  7397			err = FAILED;
  7398		spin_unlock_irqrestore(hba->host->host_lock, flags);
  7399	
  7400		return err;
  7401	}
  7402
kernel test robot Dec. 21, 2022, 7:01 p.m. UTC | #2
Hi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on jejb-scsi/for-next]
[also build test ERROR on mkp-scsi/for-next linus/master v6.1 next-20221220]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/peter-wang-mediatek-com/ufs-core-wlun-resume-SSU-Acitve-fail-recovery/20221221-203634
base:   https://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git for-next
patch link:    https://lore.kernel.org/r/20221221123537.30148-1-peter.wang%40mediatek.com
patch subject: [PATCH v1] ufs: core: wlun resume SSU(Acitve) fail recovery
config: i386-randconfig-a014-20221219
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/97f75d9637531953204ec88d6550a6b2f04f2a89
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review peter-wang-mediatek-com/ufs-core-wlun-resume-SSU-Acitve-fail-recovery/20221221-203634
        git checkout 97f75d9637531953204ec88d6550a6b2f04f2a89
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/ufs/core/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> drivers/ufs/core/ufshcd.c:7376:14: error: no member named 'sdev_ufs_device' in 'struct ufs_hba'
           dev = &hba->sdev_ufs_device->sdev_gendev;
                  ~~~  ^
   drivers/ufs/core/ufshcd.c:9630:44: warning: shift count >= width of type [-Wshift-count-overflow]
                   if (!dma_set_mask_and_coherent(hba->dev, DMA_BIT_MASK(64)))
                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~
   include/linux/dma-mapping.h:76:54: note: expanded from macro 'DMA_BIT_MASK'
   #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
                                                        ^
   include/linux/compiler.h:56:47: note: expanded from macro 'if'
   #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                              ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/compiler.h:58:52: note: expanded from macro '__trace_if_var'
   #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                      ^~~~
   drivers/ufs/core/ufshcd.c:9630:44: warning: shift count >= width of type [-Wshift-count-overflow]
                   if (!dma_set_mask_and_coherent(hba->dev, DMA_BIT_MASK(64)))
                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~
   include/linux/dma-mapping.h:76:54: note: expanded from macro 'DMA_BIT_MASK'
   #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
                                                        ^
   include/linux/compiler.h:56:47: note: expanded from macro 'if'
   #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                              ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/compiler.h:58:61: note: expanded from macro '__trace_if_var'
   #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                               ^~~~
   drivers/ufs/core/ufshcd.c:9630:44: warning: shift count >= width of type [-Wshift-count-overflow]
                   if (!dma_set_mask_and_coherent(hba->dev, DMA_BIT_MASK(64)))
                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~
   include/linux/dma-mapping.h:76:54: note: expanded from macro 'DMA_BIT_MASK'
   #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
                                                        ^
   include/linux/compiler.h:56:47: note: expanded from macro 'if'
   #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                              ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/compiler.h:58:86: note: expanded from macro '__trace_if_var'
   #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                                       ~~~~~~~~~~~~~~~~~^~~~~
   include/linux/compiler.h:69:3: note: expanded from macro '__trace_if_value'
           (cond) ?                                        \
            ^~~~
   3 warnings and 1 error generated.


vim +7376 drivers/ufs/core/ufshcd.c

  7354	
  7355	/**
  7356	 * ufshcd_eh_host_reset_handler - host reset handler registered to scsi layer
  7357	 * @cmd: SCSI command pointer
  7358	 *
  7359	 * Returns SUCCESS/FAILED
  7360	 */
  7361	static int ufshcd_eh_host_reset_handler(struct scsi_cmnd *cmd)
  7362	{
  7363		int err = SUCCESS;
  7364		unsigned long flags;
  7365		struct ufs_hba *hba;
  7366		struct device *dev;
  7367	
  7368		hba = shost_priv(cmd->device->host);
  7369	
  7370		/*
  7371		 * If __ufshcd_wl_suspend get fail and runtime_status = RPM_RESUMING,
  7372		 * do link recovery only. Because schedule eh work will get dead lock
  7373		 * in ufshcd_rpm_get_sync to wait wlun resume, but wlun resume get
  7374		 * error and wait eh work finish.
  7375		 */
> 7376		dev = &hba->sdev_ufs_device->sdev_gendev;
  7377		if (dev->power.runtime_status == RPM_RESUMING) {
  7378			err = ufshcd_link_recovery(hba);
  7379			if (err) {
  7380				dev_err(hba->dev, "WL Device PM: status:%d, err:%d\n",
  7381					dev->power.runtime_status,
  7382					dev->power.runtime_error);
  7383			}
  7384			return err;
  7385		}
  7386	
  7387		spin_lock_irqsave(hba->host->host_lock, flags);
  7388		hba->force_reset = true;
  7389		ufshcd_schedule_eh_work(hba);
  7390		dev_err(hba->dev, "%s: reset in progress - 1\n", __func__);
  7391		spin_unlock_irqrestore(hba->host->host_lock, flags);
  7392	
  7393		flush_work(&hba->eh_work);
  7394	
  7395		spin_lock_irqsave(hba->host->host_lock, flags);
  7396		if (hba->ufshcd_state == UFSHCD_STATE_ERROR)
  7397			err = FAILED;
  7398		spin_unlock_irqrestore(hba->host->host_lock, flags);
  7399	
  7400		return err;
  7401	}
  7402
diff mbox series

Patch

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index e18c9f4463ec..5aaffd13e132 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -7363,9 +7363,27 @@  static int ufshcd_eh_host_reset_handler(struct scsi_cmnd *cmd)
 	int err = SUCCESS;
 	unsigned long flags;
 	struct ufs_hba *hba;
+	struct device *dev;
 
 	hba = shost_priv(cmd->device->host);
 
+	/*
+	 * If __ufshcd_wl_suspend get fail and runtime_status = RPM_RESUMING,
+	 * do link recovery only. Because schedule eh work will get dead lock
+	 * in ufshcd_rpm_get_sync to wait wlun resume, but wlun resume get
+	 * error and wait eh work finish.
+	 */
+	dev = &hba->sdev_ufs_device->sdev_gendev;
+	if (dev->power.runtime_status == RPM_RESUMING) {
+		err = ufshcd_link_recovery(hba);
+		if (err) {
+			dev_err(hba->dev, "WL Device PM: status:%d, err:%d\n",
+				dev->power.runtime_status,
+				dev->power.runtime_error);
+		}
+		return err;
+	}
+
 	spin_lock_irqsave(hba->host->host_lock, flags);
 	hba->force_reset = true;
 	ufshcd_schedule_eh_work(hba);