
PCI: vmd: Use raw spinlock for cfg_lock

Message ID SEZPR01MB45274AF863D3BD2F028141D9A8CF2@SEZPR01MB4527.apcprd01.prod.exchangelabs.com (mailing list archive)
State Superseded
Delegated to: Bjorn Helgaas
Series PCI: vmd: Use raw spinlock for cfg_lock

Commit Message

Jiwei Sun June 19, 2024, 11:27 a.m. UTC
From: Jiwei Sun <sunjw10@lenovo.com>

If the kernel is built with the following configurations and booted,
  CONFIG_VMD=y
  CONFIG_DEBUG_LOCKDEP=y
  CONFIG_DEBUG_SPINLOCK=y
  CONFIG_PROVE_LOCKING=y
  CONFIG_PROVE_RAW_LOCK_NESTING=y

the following log appears:

=============================
[ BUG: Invalid wait context ]
6.10.0-rc4 #80 Not tainted
-----------------------------
kworker/18:2/633 is trying to lock:
ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0
other info that might help us debug this:
context-{5:5}
4 locks held by kworker/18:2/633:
 #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920
 #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920
 #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800
 #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170
stack backtrace:
CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75
Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020
Workqueue: events work_for_cpu_fn
Call Trace:
 <TASK>
 dump_stack_lvl+0x7c/0xc0
 __lock_acquire+0x9e5/0x1ed0
 lock_acquire+0x194/0x490
 _raw_spin_lock_irqsave+0x42/0x90
 vmd_pci_write+0x185/0x2a0
 pci_msi_update_mask+0x10c/0x170
 __pci_enable_msi_range+0x291/0x800
 pci_alloc_irq_vectors_affinity+0x13e/0x1d0
 pcie_portdrv_probe+0x570/0xe60
 local_pci_probe+0xdc/0x190
 work_for_cpu_fn+0x4e/0xa0
 process_one_work+0x86d/0x1920
 process_scheduled_works+0xd7/0x140
 worker_thread+0x3e9/0xb90
 kthread+0x2e9/0x3d0
 ret_from_fork+0x2d/0x60
 ret_from_fork_asm+0x1a/0x30
 </TASK>

The root cause is that the dev->msi_lock is a raw spinlock, but
vmd->cfg_lock is a spinlock.

Signed-off-by: Jiwei Sun <sunjw10@lenovo.com>
Suggested-by: Adrian Huang <ahuang12@lenovo.com>
---
 drivers/pci/controller/vmd.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

Comments

Bjorn Helgaas June 19, 2024, 8 p.m. UTC | #1
[+cc Thomas in case he has msi_lock comment, Keith in case he has
cfg_lock comment]

On Wed, Jun 19, 2024 at 07:27:59PM +0800, Jiwei Sun wrote:
> From: Jiwei Sun <sunjw10@lenovo.com>
> 
> If the kernel is built with the following configurations and booting
>   CONFIG_VMD=y
>   CONFIG_DEBUG_LOCKDEP=y
>   CONFIG_DEBUG_SPINLOCK=y
>   CONFIG_PROVE_LOCKING=y
>   CONFIG_PROVE_RAW_LOCK_NESTING=y
> 
> The following log appears,
> 
> =============================
> [ BUG: Invalid wait context ]
> 6.10.0-rc4 #80 Not tainted
> -----------------------------
> kworker/18:2/633 is trying to lock:
> ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0
> other info that might help us debug this:
> context-{5:5}
> 4 locks held by kworker/18:2/633:
>  #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920
>  #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920
>  #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800
>  #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170
> stack backtrace:
> CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75
> Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020
> Workqueue: events work_for_cpu_fn
> Call Trace:
>  <TASK>
>  dump_stack_lvl+0x7c/0xc0
>  __lock_acquire+0x9e5/0x1ed0
>  lock_acquire+0x194/0x490
>  _raw_spin_lock_irqsave+0x42/0x90
>  vmd_pci_write+0x185/0x2a0
>  pci_msi_update_mask+0x10c/0x170
>  __pci_enable_msi_range+0x291/0x800
>  pci_alloc_irq_vectors_affinity+0x13e/0x1d0
>  pcie_portdrv_probe+0x570/0xe60
>  local_pci_probe+0xdc/0x190
>  work_for_cpu_fn+0x4e/0xa0
>  process_one_work+0x86d/0x1920
>  process_scheduled_works+0xd7/0x140
>  worker_thread+0x3e9/0xb90
>  kthread+0x2e9/0x3d0
>  ret_from_fork+0x2d/0x60
>  ret_from_fork_asm+0x1a/0x30
>  </TASK>
> 
> The root cause is that the dev->msi_lock is a raw spinlock, but
> vmd->cfg_lock is a spinlock.

Can you expand this a little bit?  This isn't enough unless one
already knows the difference between raw_spinlock_t and spinlock_t,
which I didn't.

Documentation/locking/locktypes.rst says they are the same except when
CONFIG_PREEMPT_RT is set (might be worth mentioning with the config
list above?), but that with CONFIG_PREEMPT_RT, spinlock_t is based on
rt_mutex.

And I guess there's a rule that you can't acquire rt_mutex while
holding a raw_spinlock.
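
If I've got that right, the pattern being flagged boils down to
something like this (a made-up sketch just to illustrate the rule; the
lock names are invented and this is not code from the tree):

  static DEFINE_RAW_SPINLOCK(outer_lock);
  static DEFINE_SPINLOCK(inner_lock);	/* rt_mutex-based when PREEMPT_RT=y */

  static void invalid_nesting(void)
  {
	unsigned long flags;

	raw_spin_lock_irqsave(&outer_lock, flags);
	/*
	 * With CONFIG_PREEMPT_RT, spin_lock() below may sleep, which is
	 * not allowed while a raw spinlock is held, so
	 * CONFIG_PROVE_RAW_LOCK_NESTING flags this nesting as
	 * "BUG: Invalid wait context".
	 */
	spin_lock(&inner_lock);
	spin_unlock(&inner_lock);
	raw_spin_unlock_irqrestore(&outer_lock, flags);
  }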

The dev->msi_lock was added by 77e89afc25f3 ("PCI/MSI: Protect
msi_desc::masked for multi-MSI") and only used in
pci_msi_update_mask():

  raw_spin_lock_irqsave(lock, flags);
  desc->pci.msi_mask &= ~clear;
  desc->pci.msi_mask |= set;
  pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
			 desc->pci.msi_mask);
  raw_spin_unlock_irqrestore(lock, flags);

The vmd->cfg_lock was added by 185a383ada2e ("x86/PCI: Add driver for
Intel Volume Management Device (VMD)") and is only used around VMD
config accesses, e.g.,

  * CPU may deadlock if config space is not serialized on some versions of this
  * hardware, so all config space access is done under a spinlock.

  static int vmd_pci_read(...)
  {
    spin_lock_irqsave(&vmd->cfg_lock, flags);
    switch (len) {
    case 1:
	    *value = readb(addr);
	    break;
    case 2:
	    *value = readw(addr);
	    break;
    case 4:
	    *value = readl(addr);
	    break;
    default:
	    ret = -EINVAL;
	    break;
    }
    spin_unlock_irqrestore(&vmd->cfg_lock, flags);
  }

IIUC those reads turn into single PCIe MMIO reads, so I wouldn't
expect any concurrency issues there that need locking.

But apparently there's something weird that can deadlock the CPU.

> Signed-off-by: Jiwei Sun<sunjw10@lenovo.com>
> Suggested-by: Adrian Huang <ahuang12@lenovo.com>
> ---
>  drivers/pci/controller/vmd.c | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
> index 87b7856f375a..45d0ebf96adc 100644
> --- a/drivers/pci/controller/vmd.c
> +++ b/drivers/pci/controller/vmd.c
> @@ -125,7 +125,7 @@ struct vmd_irq_list {
>  struct vmd_dev {
>  	struct pci_dev		*dev;
>  
> -	spinlock_t		cfg_lock;
> +	raw_spinlock_t		cfg_lock;
>  	void __iomem		*cfgbar;
>  
>  	int msix_count;
> @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>  	if (!addr)
>  		return -EFAULT;
>  
> -	spin_lock_irqsave(&vmd->cfg_lock, flags);
> +	raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>  	switch (len) {
>  	case 1:
>  		*value = readb(addr);
> @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>  		ret = -EINVAL;
>  		break;
>  	}
> -	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
> +	raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>  	return ret;
>  }
>  
> @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>  	if (!addr)
>  		return -EFAULT;
>  
> -	spin_lock_irqsave(&vmd->cfg_lock, flags);
> +	raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>  	switch (len) {
>  	case 1:
>  		writeb(value, addr);
> @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>  		ret = -EINVAL;
>  		break;
>  	}
> -	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
> +	raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>  	return ret;
>  }
>  
> @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
>  	if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
>  		vmd->first_vec = 1;
>  
> -	spin_lock_init(&vmd->cfg_lock);
> +	raw_spin_lock_init(&vmd->cfg_lock);
>  	pci_set_drvdata(dev, vmd);
>  	err = vmd_enable_domain(vmd, features);
>  	if (err)
> -- 
> 2.27.0
>
Jiwei Sun June 20, 2024, 8:57 a.m. UTC | #2
On 6/20/24 04:00, Bjorn Helgaas wrote:
> [+cc Thomas in case he has msi_lock comment, Keith in case he has
> cfg_lock comment]
> 
> On Wed, Jun 19, 2024 at 07:27:59PM +0800, Jiwei Sun wrote:
>> From: Jiwei Sun <sunjw10@lenovo.com>
>>
>> If the kernel is built with the following configurations and booting
>>   CONFIG_VMD=y
>>   CONFIG_DEBUG_LOCKDEP=y
>>   CONFIG_DEBUG_SPINLOCK=y
>>   CONFIG_PROVE_LOCKING=y
>>   CONFIG_PROVE_RAW_LOCK_NESTING=y
>>
>> The following log appears,
>>
>> =============================
>> [ BUG: Invalid wait context ]
>> 6.10.0-rc4 #80 Not tainted
>> -----------------------------
>> kworker/18:2/633 is trying to lock:
>> ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0
>> other info that might help us debug this:
>> context-{5:5}
>> 4 locks held by kworker/18:2/633:
>>  #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920
>>  #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920
>>  #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800
>>  #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170
>> stack backtrace:
>> CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75
>> Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020
>> Workqueue: events work_for_cpu_fn
>> Call Trace:
>>  <TASK>
>>  dump_stack_lvl+0x7c/0xc0
>>  __lock_acquire+0x9e5/0x1ed0
>>  lock_acquire+0x194/0x490
>>  _raw_spin_lock_irqsave+0x42/0x90
>>  vmd_pci_write+0x185/0x2a0
>>  pci_msi_update_mask+0x10c/0x170
>>  __pci_enable_msi_range+0x291/0x800
>>  pci_alloc_irq_vectors_affinity+0x13e/0x1d0
>>  pcie_portdrv_probe+0x570/0xe60
>>  local_pci_probe+0xdc/0x190
>>  work_for_cpu_fn+0x4e/0xa0
>>  process_one_work+0x86d/0x1920
>>  process_scheduled_works+0xd7/0x140
>>  worker_thread+0x3e9/0xb90
>>  kthread+0x2e9/0x3d0
>>  ret_from_fork+0x2d/0x60
>>  ret_from_fork_asm+0x1a/0x30
>>  </TASK>
>>
>> The root cause is that the dev->msi_lock is a raw spinlock, but
>> vmd->cfg_lock is a spinlock.
> 
> Can you expand this a little bit?  This isn't enough unless one
> already knows the difference between raw_spinlock_t and spinlock_t,
> which I didn't.
> 
> Documentation/locking/locktypes.rst says they are the same except when
> CONFIG_PREEMPT_RT is set (might be worth mentioning with the config
> list above?), but that with CONFIG_PREEMPT_RT, spinlock_t is based on
> rt_mutex.
> 
> And I guess there's a rule that you can't acquire rt_mutex while
> holding a raw_spinlock.

Thanks for your review and comments. Sorry for not explaining this clearly.
Yes, you are right: if CONFIG_PREEMPT_RT is not set, spinlock_t is based
on raw_spinlock, and there is no problem with the above call trace.

But as you mentioned, if CONFIG_PREEMPT_RT is set, spinlock_t is based on
rt_mutex, and a task waiting for an rt_mutex may be scheduled out. For
example, if two threads try to take the same rt_mutex-based lock and A
takes it first, B will be scheduled out in rtlock_slowlock_locked() while
waiting for A to release the lock. A raw_spinlock, by contrast, is a true
spinning lock, and the task that owns it must not be scheduled out inside
its critical section. In other words, we must not try to acquire an
rt_mutex-based lock within a raw_spinlock critical section when
CONFIG_PREEMPT_RT is set.
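
As a rough illustration of that difference (a made-up sketch, not code
from the kernel; the lock name is invented):

  static DEFINE_SPINLOCK(example_lock);	/* rt_mutex-based when PREEMPT_RT=y */

  static void thread_a(void)
  {
	spin_lock(&example_lock);
	/* ... A does its work under the lock ... */
	spin_unlock(&example_lock);
  }

  static void thread_b(void)
  {
	/*
	 * With CONFIG_PREEMPT_RT, if A already holds example_lock, B
	 * blocks here in rtlock_slowlock_locked() and another task may
	 * be scheduled.  If example_lock were a raw_spinlock_t, B would
	 * instead busy-wait with preemption disabled.
	 */
	spin_lock(&example_lock);
	/* ... B does its work under the lock ... */
	spin_unlock(&example_lock);
  }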

The CONFIG_PROVE_LOCKING and CONFIG_PROVE_RAW_LOCK_NESTING options are
used to detect this kind of invalid lock nesting (the raw_spinlock vs.
spinlock nesting checks) [1]. Here is the call path:

  pci_msi_update_mask  ---> holds raw_spinlock dev->msi_lock
    pci_write_config_dword
     pci_bus_write_config_dword
       vmd_pci_write   ---> takes spinlock_t vmd->cfg_lock

This call path is an invalid lock nesting because the vmd driver tries
to acquire the vmd->cfg_lock spinlock within the raw_spinlock critical
section (dev->msi_lock). That's why the "BUG: Invalid wait context"
message is shown.

[1] https://lore.kernel.org/lkml/YBBA81osV7cHN2fb@hirez.programming.kicks-ass.net/

Thanks,
Regards,
Jiwei

> 
> The dev->msi_lock was added by 77e89afc25f3 ("PCI/MSI: Protect
> msi_desc::masked for multi-MSI") and only used in
> pci_msi_update_mask():
> 
>   raw_spin_lock_irqsave(lock, flags);
>   desc->pci.msi_mask &= ~clear;
>   desc->pci.msi_mask |= set;
>   pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
> 			 desc->pci.msi_mask);
>   raw_spin_unlock_irqrestore(lock, flags);
> 
> The vmd->cfg_lock was added by 185a383ada2e ("x86/PCI: Add driver for
> Intel Volume Management Device (VMD)") and is only used around VMD
> config accesses, e.g.,
> 
>   * CPU may deadlock if config space is not serialized on some versions of this
>   * hardware, so all config space access is done under a spinlock.
> 
>   static int vmd_pci_read(...)
>   {
>     spin_lock_irqsave(&vmd->cfg_lock, flags);
>     switch (len) {
>     case 1:
> 	    *value = readb(addr);
> 	    break;
>     case 2:
> 	    *value = readw(addr);
> 	    break;
>     case 4:
> 	    *value = readl(addr);
> 	    break;
>     default:
> 	    ret = -EINVAL;
> 	    break;
>     }
>     spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>   }
> 
> IIUC those reads turn into single PCIe MMIO reads, so I wouldn't
> expect any concurrency issues there that need locking.
> 
> But apparently there's something weird that can deadlock the CPU.
> 
>> Signed-off-by: Jiwei Sun<sunjw10@lenovo.com>
>> Suggested-by: Adrian Huang <ahuang12@lenovo.com>
>> ---
>>  drivers/pci/controller/vmd.c | 12 ++++++------
>>  1 file changed, 6 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
>> index 87b7856f375a..45d0ebf96adc 100644
>> --- a/drivers/pci/controller/vmd.c
>> +++ b/drivers/pci/controller/vmd.c
>> @@ -125,7 +125,7 @@ struct vmd_irq_list {
>>  struct vmd_dev {
>>  	struct pci_dev		*dev;
>>  
>> -	spinlock_t		cfg_lock;
>> +	raw_spinlock_t		cfg_lock;
>>  	void __iomem		*cfgbar;
>>  
>>  	int msix_count;
>> @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>>  	if (!addr)
>>  		return -EFAULT;
>>  
>> -	spin_lock_irqsave(&vmd->cfg_lock, flags);
>> +	raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>>  	switch (len) {
>>  	case 1:
>>  		*value = readb(addr);
>> @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>>  		ret = -EINVAL;
>>  		break;
>>  	}
>> -	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>> +	raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>  	return ret;
>>  }
>>  
>> @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>>  	if (!addr)
>>  		return -EFAULT;
>>  
>> -	spin_lock_irqsave(&vmd->cfg_lock, flags);
>> +	raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>>  	switch (len) {
>>  	case 1:
>>  		writeb(value, addr);
>> @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>>  		ret = -EINVAL;
>>  		break;
>>  	}
>> -	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>> +	raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>  	return ret;
>>  }
>>  
>> @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
>>  	if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
>>  		vmd->first_vec = 1;
>>  
>> -	spin_lock_init(&vmd->cfg_lock);
>> +	raw_spin_lock_init(&vmd->cfg_lock);
>>  	pci_set_drvdata(dev, vmd);
>>  	err = vmd_enable_domain(vmd, features);
>>  	if (err)
>> -- 
>> 2.27.0
>>
Paul M Stillwell Jr June 20, 2024, 3:10 p.m. UTC | #3
On 6/20/2024 1:57 AM, Jiwei Sun wrote:
> 
> 
> On 6/20/24 04:00, Bjorn Helgaas wrote:
>> [+cc Thomas in case he has msi_lock comment, Keith in case he has
>> cfg_lock comment]
>>
>> On Wed, Jun 19, 2024 at 07:27:59PM +0800, Jiwei Sun wrote:
>>> From: Jiwei Sun <sunjw10@lenovo.com>
>>>
>>> If the kernel is built with the following configurations and booting
>>>    CONFIG_VMD=y
>>>    CONFIG_DEBUG_LOCKDEP=y
>>>    CONFIG_DEBUG_SPINLOCK=y
>>>    CONFIG_PROVE_LOCKING=y
>>>    CONFIG_PROVE_RAW_LOCK_NESTING=y
>>>
>>> The following log appears,
>>>
>>> =============================
>>> [ BUG: Invalid wait context ]
>>> 6.10.0-rc4 #80 Not tainted
>>> -----------------------------
>>> kworker/18:2/633 is trying to lock:
>>> ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0
>>> other info that might help us debug this:
>>> context-{5:5}
>>> 4 locks held by kworker/18:2/633:
>>>   #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920
>>>   #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920
>>>   #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800
>>>   #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170
>>> stack backtrace:
>>> CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75
>>> Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020
>>> Workqueue: events work_for_cpu_fn
>>> Call Trace:
>>>   <TASK>
>>>   dump_stack_lvl+0x7c/0xc0
>>>   __lock_acquire+0x9e5/0x1ed0
>>>   lock_acquire+0x194/0x490
>>>   _raw_spin_lock_irqsave+0x42/0x90
>>>   vmd_pci_write+0x185/0x2a0
>>>   pci_msi_update_mask+0x10c/0x170
>>>   __pci_enable_msi_range+0x291/0x800
>>>   pci_alloc_irq_vectors_affinity+0x13e/0x1d0
>>>   pcie_portdrv_probe+0x570/0xe60
>>>   local_pci_probe+0xdc/0x190
>>>   work_for_cpu_fn+0x4e/0xa0
>>>   process_one_work+0x86d/0x1920
>>>   process_scheduled_works+0xd7/0x140
>>>   worker_thread+0x3e9/0xb90
>>>   kthread+0x2e9/0x3d0
>>>   ret_from_fork+0x2d/0x60
>>>   ret_from_fork_asm+0x1a/0x30
>>>   </TASK>
>>>
>>> The root cause is that the dev->msi_lock is a raw spinlock, but
>>> vmd->cfg_lock is a spinlock.
>>
>> Can you expand this a little bit?  This isn't enough unless one
>> already knows the difference between raw_spinlock_t and spinlock_t,
>> which I didn't.
>>
>> Documentation/locking/locktypes.rst says they are the same except when
>> CONFIG_PREEMPT_RT is set (might be worth mentioning with the config
>> list above?), but that with CONFIG_PREEMPT_RT, spinlock_t is based on
>> rt_mutex.
>>
>> And I guess there's a rule that you can't acquire rt_mutex while
>> holding a raw_spinlock.
> 
> Thanks for your review and comments. Sorry for not explaining this clearly.
> Yes, you are right, if CONFIG_PREEMPT_RT is not set, the spinlock_t is
> based on raw_spinlock, there is no any question in the above call trace.
> 
> But as you mentioned, if CONFIG_PREEMPT_RT is set, the spinlock_t is based
> on rt_mutex, a task will be scheduled when waiting for rt_mutex. For example,
> there are two threads are trying to hold a rt_mutex lock, if A hold the
> lock firstly, and B will be scheduled in rtlock_slowlock_locked() waiting
> for A to release the lock. The raw_spinlock is a real spinning lock, which
> is not allowed the task of the raw_spinlock owner is scheduled in its
> critical region. In other words, we should not try to acquire rt_mutex lock
> in the critical region of the raw_spinlock when CONFIG_PREEMPT_RT is set.
> 
> CONFIG_PROVE_LOCKING and CONFIG_PROVE_RAW_LOCK_NESTING options are
> used to detect the invalid lock nesting (the raw_spinlock vs. spinlock
> nesting checks) [1]. Here is the call path:
> 
>    pci_msi_update_mask  ---> hold raw_spinlock dev->msi_lock
>      pci_write_config_dword
>       pci_bus_write_config_dword
>         vmd_pci_write   ---> hold spinlock_t vmd->cfg_lock
> 
> The above call path is the invalid lock nesting becuase the vmd driver
> tries to acquire the vmd->cfg_lock spinlock within the raw_spinlock
> region (dev->msi_lock). That's why the message "BUG: Invalid wait contex"
> is shown.
> 

It looks like this only happens when CONFIG_PREEMPT_RT is set, so I would 
mention that in the commit message (as Bjorn mentioned). I also think this 
level of detail is helpful and should be in the commit message as well, 
since it's not obvious to the casual observer :)

Paul

> [1] https://lore.kernel.org/lkml/YBBA81osV7cHN2fb@hirez.programming.kicks-ass.net/
> 
> Thanks,
> Regards,
> Jiwei
> 
>>
>> The dev->msi_lock was added by 77e89afc25f3 ("PCI/MSI: Protect
>> msi_desc::masked for multi-MSI") and only used in
>> pci_msi_update_mask():
>>
>>    raw_spin_lock_irqsave(lock, flags);
>>    desc->pci.msi_mask &= ~clear;
>>    desc->pci.msi_mask |= set;
>>    pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
>> 			 desc->pci.msi_mask);
>>    raw_spin_unlock_irqrestore(lock, flags);
>>
>> The vmd->cfg_lock was added by 185a383ada2e ("x86/PCI: Add driver for
>> Intel Volume Management Device (VMD)") and is only used around VMD
>> config accesses, e.g.,
>>
>>    * CPU may deadlock if config space is not serialized on some versions of this
>>    * hardware, so all config space access is done under a spinlock.
>>
>>    static int vmd_pci_read(...)
>>    {
>>      spin_lock_irqsave(&vmd->cfg_lock, flags);
>>      switch (len) {
>>      case 1:
>> 	    *value = readb(addr);
>> 	    break;
>>      case 2:
>> 	    *value = readw(addr);
>> 	    break;
>>      case 4:
>> 	    *value = readl(addr);
>> 	    break;
>>      default:
>> 	    ret = -EINVAL;
>> 	    break;
>>      }
>>      spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>    }
>>
>> IIUC those reads turn into single PCIe MMIO reads, so I wouldn't
>> expect any concurrency issues there that need locking.
>>
>> But apparently there's something weird that can deadlock the CPU.
>>
>>> Signed-off-by: Jiwei Sun<sunjw10@lenovo.com>
>>> Suggested-by: Adrian Huang <ahuang12@lenovo.com>
>>> ---
>>>   drivers/pci/controller/vmd.c | 12 ++++++------
>>>   1 file changed, 6 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
>>> index 87b7856f375a..45d0ebf96adc 100644
>>> --- a/drivers/pci/controller/vmd.c
>>> +++ b/drivers/pci/controller/vmd.c
>>> @@ -125,7 +125,7 @@ struct vmd_irq_list {
>>>   struct vmd_dev {
>>>   	struct pci_dev		*dev;
>>>   
>>> -	spinlock_t		cfg_lock;
>>> +	raw_spinlock_t		cfg_lock;
>>>   	void __iomem		*cfgbar;
>>>   
>>>   	int msix_count;
>>> @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>>>   	if (!addr)
>>>   		return -EFAULT;
>>>   
>>> -	spin_lock_irqsave(&vmd->cfg_lock, flags);
>>> +	raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>>>   	switch (len) {
>>>   	case 1:
>>>   		*value = readb(addr);
>>> @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>>>   		ret = -EINVAL;
>>>   		break;
>>>   	}
>>> -	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>> +	raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>>   	return ret;
>>>   }
>>>   
>>> @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>>>   	if (!addr)
>>>   		return -EFAULT;
>>>   
>>> -	spin_lock_irqsave(&vmd->cfg_lock, flags);
>>> +	raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>>>   	switch (len) {
>>>   	case 1:
>>>   		writeb(value, addr);
>>> @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>>>   		ret = -EINVAL;
>>>   		break;
>>>   	}
>>> -	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>> +	raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>>   	return ret;
>>>   }
>>>   
>>> @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
>>>   	if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
>>>   		vmd->first_vec = 1;
>>>   
>>> -	spin_lock_init(&vmd->cfg_lock);
>>> +	raw_spin_lock_init(&vmd->cfg_lock);
>>>   	pci_set_drvdata(dev, vmd);
>>>   	err = vmd_enable_domain(vmd, features);
>>>   	if (err)
>>> -- 
>>> 2.27.0
>>>
>
Jiwei Sun June 21, 2024, 9:12 a.m. UTC | #4
On 6/20/24 23:10, Paul M Stillwell Jr wrote:
> On 6/20/2024 1:57 AM, Jiwei Sun wrote:
>>
>>
>> On 6/20/24 04:00, Bjorn Helgaas wrote:
>>> [+cc Thomas in case he has msi_lock comment, Keith in case he has
>>> cfg_lock comment]
>>>
>>> On Wed, Jun 19, 2024 at 07:27:59PM +0800, Jiwei Sun wrote:
>>>> From: Jiwei Sun <sunjw10@lenovo.com>
>>>>
>>>> If the kernel is built with the following configurations and booting
>>>>    CONFIG_VMD=y
>>>>    CONFIG_DEBUG_LOCKDEP=y
>>>>    CONFIG_DEBUG_SPINLOCK=y
>>>>    CONFIG_PROVE_LOCKING=y
>>>>    CONFIG_PROVE_RAW_LOCK_NESTING=y
>>>>
>>>> The following log appears,
>>>>
>>>> =============================
>>>> [ BUG: Invalid wait context ]
>>>> 6.10.0-rc4 #80 Not tainted
>>>> -----------------------------
>>>> kworker/18:2/633 is trying to lock:
>>>> ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0
>>>> other info that might help us debug this:
>>>> context-{5:5}
>>>> 4 locks held by kworker/18:2/633:
>>>>   #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920
>>>>   #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920
>>>>   #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800
>>>>   #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170
>>>> stack backtrace:
>>>> CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75
>>>> Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020
>>>> Workqueue: events work_for_cpu_fn
>>>> Call Trace:
>>>>   <TASK>
>>>>   dump_stack_lvl+0x7c/0xc0
>>>>   __lock_acquire+0x9e5/0x1ed0
>>>>   lock_acquire+0x194/0x490
>>>>   _raw_spin_lock_irqsave+0x42/0x90
>>>>   vmd_pci_write+0x185/0x2a0
>>>>   pci_msi_update_mask+0x10c/0x170
>>>>   __pci_enable_msi_range+0x291/0x800
>>>>   pci_alloc_irq_vectors_affinity+0x13e/0x1d0
>>>>   pcie_portdrv_probe+0x570/0xe60
>>>>   local_pci_probe+0xdc/0x190
>>>>   work_for_cpu_fn+0x4e/0xa0
>>>>   process_one_work+0x86d/0x1920
>>>>   process_scheduled_works+0xd7/0x140
>>>>   worker_thread+0x3e9/0xb90
>>>>   kthread+0x2e9/0x3d0
>>>>   ret_from_fork+0x2d/0x60
>>>>   ret_from_fork_asm+0x1a/0x30
>>>>   </TASK>
>>>>
>>>> The root cause is that the dev->msi_lock is a raw spinlock, but
>>>> vmd->cfg_lock is a spinlock.
>>>
>>> Can you expand this a little bit?  This isn't enough unless one
>>> already knows the difference between raw_spinlock_t and spinlock_t,
>>> which I didn't.
>>>
>>> Documentation/locking/locktypes.rst says they are the same except when
>>> CONFIG_PREEMPT_RT is set (might be worth mentioning with the config
>>> list above?), but that with CONFIG_PREEMPT_RT, spinlock_t is based on
>>> rt_mutex.
>>>
>>> And I guess there's a rule that you can't acquire rt_mutex while
>>> holding a raw_spinlock.
>>
>> Thanks for your review and comments. Sorry for not explaining this clearly.
>> Yes, you are right, if CONFIG_PREEMPT_RT is not set, the spinlock_t is
>> based on raw_spinlock, there is no any question in the above call trace.
>>
>> But as you mentioned, if CONFIG_PREEMPT_RT is set, the spinlock_t is based
>> on rt_mutex, a task will be scheduled when waiting for rt_mutex. For example,
>> there are two threads are trying to hold a rt_mutex lock, if A hold the
>> lock firstly, and B will be scheduled in rtlock_slowlock_locked() waiting
>> for A to release the lock. The raw_spinlock is a real spinning lock, which
>> is not allowed the task of the raw_spinlock owner is scheduled in its
>> critical region. In other words, we should not try to acquire rt_mutex lock
>> in the critical region of the raw_spinlock when CONFIG_PREEMPT_RT is set.
>>
>> CONFIG_PROVE_LOCKING and CONFIG_PROVE_RAW_LOCK_NESTING options are
>> used to detect the invalid lock nesting (the raw_spinlock vs. spinlock
>> nesting checks) [1]. Here is the call path:
>>
>>    pci_msi_update_mask  ---> hold raw_spinlock dev->msi_lock
>>      pci_write_config_dword
>>       pci_bus_write_config_dword
>>         vmd_pci_write   ---> hold spinlock_t vmd->cfg_lock
>>
>> The above call path is the invalid lock nesting becuase the vmd driver
>> tries to acquire the vmd->cfg_lock spinlock within the raw_spinlock
>> region (dev->msi_lock). That's why the message "BUG: Invalid wait contex"
>> is shown.
>>
> 
> It looks like this only happens when CONFIG_PREEMPT_RT is set so I would mention that in the commit message (as Bjorn mentioned). I also think thsi level of detail is helpful and should be in the commit message as well since it's not obvious to the casual observer :)

Thanks for your suggestions and comments, I totally agree with you.
I will add that key information to the V2 patch commit message.

Thanks,
Regards,
Jiwei

> 
> Paul
> 
>> [1] https://lore.kernel.org/lkml/YBBA81osV7cHN2fb@hirez.programming.kicks-ass.net/
>>
>> Thanks,
>> Regards,
>> Jiwei
>>
>>>
>>> The dev->msi_lock was added by 77e89afc25f3 ("PCI/MSI: Protect
>>> msi_desc::masked for multi-MSI") and only used in
>>> pci_msi_update_mask():
>>>
>>>    raw_spin_lock_irqsave(lock, flags);
>>>    desc->pci.msi_mask &= ~clear;
>>>    desc->pci.msi_mask |= set;
>>>    pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
>>>              desc->pci.msi_mask);
>>>    raw_spin_unlock_irqrestore(lock, flags);
>>>
>>> The vmd->cfg_lock was added by 185a383ada2e ("x86/PCI: Add driver for
>>> Intel Volume Management Device (VMD)") and is only used around VMD
>>> config accesses, e.g.,
>>>
>>>    * CPU may deadlock if config space is not serialized on some versions of this
>>>    * hardware, so all config space access is done under a spinlock.
>>>
>>>    static int vmd_pci_read(...)
>>>    {
>>>      spin_lock_irqsave(&vmd->cfg_lock, flags);
>>>      switch (len) {
>>>      case 1:
>>>         *value = readb(addr);
>>>         break;
>>>      case 2:
>>>         *value = readw(addr);
>>>         break;
>>>      case 4:
>>>         *value = readl(addr);
>>>         break;
>>>      default:
>>>         ret = -EINVAL;
>>>         break;
>>>      }
>>>      spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>>    }
>>>
>>> IIUC those reads turn into single PCIe MMIO reads, so I wouldn't
>>> expect any concurrency issues there that need locking.
>>>
>>> But apparently there's something weird that can deadlock the CPU.
>>>
>>>> Signed-off-by: Jiwei Sun<sunjw10@lenovo.com>
>>>> Suggested-by: Adrian Huang <ahuang12@lenovo.com>
>>>> ---
>>>>   drivers/pci/controller/vmd.c | 12 ++++++------
>>>>   1 file changed, 6 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
>>>> index 87b7856f375a..45d0ebf96adc 100644
>>>> --- a/drivers/pci/controller/vmd.c
>>>> +++ b/drivers/pci/controller/vmd.c
>>>> @@ -125,7 +125,7 @@ struct vmd_irq_list {
>>>>   struct vmd_dev {
>>>>       struct pci_dev        *dev;
>>>>   -    spinlock_t        cfg_lock;
>>>> +    raw_spinlock_t        cfg_lock;
>>>>       void __iomem        *cfgbar;
>>>>         int msix_count;
>>>> @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>>>>       if (!addr)
>>>>           return -EFAULT;
>>>>   -    spin_lock_irqsave(&vmd->cfg_lock, flags);
>>>> +    raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>>>>       switch (len) {
>>>>       case 1:
>>>>           *value = readb(addr);
>>>> @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>>>>           ret = -EINVAL;
>>>>           break;
>>>>       }
>>>> -    spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>>> +    raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>>>       return ret;
>>>>   }
>>>>   @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>>>>       if (!addr)
>>>>           return -EFAULT;
>>>>   -    spin_lock_irqsave(&vmd->cfg_lock, flags);
>>>> +    raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>>>>       switch (len) {
>>>>       case 1:
>>>>           writeb(value, addr);
>>>> @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>>>>           ret = -EINVAL;
>>>>           break;
>>>>       }
>>>> -    spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>>> +    raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>>>       return ret;
>>>>   }
>>>>   @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
>>>>       if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
>>>>           vmd->first_vec = 1;
>>>>   -    spin_lock_init(&vmd->cfg_lock);
>>>> +    raw_spin_lock_init(&vmd->cfg_lock);
>>>>       pci_set_drvdata(dev, vmd);
>>>>       err = vmd_enable_domain(vmd, features);
>>>>       if (err)
>>>> -- 
>>>> 2.27.0
>>>>
>>
>

Patch

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 87b7856f375a..45d0ebf96adc 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -125,7 +125,7 @@  struct vmd_irq_list {
 struct vmd_dev {
 	struct pci_dev		*dev;
 
-	spinlock_t		cfg_lock;
+	raw_spinlock_t		cfg_lock;
 	void __iomem		*cfgbar;
 
 	int msix_count;
@@ -402,7 +402,7 @@  static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
 	if (!addr)
 		return -EFAULT;
 
-	spin_lock_irqsave(&vmd->cfg_lock, flags);
+	raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
 	switch (len) {
 	case 1:
 		*value = readb(addr);
@@ -417,7 +417,7 @@  static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
 		ret = -EINVAL;
 		break;
 	}
-	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
+	raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
 	return ret;
 }
 
@@ -437,7 +437,7 @@  static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
 	if (!addr)
 		return -EFAULT;
 
-	spin_lock_irqsave(&vmd->cfg_lock, flags);
+	raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
 	switch (len) {
 	case 1:
 		writeb(value, addr);
@@ -455,7 +455,7 @@  static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
 		ret = -EINVAL;
 		break;
 	}
-	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
+	raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
 	return ret;
 }
 
@@ -1015,7 +1015,7 @@  static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
 		vmd->first_vec = 1;
 
-	spin_lock_init(&vmd->cfg_lock);
+	raw_spin_lock_init(&vmd->cfg_lock);
 	pci_set_drvdata(dev, vmd);
 	err = vmd_enable_domain(vmd, features);
 	if (err)