Message ID | SEZPR01MB45274AF863D3BD2F028141D9A8CF2@SEZPR01MB4527.apcprd01.prod.exchangelabs.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Bjorn Helgaas |
Headers | show |
Series | PCI: vmd: Use raw spinlock for cfg_lock | expand |
[+cc Thomas in case he has msi_lock comment, Keith in case he has cfg_lock comment] On Wed, Jun 19, 2024 at 07:27:59PM +0800, Jiwei Sun wrote: > From: Jiwei Sun <sunjw10@lenovo.com> > > If the kernel is built with the following configurations and booting > CONFIG_VMD=y > CONFIG_DEBUG_LOCKDEP=y > CONFIG_DEBUG_SPINLOCK=y > CONFIG_PROVE_LOCKING=y > CONFIG_PROVE_RAW_LOCK_NESTING=y > > The following log appears, > > ============================= > [ BUG: Invalid wait context ] > 6.10.0-rc4 #80 Not tainted > ----------------------------- > kworker/18:2/633 is trying to lock: > ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0 > other info that might help us debug this: > context-{5:5} > 4 locks held by kworker/18:2/633: > #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920 > #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920 > #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800 > #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170 > stack backtrace: > CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75 > Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020 > Workqueue: events work_for_cpu_fn > Call Trace: > <TASK> > dump_stack_lvl+0x7c/0xc0 > __lock_acquire+0x9e5/0x1ed0 > lock_acquire+0x194/0x490 > _raw_spin_lock_irqsave+0x42/0x90 > vmd_pci_write+0x185/0x2a0 > pci_msi_update_mask+0x10c/0x170 > __pci_enable_msi_range+0x291/0x800 > pci_alloc_irq_vectors_affinity+0x13e/0x1d0 > pcie_portdrv_probe+0x570/0xe60 > local_pci_probe+0xdc/0x190 > work_for_cpu_fn+0x4e/0xa0 > process_one_work+0x86d/0x1920 > process_scheduled_works+0xd7/0x140 > worker_thread+0x3e9/0xb90 > kthread+0x2e9/0x3d0 > ret_from_fork+0x2d/0x60 > ret_from_fork_asm+0x1a/0x30 > </TASK> > > The root cause is that the dev->msi_lock is a raw spinlock, but > vmd->cfg_lock is a spinlock. Can you expand this a little bit? This isn't enough unless one already knows the difference between raw_spinlock_t and spinlock_t, which I didn't. Documentation/locking/locktypes.rst says they are the same except when CONFIG_PREEMPT_RT is set (might be worth mentioning with the config list above?), but that with CONFIG_PREEMPT_RT, spinlock_t is based on rt_mutex. And I guess there's a rule that you can't acquire rt_mutex while holding a raw_spinlock. The dev->msi_lock was added by 77e89afc25f3 ("PCI/MSI: Protect msi_desc::masked for multi-MSI") and only used in pci_msi_update_mask(): raw_spin_lock_irqsave(lock, flags); desc->pci.msi_mask &= ~clear; desc->pci.msi_mask |= set; pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos, desc->pci.msi_mask); raw_spin_unlock_irqrestore(lock, flags); The vmd->cfg_lock was added by 185a383ada2e ("x86/PCI: Add driver for Intel Volume Management Device (VMD)") and is only used around VMD config accesses, e.g., * CPU may deadlock if config space is not serialized on some versions of this * hardware, so all config space access is done under a spinlock. static int vmd_pci_read(...) { spin_lock_irqsave(&vmd->cfg_lock, flags); switch (len) { case 1: *value = readb(addr); break; case 2: *value = readw(addr); break; case 4: *value = readl(addr); break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&vmd->cfg_lock, flags); } IIUC those reads turn into single PCIe MMIO reads, so I wouldn't expect any concurrency issues there that need locking. But apparently there's something weird that can deadlock the CPU. > Signed-off-by: Jiwei Sun<sunjw10@lenovo.com> > Suggested-by: Adrian Huang <ahuang12@lenovo.com> > --- > drivers/pci/controller/vmd.c | 12 ++++++------ > 1 file changed, 6 insertions(+), 6 deletions(-) > > diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c > index 87b7856f375a..45d0ebf96adc 100644 > --- a/drivers/pci/controller/vmd.c > +++ b/drivers/pci/controller/vmd.c > @@ -125,7 +125,7 @@ struct vmd_irq_list { > struct vmd_dev { > struct pci_dev *dev; > > - spinlock_t cfg_lock; > + raw_spinlock_t cfg_lock; > void __iomem *cfgbar; > > int msix_count; > @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, > if (!addr) > return -EFAULT; > > - spin_lock_irqsave(&vmd->cfg_lock, flags); > + raw_spin_lock_irqsave(&vmd->cfg_lock, flags); > switch (len) { > case 1: > *value = readb(addr); > @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, > ret = -EINVAL; > break; > } > - spin_unlock_irqrestore(&vmd->cfg_lock, flags); > + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags); > return ret; > } > > @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, > if (!addr) > return -EFAULT; > > - spin_lock_irqsave(&vmd->cfg_lock, flags); > + raw_spin_lock_irqsave(&vmd->cfg_lock, flags); > switch (len) { > case 1: > writeb(value, addr); > @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, > ret = -EINVAL; > break; > } > - spin_unlock_irqrestore(&vmd->cfg_lock, flags); > + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags); > return ret; > } > > @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) > if (features & VMD_FEAT_OFFSET_FIRST_VECTOR) > vmd->first_vec = 1; > > - spin_lock_init(&vmd->cfg_lock); > + raw_spin_lock_init(&vmd->cfg_lock); > pci_set_drvdata(dev, vmd); > err = vmd_enable_domain(vmd, features); > if (err) > -- > 2.27.0 >
On 6/20/24 04:00, Bjorn Helgaas wrote: > [+cc Thomas in case he has msi_lock comment, Keith in case he has > cfg_lock comment] > > On Wed, Jun 19, 2024 at 07:27:59PM +0800, Jiwei Sun wrote: >> From: Jiwei Sun <sunjw10@lenovo.com> >> >> If the kernel is built with the following configurations and booting >> CONFIG_VMD=y >> CONFIG_DEBUG_LOCKDEP=y >> CONFIG_DEBUG_SPINLOCK=y >> CONFIG_PROVE_LOCKING=y >> CONFIG_PROVE_RAW_LOCK_NESTING=y >> >> The following log appears, >> >> ============================= >> [ BUG: Invalid wait context ] >> 6.10.0-rc4 #80 Not tainted >> ----------------------------- >> kworker/18:2/633 is trying to lock: >> ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0 >> other info that might help us debug this: >> context-{5:5} >> 4 locks held by kworker/18:2/633: >> #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920 >> #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920 >> #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800 >> #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170 >> stack backtrace: >> CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75 >> Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020 >> Workqueue: events work_for_cpu_fn >> Call Trace: >> <TASK> >> dump_stack_lvl+0x7c/0xc0 >> __lock_acquire+0x9e5/0x1ed0 >> lock_acquire+0x194/0x490 >> _raw_spin_lock_irqsave+0x42/0x90 >> vmd_pci_write+0x185/0x2a0 >> pci_msi_update_mask+0x10c/0x170 >> __pci_enable_msi_range+0x291/0x800 >> pci_alloc_irq_vectors_affinity+0x13e/0x1d0 >> pcie_portdrv_probe+0x570/0xe60 >> local_pci_probe+0xdc/0x190 >> work_for_cpu_fn+0x4e/0xa0 >> process_one_work+0x86d/0x1920 >> process_scheduled_works+0xd7/0x140 >> worker_thread+0x3e9/0xb90 >> kthread+0x2e9/0x3d0 >> ret_from_fork+0x2d/0x60 >> ret_from_fork_asm+0x1a/0x30 >> </TASK> >> >> The root cause is that the dev->msi_lock is a raw spinlock, but >> vmd->cfg_lock is a spinlock. > > Can you expand this a little bit? This isn't enough unless one > already knows the difference between raw_spinlock_t and spinlock_t, > which I didn't. > > Documentation/locking/locktypes.rst says they are the same except when > CONFIG_PREEMPT_RT is set (might be worth mentioning with the config > list above?), but that with CONFIG_PREEMPT_RT, spinlock_t is based on > rt_mutex. > > And I guess there's a rule that you can't acquire rt_mutex while > holding a raw_spinlock. Thanks for your review and comments. Sorry for not explaining this clearly. Yes, you are right, if CONFIG_PREEMPT_RT is not set, the spinlock_t is based on raw_spinlock, there is no any question in the above call trace. But as you mentioned, if CONFIG_PREEMPT_RT is set, the spinlock_t is based on rt_mutex, a task will be scheduled when waiting for rt_mutex. For example, there are two threads are trying to hold a rt_mutex lock, if A hold the lock firstly, and B will be scheduled in rtlock_slowlock_locked() waiting for A to release the lock. The raw_spinlock is a real spinning lock, which is not allowed the task of the raw_spinlock owner is scheduled in its critical region. In other words, we should not try to acquire rt_mutex lock in the critical region of the raw_spinlock when CONFIG_PREEMPT_RT is set. CONFIG_PROVE_LOCKING and CONFIG_PROVE_RAW_LOCK_NESTING options are used to detect the invalid lock nesting (the raw_spinlock vs. spinlock nesting checks) [1]. Here is the call path: pci_msi_update_mask ---> hold raw_spinlock dev->msi_lock pci_write_config_dword pci_bus_write_config_dword vmd_pci_write ---> hold spinlock_t vmd->cfg_lock The above call path is the invalid lock nesting becuase the vmd driver tries to acquire the vmd->cfg_lock spinlock within the raw_spinlock region (dev->msi_lock). That's why the message "BUG: Invalid wait contex" is shown. [1] https://lore.kernel.org/lkml/YBBA81osV7cHN2fb@hirez.programming.kicks-ass.net/ Thanks, Regards, Jiwei > > The dev->msi_lock was added by 77e89afc25f3 ("PCI/MSI: Protect > msi_desc::masked for multi-MSI") and only used in > pci_msi_update_mask(): > > raw_spin_lock_irqsave(lock, flags); > desc->pci.msi_mask &= ~clear; > desc->pci.msi_mask |= set; > pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos, > desc->pci.msi_mask); > raw_spin_unlock_irqrestore(lock, flags); > > The vmd->cfg_lock was added by 185a383ada2e ("x86/PCI: Add driver for > Intel Volume Management Device (VMD)") and is only used around VMD > config accesses, e.g., > > * CPU may deadlock if config space is not serialized on some versions of this > * hardware, so all config space access is done under a spinlock. > > static int vmd_pci_read(...) > { > spin_lock_irqsave(&vmd->cfg_lock, flags); > switch (len) { > case 1: > *value = readb(addr); > break; > case 2: > *value = readw(addr); > break; > case 4: > *value = readl(addr); > break; > default: > ret = -EINVAL; > break; > } > spin_unlock_irqrestore(&vmd->cfg_lock, flags); > } > > IIUC those reads turn into single PCIe MMIO reads, so I wouldn't > expect any concurrency issues there that need locking. > > But apparently there's something weird that can deadlock the CPU. > >> Signed-off-by: Jiwei Sun<sunjw10@lenovo.com> >> Suggested-by: Adrian Huang <ahuang12@lenovo.com> >> --- >> drivers/pci/controller/vmd.c | 12 ++++++------ >> 1 file changed, 6 insertions(+), 6 deletions(-) >> >> diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c >> index 87b7856f375a..45d0ebf96adc 100644 >> --- a/drivers/pci/controller/vmd.c >> +++ b/drivers/pci/controller/vmd.c >> @@ -125,7 +125,7 @@ struct vmd_irq_list { >> struct vmd_dev { >> struct pci_dev *dev; >> >> - spinlock_t cfg_lock; >> + raw_spinlock_t cfg_lock; >> void __iomem *cfgbar; >> >> int msix_count; >> @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, >> if (!addr) >> return -EFAULT; >> >> - spin_lock_irqsave(&vmd->cfg_lock, flags); >> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags); >> switch (len) { >> case 1: >> *value = readb(addr); >> @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, >> ret = -EINVAL; >> break; >> } >> - spin_unlock_irqrestore(&vmd->cfg_lock, flags); >> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags); >> return ret; >> } >> >> @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, >> if (!addr) >> return -EFAULT; >> >> - spin_lock_irqsave(&vmd->cfg_lock, flags); >> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags); >> switch (len) { >> case 1: >> writeb(value, addr); >> @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, >> ret = -EINVAL; >> break; >> } >> - spin_unlock_irqrestore(&vmd->cfg_lock, flags); >> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags); >> return ret; >> } >> >> @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) >> if (features & VMD_FEAT_OFFSET_FIRST_VECTOR) >> vmd->first_vec = 1; >> >> - spin_lock_init(&vmd->cfg_lock); >> + raw_spin_lock_init(&vmd->cfg_lock); >> pci_set_drvdata(dev, vmd); >> err = vmd_enable_domain(vmd, features); >> if (err) >> -- >> 2.27.0 >>
On 6/20/2024 1:57 AM, Jiwei Sun wrote: > > > On 6/20/24 04:00, Bjorn Helgaas wrote: >> [+cc Thomas in case he has msi_lock comment, Keith in case he has >> cfg_lock comment] >> >> On Wed, Jun 19, 2024 at 07:27:59PM +0800, Jiwei Sun wrote: >>> From: Jiwei Sun <sunjw10@lenovo.com> >>> >>> If the kernel is built with the following configurations and booting >>> CONFIG_VMD=y >>> CONFIG_DEBUG_LOCKDEP=y >>> CONFIG_DEBUG_SPINLOCK=y >>> CONFIG_PROVE_LOCKING=y >>> CONFIG_PROVE_RAW_LOCK_NESTING=y >>> >>> The following log appears, >>> >>> ============================= >>> [ BUG: Invalid wait context ] >>> 6.10.0-rc4 #80 Not tainted >>> ----------------------------- >>> kworker/18:2/633 is trying to lock: >>> ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0 >>> other info that might help us debug this: >>> context-{5:5} >>> 4 locks held by kworker/18:2/633: >>> #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920 >>> #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920 >>> #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800 >>> #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170 >>> stack backtrace: >>> CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75 >>> Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020 >>> Workqueue: events work_for_cpu_fn >>> Call Trace: >>> <TASK> >>> dump_stack_lvl+0x7c/0xc0 >>> __lock_acquire+0x9e5/0x1ed0 >>> lock_acquire+0x194/0x490 >>> _raw_spin_lock_irqsave+0x42/0x90 >>> vmd_pci_write+0x185/0x2a0 >>> pci_msi_update_mask+0x10c/0x170 >>> __pci_enable_msi_range+0x291/0x800 >>> pci_alloc_irq_vectors_affinity+0x13e/0x1d0 >>> pcie_portdrv_probe+0x570/0xe60 >>> local_pci_probe+0xdc/0x190 >>> work_for_cpu_fn+0x4e/0xa0 >>> process_one_work+0x86d/0x1920 >>> process_scheduled_works+0xd7/0x140 >>> worker_thread+0x3e9/0xb90 >>> kthread+0x2e9/0x3d0 >>> ret_from_fork+0x2d/0x60 >>> ret_from_fork_asm+0x1a/0x30 >>> </TASK> >>> >>> The root cause is that the dev->msi_lock is a raw spinlock, but >>> vmd->cfg_lock is a spinlock. >> >> Can you expand this a little bit? This isn't enough unless one >> already knows the difference between raw_spinlock_t and spinlock_t, >> which I didn't. >> >> Documentation/locking/locktypes.rst says they are the same except when >> CONFIG_PREEMPT_RT is set (might be worth mentioning with the config >> list above?), but that with CONFIG_PREEMPT_RT, spinlock_t is based on >> rt_mutex. >> >> And I guess there's a rule that you can't acquire rt_mutex while >> holding a raw_spinlock. > > Thanks for your review and comments. Sorry for not explaining this clearly. > Yes, you are right, if CONFIG_PREEMPT_RT is not set, the spinlock_t is > based on raw_spinlock, there is no any question in the above call trace. > > But as you mentioned, if CONFIG_PREEMPT_RT is set, the spinlock_t is based > on rt_mutex, a task will be scheduled when waiting for rt_mutex. For example, > there are two threads are trying to hold a rt_mutex lock, if A hold the > lock firstly, and B will be scheduled in rtlock_slowlock_locked() waiting > for A to release the lock. The raw_spinlock is a real spinning lock, which > is not allowed the task of the raw_spinlock owner is scheduled in its > critical region. In other words, we should not try to acquire rt_mutex lock > in the critical region of the raw_spinlock when CONFIG_PREEMPT_RT is set. > > CONFIG_PROVE_LOCKING and CONFIG_PROVE_RAW_LOCK_NESTING options are > used to detect the invalid lock nesting (the raw_spinlock vs. spinlock > nesting checks) [1]. Here is the call path: > > pci_msi_update_mask ---> hold raw_spinlock dev->msi_lock > pci_write_config_dword > pci_bus_write_config_dword > vmd_pci_write ---> hold spinlock_t vmd->cfg_lock > > The above call path is the invalid lock nesting becuase the vmd driver > tries to acquire the vmd->cfg_lock spinlock within the raw_spinlock > region (dev->msi_lock). That's why the message "BUG: Invalid wait contex" > is shown. > It looks like this only happens when CONFIG_PREEMPT_RT is set so I would mention that in the commit message (as Bjorn mentioned). I also think thsi level of detail is helpful and should be in the commit message as well since it's not obvious to the casual observer :) Paul > [1] https://lore.kernel.org/lkml/YBBA81osV7cHN2fb@hirez.programming.kicks-ass.net/ > > Thanks, > Regards, > Jiwei > >> >> The dev->msi_lock was added by 77e89afc25f3 ("PCI/MSI: Protect >> msi_desc::masked for multi-MSI") and only used in >> pci_msi_update_mask(): >> >> raw_spin_lock_irqsave(lock, flags); >> desc->pci.msi_mask &= ~clear; >> desc->pci.msi_mask |= set; >> pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos, >> desc->pci.msi_mask); >> raw_spin_unlock_irqrestore(lock, flags); >> >> The vmd->cfg_lock was added by 185a383ada2e ("x86/PCI: Add driver for >> Intel Volume Management Device (VMD)") and is only used around VMD >> config accesses, e.g., >> >> * CPU may deadlock if config space is not serialized on some versions of this >> * hardware, so all config space access is done under a spinlock. >> >> static int vmd_pci_read(...) >> { >> spin_lock_irqsave(&vmd->cfg_lock, flags); >> switch (len) { >> case 1: >> *value = readb(addr); >> break; >> case 2: >> *value = readw(addr); >> break; >> case 4: >> *value = readl(addr); >> break; >> default: >> ret = -EINVAL; >> break; >> } >> spin_unlock_irqrestore(&vmd->cfg_lock, flags); >> } >> >> IIUC those reads turn into single PCIe MMIO reads, so I wouldn't >> expect any concurrency issues there that need locking. >> >> But apparently there's something weird that can deadlock the CPU. >> >>> Signed-off-by: Jiwei Sun<sunjw10@lenovo.com> >>> Suggested-by: Adrian Huang <ahuang12@lenovo.com> >>> --- >>> drivers/pci/controller/vmd.c | 12 ++++++------ >>> 1 file changed, 6 insertions(+), 6 deletions(-) >>> >>> diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c >>> index 87b7856f375a..45d0ebf96adc 100644 >>> --- a/drivers/pci/controller/vmd.c >>> +++ b/drivers/pci/controller/vmd.c >>> @@ -125,7 +125,7 @@ struct vmd_irq_list { >>> struct vmd_dev { >>> struct pci_dev *dev; >>> >>> - spinlock_t cfg_lock; >>> + raw_spinlock_t cfg_lock; >>> void __iomem *cfgbar; >>> >>> int msix_count; >>> @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, >>> if (!addr) >>> return -EFAULT; >>> >>> - spin_lock_irqsave(&vmd->cfg_lock, flags); >>> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags); >>> switch (len) { >>> case 1: >>> *value = readb(addr); >>> @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, >>> ret = -EINVAL; >>> break; >>> } >>> - spin_unlock_irqrestore(&vmd->cfg_lock, flags); >>> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags); >>> return ret; >>> } >>> >>> @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, >>> if (!addr) >>> return -EFAULT; >>> >>> - spin_lock_irqsave(&vmd->cfg_lock, flags); >>> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags); >>> switch (len) { >>> case 1: >>> writeb(value, addr); >>> @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, >>> ret = -EINVAL; >>> break; >>> } >>> - spin_unlock_irqrestore(&vmd->cfg_lock, flags); >>> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags); >>> return ret; >>> } >>> >>> @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) >>> if (features & VMD_FEAT_OFFSET_FIRST_VECTOR) >>> vmd->first_vec = 1; >>> >>> - spin_lock_init(&vmd->cfg_lock); >>> + raw_spin_lock_init(&vmd->cfg_lock); >>> pci_set_drvdata(dev, vmd); >>> err = vmd_enable_domain(vmd, features); >>> if (err) >>> -- >>> 2.27.0 >>> >
On 6/20/24 23:10, Paul M Stillwell Jr wrote: > On 6/20/2024 1:57 AM, Jiwei Sun wrote: >> >> >> On 6/20/24 04:00, Bjorn Helgaas wrote: >>> [+cc Thomas in case he has msi_lock comment, Keith in case he has >>> cfg_lock comment] >>> >>> On Wed, Jun 19, 2024 at 07:27:59PM +0800, Jiwei Sun wrote: >>>> From: Jiwei Sun <sunjw10@lenovo.com> >>>> >>>> If the kernel is built with the following configurations and booting >>>> CONFIG_VMD=y >>>> CONFIG_DEBUG_LOCKDEP=y >>>> CONFIG_DEBUG_SPINLOCK=y >>>> CONFIG_PROVE_LOCKING=y >>>> CONFIG_PROVE_RAW_LOCK_NESTING=y >>>> >>>> The following log appears, >>>> >>>> ============================= >>>> [ BUG: Invalid wait context ] >>>> 6.10.0-rc4 #80 Not tainted >>>> ----------------------------- >>>> kworker/18:2/633 is trying to lock: >>>> ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0 >>>> other info that might help us debug this: >>>> context-{5:5} >>>> 4 locks held by kworker/18:2/633: >>>> #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920 >>>> #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920 >>>> #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800 >>>> #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170 >>>> stack backtrace: >>>> CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75 >>>> Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020 >>>> Workqueue: events work_for_cpu_fn >>>> Call Trace: >>>> <TASK> >>>> dump_stack_lvl+0x7c/0xc0 >>>> __lock_acquire+0x9e5/0x1ed0 >>>> lock_acquire+0x194/0x490 >>>> _raw_spin_lock_irqsave+0x42/0x90 >>>> vmd_pci_write+0x185/0x2a0 >>>> pci_msi_update_mask+0x10c/0x170 >>>> __pci_enable_msi_range+0x291/0x800 >>>> pci_alloc_irq_vectors_affinity+0x13e/0x1d0 >>>> pcie_portdrv_probe+0x570/0xe60 >>>> local_pci_probe+0xdc/0x190 >>>> work_for_cpu_fn+0x4e/0xa0 >>>> process_one_work+0x86d/0x1920 >>>> process_scheduled_works+0xd7/0x140 >>>> worker_thread+0x3e9/0xb90 >>>> kthread+0x2e9/0x3d0 >>>> ret_from_fork+0x2d/0x60 >>>> ret_from_fork_asm+0x1a/0x30 >>>> </TASK> >>>> >>>> The root cause is that the dev->msi_lock is a raw spinlock, but >>>> vmd->cfg_lock is a spinlock. >>> >>> Can you expand this a little bit? This isn't enough unless one >>> already knows the difference between raw_spinlock_t and spinlock_t, >>> which I didn't. >>> >>> Documentation/locking/locktypes.rst says they are the same except when >>> CONFIG_PREEMPT_RT is set (might be worth mentioning with the config >>> list above?), but that with CONFIG_PREEMPT_RT, spinlock_t is based on >>> rt_mutex. >>> >>> And I guess there's a rule that you can't acquire rt_mutex while >>> holding a raw_spinlock. >> >> Thanks for your review and comments. Sorry for not explaining this clearly. >> Yes, you are right, if CONFIG_PREEMPT_RT is not set, the spinlock_t is >> based on raw_spinlock, there is no any question in the above call trace. >> >> But as you mentioned, if CONFIG_PREEMPT_RT is set, the spinlock_t is based >> on rt_mutex, a task will be scheduled when waiting for rt_mutex. For example, >> there are two threads are trying to hold a rt_mutex lock, if A hold the >> lock firstly, and B will be scheduled in rtlock_slowlock_locked() waiting >> for A to release the lock. The raw_spinlock is a real spinning lock, which >> is not allowed the task of the raw_spinlock owner is scheduled in its >> critical region. In other words, we should not try to acquire rt_mutex lock >> in the critical region of the raw_spinlock when CONFIG_PREEMPT_RT is set. >> >> CONFIG_PROVE_LOCKING and CONFIG_PROVE_RAW_LOCK_NESTING options are >> used to detect the invalid lock nesting (the raw_spinlock vs. spinlock >> nesting checks) [1]. Here is the call path: >> >> pci_msi_update_mask ---> hold raw_spinlock dev->msi_lock >> pci_write_config_dword >> pci_bus_write_config_dword >> vmd_pci_write ---> hold spinlock_t vmd->cfg_lock >> >> The above call path is the invalid lock nesting becuase the vmd driver >> tries to acquire the vmd->cfg_lock spinlock within the raw_spinlock >> region (dev->msi_lock). That's why the message "BUG: Invalid wait contex" >> is shown. >> > > It looks like this only happens when CONFIG_PREEMPT_RT is set so I would mention that in the commit message (as Bjorn mentioned). I also think thsi level of detail is helpful and should be in the commit message as well since it's not obvious to the casual observer :) Thanks for your suggestions and comments, I totally agree with you. I will add those key information into V2 patch commit message. Thanks, Regards, Jiwei > > Paul > >> [1] https://lore.kernel.org/lkml/YBBA81osV7cHN2fb@hirez.programming.kicks-ass.net/ >> >> Thanks, >> Regards, >> Jiwei >> >>> >>> The dev->msi_lock was added by 77e89afc25f3 ("PCI/MSI: Protect >>> msi_desc::masked for multi-MSI") and only used in >>> pci_msi_update_mask(): >>> >>> raw_spin_lock_irqsave(lock, flags); >>> desc->pci.msi_mask &= ~clear; >>> desc->pci.msi_mask |= set; >>> pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos, >>> desc->pci.msi_mask); >>> raw_spin_unlock_irqrestore(lock, flags); >>> >>> The vmd->cfg_lock was added by 185a383ada2e ("x86/PCI: Add driver for >>> Intel Volume Management Device (VMD)") and is only used around VMD >>> config accesses, e.g., >>> >>> * CPU may deadlock if config space is not serialized on some versions of this >>> * hardware, so all config space access is done under a spinlock. >>> >>> static int vmd_pci_read(...) >>> { >>> spin_lock_irqsave(&vmd->cfg_lock, flags); >>> switch (len) { >>> case 1: >>> *value = readb(addr); >>> break; >>> case 2: >>> *value = readw(addr); >>> break; >>> case 4: >>> *value = readl(addr); >>> break; >>> default: >>> ret = -EINVAL; >>> break; >>> } >>> spin_unlock_irqrestore(&vmd->cfg_lock, flags); >>> } >>> >>> IIUC those reads turn into single PCIe MMIO reads, so I wouldn't >>> expect any concurrency issues there that need locking. >>> >>> But apparently there's something weird that can deadlock the CPU. >>> >>>> Signed-off-by: Jiwei Sun<sunjw10@lenovo.com> >>>> Suggested-by: Adrian Huang <ahuang12@lenovo.com> >>>> --- >>>> drivers/pci/controller/vmd.c | 12 ++++++------ >>>> 1 file changed, 6 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c >>>> index 87b7856f375a..45d0ebf96adc 100644 >>>> --- a/drivers/pci/controller/vmd.c >>>> +++ b/drivers/pci/controller/vmd.c >>>> @@ -125,7 +125,7 @@ struct vmd_irq_list { >>>> struct vmd_dev { >>>> struct pci_dev *dev; >>>> - spinlock_t cfg_lock; >>>> + raw_spinlock_t cfg_lock; >>>> void __iomem *cfgbar; >>>> int msix_count; >>>> @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, >>>> if (!addr) >>>> return -EFAULT; >>>> - spin_lock_irqsave(&vmd->cfg_lock, flags); >>>> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags); >>>> switch (len) { >>>> case 1: >>>> *value = readb(addr); >>>> @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, >>>> ret = -EINVAL; >>>> break; >>>> } >>>> - spin_unlock_irqrestore(&vmd->cfg_lock, flags); >>>> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags); >>>> return ret; >>>> } >>>> @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, >>>> if (!addr) >>>> return -EFAULT; >>>> - spin_lock_irqsave(&vmd->cfg_lock, flags); >>>> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags); >>>> switch (len) { >>>> case 1: >>>> writeb(value, addr); >>>> @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, >>>> ret = -EINVAL; >>>> break; >>>> } >>>> - spin_unlock_irqrestore(&vmd->cfg_lock, flags); >>>> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags); >>>> return ret; >>>> } >>>> @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) >>>> if (features & VMD_FEAT_OFFSET_FIRST_VECTOR) >>>> vmd->first_vec = 1; >>>> - spin_lock_init(&vmd->cfg_lock); >>>> + raw_spin_lock_init(&vmd->cfg_lock); >>>> pci_set_drvdata(dev, vmd); >>>> err = vmd_enable_domain(vmd, features); >>>> if (err) >>>> -- >>>> 2.27.0 >>>> >> >
diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 87b7856f375a..45d0ebf96adc 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -125,7 +125,7 @@ struct vmd_irq_list { struct vmd_dev { struct pci_dev *dev; - spinlock_t cfg_lock; + raw_spinlock_t cfg_lock; void __iomem *cfgbar; int msix_count; @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, if (!addr) return -EFAULT; - spin_lock_irqsave(&vmd->cfg_lock, flags); + raw_spin_lock_irqsave(&vmd->cfg_lock, flags); switch (len) { case 1: *value = readb(addr); @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, ret = -EINVAL; break; } - spin_unlock_irqrestore(&vmd->cfg_lock, flags); + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags); return ret; } @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, if (!addr) return -EFAULT; - spin_lock_irqsave(&vmd->cfg_lock, flags); + raw_spin_lock_irqsave(&vmd->cfg_lock, flags); switch (len) { case 1: writeb(value, addr); @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, ret = -EINVAL; break; } - spin_unlock_irqrestore(&vmd->cfg_lock, flags); + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags); return ret; } @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) if (features & VMD_FEAT_OFFSET_FIRST_VECTOR) vmd->first_vec = 1; - spin_lock_init(&vmd->cfg_lock); + raw_spin_lock_init(&vmd->cfg_lock); pci_set_drvdata(dev, vmd); err = vmd_enable_domain(vmd, features); if (err)