Message ID | 20240827192826.710031-3-kbusch@meta.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | pci cleanup/prep patches | expand |
On Tue, 27 Aug 2024, Keith Busch wrote: >+static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev) >+{ >+ return test_and_set_bit(PCI_DEV_REMOVED, &dev->priv_flags); >+} Same ordering/dependency description observations as mentioned in patch 1 (both these cases are fully ordered). >+ > #ifdef CONFIG_PCIEAER > #include <linux/aer.h> > >diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c >index ec3064a115bf8..8284ab20949c9 100644 >--- a/drivers/pci/remove.c >+++ b/drivers/pci/remove.c >@@ -29,7 +29,7 @@ static void pci_stop_dev(struct pci_dev *dev) > > static void pci_destroy_dev(struct pci_dev *dev) > { >- if (!dev->dev.kobj.parent) >+ if (pci_dev_test_and_set_removed(dev)) Doesn't this want to be if (!pci_dev_test_and_set_removed()) ? This also fixes a splat when triggering a removal when you add subordinate refcounting is added: https://git.kernel.org/pub/scm/linux/kernel/git/kbusch/linux.git/commit/?h=pci-bus-locking-2024-09-09&id=3883c485d5e45b5e17f685f77ff4020bec162336 fyi: [ 22.739614] BUG: kernel NULL pointer dereference, address: 0000000000000028 [ 22.739910] #PF: supervisor read access in kernel mode [ 22.740132] #PF: error_code(0x0000) - not-present page [ 22.740351] PGD 0 P4D 0 [ 22.740468] Oops: Oops: 0000 [#1] PREEMPT SMP KASAN PTI [ 22.740695] CPU: 0 UID: 0 PID: 266 Comm: bash Tainted: G B 6.11.0-rc1-g3883c485d5e4-dirty #13 [ 22.741111] Tainted: [B]=BAD_PAGE [ 22.741258] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 22.741727] RIP: 0010:pcie_aspm_check_latency.isra.0+0x192/0x4d0 [ 22.741990] Code: 18 e8 e2 6f 6f ff 48 89 df e8 aa ad 92 ff 4c 8b 2b 49 8d 7d 18 e8 9e ad 92 ff 49 8b 6d 18 4c 8d 65 28 4c f [ 22.743438] RSP: 0018:ffff88800554f970 EFLAGS: 00010282 [ 22.743673] RAX: 0000000000000001 RBX: ffff888001cc0c80 RCX: 0000000000000001 [ 22.743976] RDX: ffff88804c63ce00 RSI: 0000000000000000 RDI: 0000000000000007 [ 22.744285] RBP: 0000000000000000 R08: 0000000000000001 R09: fffffbfff7b7275c [ 22.744596] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000028 [ 22.744906] R13: ffff888001f62000 R14: 0000000000000040 R15: 00000000000003e8 [ 22.745216] FS: 00007f14a687f740(0000) GS:ffff88806d200000(0000) knlGS:0000000000000000 [ 22.745565] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 22.745823] CR2: 0000000000000028 CR3: 000000004dc06000 CR4: 00000000000006f0 [ 22.746132] Call Trace: [ 22.746246] <TASK> [ 22.746346] ? show_regs+0x8c/0xa0 [ 22.746507] ? __die+0x2c/0x80 [ 22.746652] ? page_fault_oops+0x31a/0x830 [ 22.746839] ? __pfx_page_fault_oops+0x10/0x10 [ 22.747042] ? is_prefetch.constprop.0+0x9b/0x450 [ 22.747253] ? pcie_aspm_check_latency.isra.0+0x192/0x4d0 [ 22.747495] ? __pfx_is_prefetch.constprop.0+0x10/0x10 [ 22.747725] ? pcie_aspm_check_latency.isra.0+0x192/0x4d0 [ 22.747966] ? search_module_extables+0x93/0xc0 [ 22.748173] ? fixup_exception+0xd7/0x560 [ 22.748358] ? kernelmode_fixup_or_oops.constprop.0+0x9c/0xc0 [ 22.748613] ? __bad_area_nosemaphore+0x2f8/0x420 [ 22.748826] ? lock_mm_and_find_vma+0x90/0x4f0 [ 22.749028] ? do_user_addr_fault+0x58a/0xc80 [ 22.749226] ? rcu_is_watching+0x20/0x50 [ 22.749407] ? exc_page_fault+0x5c/0xd0 [ 22.749586] ? asm_exc_page_fault+0x26/0x30 [ 22.749776] ? pcie_aspm_check_latency.isra.0+0x192/0x4d0 [ 22.750020] ? __pfx_pcie_aspm_check_latency.isra.0+0x10/0x10 [ 22.750278] ? mark_held_locks+0x65/0x90 [ 22.750457] ? kobject_get+0x95/0x110 [ 22.750629] pcie_update_aspm_capable+0x128/0x1c0 [ 22.750843] pcie_aspm_exit_link_state+0x137/0x1e0 [ 22.751059] pci_remove_bus_device+0x15b/0x200 [ 22.751260] pci_remove_bus+0x4a/0x130 [ 22.751432] pci_remove_bus_device+0x88/0x200 [ 22.751631] pci_remove_bus+0x4a/0x130 [ 22.751802] pci_remove_bus_device+0x88/0x200 [ 22.752000] pci_remove_bus+0x4a/0x130 [ 22.752172] pci_remove_bus_device+0x88/0x200 [ 22.752370] pci_stop_and_remove_bus_device_locked+0x22/0x30 [ 22.752622] remove_store+0x125/0x140 [ 22.752791] ? __pfx_remove_store+0x10/0x10 [ 22.752981] ? __pfx___mutex_lock+0x10/0x10 [ 22.753170] ? __pfx__copy_from_iter+0x10/0x10 [ 22.753372] ? __pfx_remove_store+0x10/0x10 [ 22.753562] dev_attr_store+0x46/0x70 [ 22.753752] ? __pfx_dev_attr_store+0x10/0x10 [ 22.753965] sysfs_kf_write+0xa0/0xc0 [ 22.754142] kernfs_fop_write_iter+0x23d/0x300 [ 22.754346] ? __pfx_sysfs_kf_write+0x10/0x10 [ 22.754549] vfs_write+0x508/0xa90 [ 22.754709] ? __pfx_kernfs_fop_write_iter+0x10/0x10 [ 22.754935] ? __pfx_vfs_write+0x10/0x10 [ 22.755118] ? __fget_light+0xcd/0x120 [ 22.755295] ksys_write+0x108/0x200 [ 22.755458] ? __pfx_ksys_write+0x10/0x10 [ 22.755644] ? mark_held_locks+0x24/0x90 [ 22.755826] do_syscall_64+0xc1/0x1d0 [ 22.755998] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 22.756229] RIP: 0033:0x7f14a697a240 [ 22.756394] Code: 40 00 48 8b 15 c1 9b 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 80 3d a1 23 0e 00 00 74 17 9 [ 22.757184] RSP: 002b:00007ffdc41b64e8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 [ 22.757508] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f14a697a240 [ 22.757813] RDX: 0000000000000002 RSI: 0000558b25aeda50 RDI: 0000000000000001 [ 22.758117] RBP: 0000558b25aeda50 R08: 00007f14a6a54d90 R09: 00007f14a6a54d90 [ 22.758422] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000002 [ 22.758726] R13: 00007f14a6a55760 R14: 0000000000000002 R15: 00007f14a6a509e0 [ 22.759039] </TASK> > return; > > device_del(&dev->dev); >-- >2.43.5 >
On Wed, Oct 02, 2024 at 07:34:13PM -0700, Davidlohr Bueso wrote: > On Tue, 27 Aug 2024, Keith Busch wrote: > > static void pci_destroy_dev(struct pci_dev *dev) > > { > > - if (!dev->dev.kobj.parent) > > + if (pci_dev_test_and_set_removed(dev)) > > Doesn't this want to be if (!pci_dev_test_and_set_removed()) ? No, this function returns the previous value of the REMOVED flag. If it's already set, then another thread already removed this device. > This also fixes a splat when triggering a removal when you add > subordinate refcounting is added: > > https://git.kernel.org/pub/scm/linux/kernel/git/kbusch/linux.git/commit/?h=pci-bus-locking-2024-09-09&id=3883c485d5e45b5e17f685f77ff4020bec162336 Oh, that's pretty neat. Thanks for testing that! I think that reference counting patch is pretty safe too. I can rebase the series with that one included next time.
On Thu, 03 Oct 2024, Keith Busch wrote: >Oh, that's pretty neat. Thanks for testing that! I think that reference >counting patch is pretty safe too. I can rebase the series with that one >included next time. Well the thing is it is crashing on me, as reported. I was able to reproduce the deadlock with rescan_remove_lock on a switched CXL topology (via sysfs remove/rescan parent/child). This is why I tested your full pci-bus-locking-2024-09-09 branch hoping the last patch would fix it, but still cannot confirm because of that nil ptr deref.
On Thu, Oct 03, 2024 at 10:04:16AM -0700, Davidlohr Bueso wrote: > Well the thing is it is crashing on me, as reported. I was able to reproduce > the deadlock with rescan_remove_lock on a switched CXL topology (via sysfs > remove/rescan parent/child). This is why I tested your full pci-bus-locking-2024-09-09 > branch hoping the last patch would fix it, but still cannot confirm because > of that nil ptr deref. You might need something different than anything in that series if you are doing concurrent sysfs access during removal. This is what I wrote up for that issue: https://lore.kernel.org/linux-pci/20240719185513.3376321-1-kbusch@meta.com/T/#u The last reply from GregKH was against this proposal, but I briefly spoke with him a couple weeks ago and I think he may accept this if I resubmit.
On Thu, 03 Oct 2024, Keith Busch wrote: >On Thu, Oct 03, 2024 at 10:04:16AM -0700, Davidlohr Bueso wrote: >> Well the thing is it is crashing on me, as reported. I was able to reproduce >> the deadlock with rescan_remove_lock on a switched CXL topology (via sysfs >> remove/rescan parent/child). This is why I tested your full pci-bus-locking-2024-09-09 >> branch hoping the last patch would fix it, but still cannot confirm because >> of that nil ptr deref. > >You might need something different than anything in that series if you >are doing concurrent sysfs access during removal. This is what I wrote >up for that issue: The crash is *only* happening in your branch. The test runs into a deadlock on a vanilla kernel. > > https://lore.kernel.org/linux-pci/20240719185513.3376321-1-kbusch@meta.com/T/#u This does not address either, fyi.
On Wed, Oct 02, 2024 at 07:34:13PM -0700, Davidlohr Bueso wrote: > On Tue, 27 Aug 2024, Keith Busch wrote: > > > +static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev) > > +{ > > + return test_and_set_bit(PCI_DEV_REMOVED, &dev->priv_flags); > > +} > > Same ordering/dependency description observations as mentioned in > patch 1 (both these cases are fully ordered). Just rebasing everything so late reply here. test_and_set_bit already has a memory barrier. It's the "set_bit" that doesn't, but set_bit is not used for this new flag. This new flag only indicates the device is being removed, so it's only set once before the device is deleted. It's never accessed outside this path, and it's safe compared to looking at the kobj parent.
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 802f7eac53115..a11a53f67a0ce 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -443,6 +443,7 @@ static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused) #define PCI_DEV_ADDED 0 #define PCI_DPC_RECOVERED 1 #define PCI_DPC_RECOVERING 2 +#define PCI_DEV_REMOVED 3 static inline void pci_dev_assign_added(struct pci_dev *dev) { @@ -459,6 +460,11 @@ static inline bool pci_dev_is_added(const struct pci_dev *dev) return test_bit(PCI_DEV_ADDED, &dev->priv_flags); } +static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev) +{ + return test_and_set_bit(PCI_DEV_REMOVED, &dev->priv_flags); +} + #ifdef CONFIG_PCIEAER #include <linux/aer.h> diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index ec3064a115bf8..8284ab20949c9 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -29,7 +29,7 @@ static void pci_stop_dev(struct pci_dev *dev) static void pci_destroy_dev(struct pci_dev *dev) { - if (!dev->dev.kobj.parent) + if (pci_dev_test_and_set_removed(dev)) return; device_del(&dev->dev);