
[-rc] workqueue: Reimplement UAF fix to avoid lockdep worning

Message ID 4c4f1fb769a609a61010cb6d884ab2841ef716d3.1716885172.git.leon@kernel.org (mailing list archive)
State Not Applicable
Series [-rc] workqueue: Reimplement UAF fix to avoid lockdep worning

Commit Message

Leon Romanovsky May 28, 2024, 8:39 a.m. UTC
From: Leon Romanovsky <leonro@nvidia.com>

The commit 643445531829 ("workqueue: Fix UAF report by KASAN in
pwq_release_workfn()") causes to the following lockdep warning.

 [ 1818.839405] ==================================================================
 [ 1818.840636] BUG: KASAN: slab-use-after-free in lockdep_register_key+0x707/0x810
 [ 1818.841827] Read of size 8 at addr ffff888156864928 by task systemd-udevd/71399
 [ 1818.843007]
 [ 1818.843378] CPU: 8 PID: 71399 Comm: systemd-udevd Not tainted 6.10.0-rc1_external_1613e604df0c #1
 [ 1818.844806] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
 [ 1818.846493] Call Trace:
 [ 1818.846981]  <TASK>
 [ 1818.847439]  dump_stack_lvl+0x7e/0xc0
 [ 1818.848089]  print_report+0xc1/0x600
 [ 1818.848728]  ? __virt_addr_valid+0x1cf/0x390
 [ 1818.849458]  ? lockdep_register_key+0x707/0x810
 [ 1818.850221]  ? lockdep_register_key+0x707/0x810
 [ 1818.850978]  kasan_report+0xb9/0xf0
 [ 1818.851604]  ? lockdep_register_key+0x707/0x810
 [ 1818.852381]  lockdep_register_key+0x707/0x810
 [ 1818.853121]  ? add_lock_to_list+0x5b0/0x5b0
 [ 1818.853842]  ? lockdep_init_map_type+0x2c3/0x7a0
 [ 1818.854594]  ? __raw_spin_lock_init+0x3b/0x110
 [ 1818.855329]  alloc_workqueue+0x466/0x1800
 [ 1818.856005]  ? pointer+0x8a0/0x8a0
 [ 1818.856615]  ? workqueue_sysfs_register+0x370/0x370
 [ 1818.857399]  ? vsprintf+0x10/0x10
 [ 1818.857997]  ib_mad_init_device+0x809/0x1760 [ib_core]
 [ 1818.858893]  ? ib_mad_post_receive_mads+0xf20/0xf20 [ib_core]
 [ 1818.859809]  ? rwsem_mark_wake+0x990/0x990
 [ 1818.860505]  ? do_raw_spin_unlock+0x54/0x220
 [ 1818.861224]  add_client_context+0x2e9/0x430 [ib_core]
 [ 1818.862058]  ? ib_unregister_driver+0x1a0/0x1a0 [ib_core]
 [ 1818.862933]  enable_device_and_get+0x1a7/0x340 [ib_core]
 [ 1818.863788]  ? add_client_context+0x430/0x430 [ib_core]
 [ 1818.864643]  ? rdma_counter_init+0x139/0x390 [ib_core]
 [ 1818.865487]  ib_register_device+0x6d0/0xa60 [ib_core]
 [ 1818.866320]  ? alloc_port_data.part.0+0x390/0x390 [ib_core]
 [ 1818.867212]  ? mlx5_lag_is_active+0x7c/0xb0 [mlx5_core]
 [ 1818.868204]  __mlx5_ib_add+0x88/0x1e0 [mlx5_ib]
 [ 1818.868989]  mlx5r_probe+0x1d0/0x3c0 [mlx5_ib]
 [ 1818.869744]  ? __mlx5_ib_add+0x1e0/0x1e0 [mlx5_ib]
 [ 1818.870542]  auxiliary_bus_probe+0xa1/0x110
 [ 1818.871234]  really_probe+0x1e4/0x920
 [ 1818.871862]  __driver_probe_device+0x261/0x3e0
 [ 1818.872609]  driver_probe_device+0x49/0x130
 [ 1818.873311]  __driver_attach+0x215/0x4c0
 [ 1818.873969]  ? __device_attach_driver+0x290/0x290
 [ 1818.874726]  bus_for_each_dev+0xf0/0x170
 [ 1818.875392]  ? lockdep_init_map_type+0x2c3/0x7a0
 [ 1818.876151]  ? bus_remove_file+0x40/0x40
 [ 1818.876820]  bus_add_driver+0x21d/0x5d0
 [ 1818.877472]  driver_register+0x133/0x460
 [ 1818.878138]  __auxiliary_driver_register+0x14e/0x230
 [ 1818.878930]  ? 0xffffffffa07f8000
 [ 1818.879516]  ? 0xffffffffa07f8000
 [ 1818.880110]  mlx5_ib_init+0x111/0x190 [mlx5_ib]
 [ 1818.880879]  do_one_initcall+0xc4/0x3c0
 [ 1818.881532]  ? trace_event_raw_event_initcall_level+0x1f0/0x1f0
 [ 1818.882447]  ? __create_object+0x5e/0x80
 [ 1818.883115]  ? kmalloc_trace_noprof+0x1f9/0x410
 [ 1818.883860]  ? kasan_unpoison+0x23/0x50
 [ 1818.884520]  do_init_module+0x22d/0x740
 [ 1818.885184]  load_module+0x5077/0x6690
 [ 1818.885839]  ? module_frob_arch_sections+0x20/0x20
 [ 1818.886618]  ? kernel_read_file+0x560/0x6c0
 [ 1818.887311]  ? kernel_read_file+0x246/0x6c0
 [ 1818.888005]  ? __x64_sys_fsopen+0x190/0x190
 [ 1818.888708]  ? expand_stack+0x440/0x440
 [ 1818.889367]  ? init_module_from_file+0xd2/0x130
 [ 1818.890102]  init_module_from_file+0xd2/0x130
 [ 1818.890824]  ? __x64_sys_init_module+0xb0/0xb0
 [ 1818.891558]  ? do_raw_spin_unlock+0x54/0x220
 [ 1818.892291]  idempotent_init_module+0x326/0x5a0
 [ 1818.893030]  ? init_module_from_file+0x130/0x130
 [ 1818.893783]  ? security_capable+0x51/0x90
 [ 1818.894461]  __x64_sys_finit_module+0xc1/0x130
 [ 1818.895192]  do_syscall_64+0x6d/0x140
 [ 1818.895818]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
 [ 1818.896631] RIP: 0033:0x7f29a2b0af3d
 [ 1818.897254] Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa
48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f
05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d bb ee 0e 00 f7 d8 64 89 01 48
 [ 1818.899904] RSP: 002b:00007ffc9cd4cda8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
 [ 1818.901092] RAX: ffffffffffffffda RBX: 00005643c8995670 RCX: 00007f29a2b0af3d
 [ 1818.902157] RDX: 0000000000000000 RSI: 00007f29a31ef32c RDI: 0000000000000011
 [ 1818.903224] RBP: 0000000000020000 R08: 0000000000000000 R09: 0000000000000002
 [ 1818.904292] R10: 0000000000000011 R11: 0000000000000246 R12: 00007f29a31ef32c
 [ 1818.905365] R13: 00005643c89efca0 R14: 0000000000000007 R15: 00005643c8993530
 [ 1818.906431]  </TASK>
 [ 1818.906877]
 [ 1818.907242] Allocated by task 1:
 [ 1818.907819]  kasan_save_stack+0x20/0x40
 [ 1818.908512]  kasan_save_track+0x10/0x30
 [ 1818.909173]  __kasan_slab_alloc+0x51/0x60
 [ 1818.909849]  kmem_cache_alloc_noprof+0x139/0x3f0
 [ 1818.910608]  getname_flags+0x4f/0x3c0
 [ 1818.911236]  do_sys_openat2+0xd3/0x150
 [ 1818.911878]  __x64_sys_openat+0x11f/0x1d0
 [ 1818.912554]  do_syscall_64+0x6d/0x140
 [ 1818.913189]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
 [ 1818.913996]
 [ 1818.914359] Freed by task 1:
 [ 1818.914897]  kasan_save_stack+0x20/0x40
 [ 1818.915553]  kasan_save_track+0x10/0x30
 [ 1818.916210]  kasan_save_free_info+0x37/0x50
 [ 1818.916911]  poison_slab_object+0x10c/0x190
 [ 1818.917606]  __kasan_slab_free+0x11/0x30
 [ 1818.918271]  kmem_cache_free+0x12c/0x460
 [ 1818.918939]  do_sys_openat2+0x102/0x150
 [ 1818.919586]  __x64_sys_openat+0x11f/0x1d0
 [ 1818.920264]  do_syscall_64+0x6d/0x140
 [ 1818.920899]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
 [ 1818.921699]
 [ 1818.922059] The buggy address belongs to the object at ffff888156864400
 [ 1818.922059]  which belongs to the cache names_cache of size 4096
 [ 1818.923872] The buggy address is located 1320 bytes inside of
 [ 1818.923872]  freed 4096-byte region [ffff888156864400, ffff888156865400)
 [ 1818.925674]
 [ 1818.926034] The buggy address belongs to the physical page:
 [ 1818.926905] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x156860
 [ 1818.928155] head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
 [ 1818.929358] flags: 0x200000000000040(head|node=0|zone=2)
 [ 1818.930195] page_type: 0xffffefff(slab)
 [ 1818.930844] raw: 0200000000000040 ffff8881009ab040 dead000000000122 0000000000000000
 [ 1818.932049] raw: 0000000000000000 0000000000070007 00000001ffffefff 0000000000000000
 [ 1818.933277] head: 0200000000000040 ffff8881009ab040 dead000000000122 0000000000000000
 [ 1818.934506] head: 0000000000000000 0000000000070007 00000001ffffefff 0000000000000000
 [ 1818.935728] head: 0200000000000003 ffffea00055a1801 ffffffffffffffff 0000000000000000
 [ 1818.936962] head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000
 [ 1818.938186] page dumped because: kasan: bad access detected
 [ 1818.939065]
 [ 1818.939429] Memory state around the buggy address:
 [ 1818.940197]  ffff888156864800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 [ 1818.941358]  ffff888156864880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 [ 1818.942504] >ffff888156864900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 [ 1818.943656]                                   ^
 [ 1818.944399]  ffff888156864980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 [ 1818.945564]  ffff888156864a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 [ 1818.946711] ==================================================================

As a solution, let's rewrite the commit mentioned in Fixes line to
properly unwind error without need to flash anything.

Fixes: 643445531829 ("workqueue: Fix UAF report by KASAN in pwq_release_workfn()")
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 kernel/workqueue.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

Comments

Tejun Heo May 30, 2024, 9:42 p.m. UTC | #1
Hello, Leon. Sorry about the delay.

On Tue, May 28, 2024 at 11:39:58AM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> The commit 643445531829 ("workqueue: Fix UAF report by KASAN in
> pwq_release_workfn()") causes to the following lockdep warning.

KASAN warning?

>  [ 1818.839405] ==================================================================
>  [ 1818.840636] BUG: KASAN: slab-use-after-free in lockdep_register_key+0x707/0x810
>  [ 1818.841827] Read of size 8 at addr ffff888156864928 by task systemd-udevd/71399
...
>  [ 1818.846493] Call Trace:
>  [ 1818.846981]  <TASK>
>  [ 1818.847439]  dump_stack_lvl+0x7e/0xc0
>  [ 1818.848089]  print_report+0xc1/0x600
>  [ 1818.850978]  kasan_report+0xb9/0xf0
>  [ 1818.852381]  lockdep_register_key+0x707/0x810
>  [ 1818.855329]  alloc_workqueue+0x466/0x1800

Can you please map this to the source line?

>  [ 1818.857997]  ib_mad_init_device+0x809/0x1760 [ib_core]

...

>  [ 1818.907242] Allocated by task 1:
>  [ 1818.907819]  kasan_save_stack+0x20/0x40
>  [ 1818.908512]  kasan_save_track+0x10/0x30
>  [ 1818.909173]  __kasan_slab_alloc+0x51/0x60
>  [ 1818.909849]  kmem_cache_alloc_noprof+0x139/0x3f0
>  [ 1818.910608]  getname_flags+0x4f/0x3c0
>  [ 1818.911236]  do_sys_openat2+0xd3/0x150
>  [ 1818.911878]  __x64_sys_openat+0x11f/0x1d0
>  [ 1818.912554]  do_syscall_64+0x6d/0x140
>  [ 1818.913189]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
>  [ 1818.913996]
>  [ 1818.914359] Freed by task 1:
>  [ 1818.914897]  kasan_save_stack+0x20/0x40
>  [ 1818.915553]  kasan_save_track+0x10/0x30
>  [ 1818.916210]  kasan_save_free_info+0x37/0x50
>  [ 1818.916911]  poison_slab_object+0x10c/0x190
>  [ 1818.917606]  __kasan_slab_free+0x11/0x30
>  [ 1818.918271]  kmem_cache_free+0x12c/0x460
>  [ 1818.918939]  do_sys_openat2+0x102/0x150
>  [ 1818.919586]  __x64_sys_openat+0x11f/0x1d0
>  [ 1818.920264]  do_syscall_64+0x6d/0x140
>  [ 1818.920899]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
>  [ 1818.921699]
>  [ 1818.922059] The buggy address belongs to the object at ffff888156864400
>  [ 1818.922059]  which belongs to the cache names_cache of size 4096

This is a dcache name. I'm a bit lost on how we're hitting this.

Thanks.
Leon Romanovsky May 31, 2024, 3:48 a.m. UTC | #2
On Thu, May 30, 2024 at 11:42:00AM -1000, Tejun Heo wrote:
> Hello, Leon. Sorry about the delay.
> 
> On Tue, May 28, 2024 at 11:39:58AM +0300, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> > 
> > The commit 643445531829 ("workqueue: Fix UAF report by KASAN in
> > pwq_release_workfn()") causes to the following lockdep warning.
> 
> KASAN warning?

Yes

> 
> >  [ 1818.839405] ==================================================================
> >  [ 1818.840636] BUG: KASAN: slab-use-after-free in lockdep_register_key+0x707/0x810
> >  [ 1818.841827] Read of size 8 at addr ffff888156864928 by task systemd-udevd/71399
> ...
> >  [ 1818.846493] Call Trace:
> >  [ 1818.846981]  <TASK>
> >  [ 1818.847439]  dump_stack_lvl+0x7e/0xc0
> >  [ 1818.848089]  print_report+0xc1/0x600
> >  [ 1818.850978]  kasan_report+0xb9/0xf0
> >  [ 1818.852381]  lockdep_register_key+0x707/0x810
> >  [ 1818.855329]  alloc_workqueue+0x466/0x1800
> 
> Can you please map this to the source line?

I'm not sure if it is accurate, as my object is slightly different from
the one in the report, but the area is the same.

➜  kernel git:(wip/leon-for-next) ./scripts/faddr2line vmlinux alloc_workqueue+0x466
alloc_workqueue+0x466/0x19f0:
kernel/workqueue.c:5710
  5709         wq_init_lockdep(wq);
  5710         INIT_LIST_HEAD(&wq->list);  

> 
> >  [ 1818.857997]  ib_mad_init_device+0x809/0x1760 [ib_core]
> 
> ...
> 
> >  [ 1818.907242] Allocated by task 1:
> >  [ 1818.907819]  kasan_save_stack+0x20/0x40
> >  [ 1818.908512]  kasan_save_track+0x10/0x30
> >  [ 1818.909173]  __kasan_slab_alloc+0x51/0x60
> >  [ 1818.909849]  kmem_cache_alloc_noprof+0x139/0x3f0
> >  [ 1818.910608]  getname_flags+0x4f/0x3c0
> >  [ 1818.911236]  do_sys_openat2+0xd3/0x150
> >  [ 1818.911878]  __x64_sys_openat+0x11f/0x1d0
> >  [ 1818.912554]  do_syscall_64+0x6d/0x140
> >  [ 1818.913189]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
> >  [ 1818.913996]
> >  [ 1818.914359] Freed by task 1:
> >  [ 1818.914897]  kasan_save_stack+0x20/0x40
> >  [ 1818.915553]  kasan_save_track+0x10/0x30
> >  [ 1818.916210]  kasan_save_free_info+0x37/0x50
> >  [ 1818.916911]  poison_slab_object+0x10c/0x190
> >  [ 1818.917606]  __kasan_slab_free+0x11/0x30
> >  [ 1818.918271]  kmem_cache_free+0x12c/0x460
> >  [ 1818.918939]  do_sys_openat2+0x102/0x150
> >  [ 1818.919586]  __x64_sys_openat+0x11f/0x1d0
> >  [ 1818.920264]  do_syscall_64+0x6d/0x140
> >  [ 1818.920899]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
> >  [ 1818.921699]
> >  [ 1818.922059] The buggy address belongs to the object at ffff888156864400
> >  [ 1818.922059]  which belongs to the cache names_cache of size 4096
> 
> This is a dcache name. I'm a bit lost on how we're hitting this.

We have similar issues but with different workqueue.

 [ 1233.554381] ==================================================================
 [ 1233.555215] BUG: KASAN: slab-use-after-free in lockdep_register_key+0x707/0x810
 [ 1233.555983] Read of size 8 at addr ffff88811f1d8928 by task test-ovs-bond-m/10149
 [ 1233.556774] 
 [ 1233.557020] CPU: 0 PID: 10149 Comm: test-ovs-bond-m Not tainted 6.10.0-rc1_external_1613e604df0c #1
 [ 1233.557951] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
 [ 1233.559044] Call Trace:
 [ 1233.559367]  <TASK>
 [ 1233.559653]  dump_stack_lvl+0x7e/0xc0
 [ 1233.560078]  print_report+0xc1/0x600
 [ 1233.560496]  ? __virt_addr_valid+0x1cf/0x390
 [ 1233.560964]  ? lockdep_register_key+0x707/0x810
 [ 1233.561463]  ? lockdep_register_key+0x707/0x810
 [ 1233.561975]  kasan_report+0xb9/0xf0
 [ 1233.562382]  ? lockdep_register_key+0x707/0x810
 [ 1233.562872]  lockdep_register_key+0x707/0x810
 [ 1233.563344]  ? add_lock_to_list+0x5b0/0x5b0
 [ 1233.563813]  ? lockdep_init_map_type+0x2c3/0x7a0
 [ 1233.564312]  ? __raw_spin_lock_init+0x3b/0x110
 [ 1233.564799]  alloc_workqueue+0x466/0x1800
 [ 1233.565261]  ? workqueue_sysfs_register+0x370/0x370
 [ 1233.565808]  ? is_module_address+0x2d/0x40
 [ 1233.566268]  ? static_obj+0x98/0xd0
 [ 1233.566671]  ? lockdep_init_map_type+0x2c3/0x7a0
 [ 1233.567170]  ? queue_work_node+0x2c0/0x2c0
 [ 1233.567627]  mlx5_pagealloc_init+0x7d/0x180 [mlx5_core]
 [ 1233.568322]  mlx5_mdev_init+0x482/0xad0 [mlx5_core]
 [ 1233.568915]  ? devlink_alloc_ns+0x883/0xaa0
 [ 1233.569387]  probe_one+0x11d/0xc80 [mlx5_core]
 [ 1233.572996]  ? remove_one+0x210/0x210 [mlx5_core]
 [ 1233.573604]  local_pci_probe+0xd7/0x180
 [ 1233.574063]  pci_device_probe+0x228/0x720
 [ 1233.574516]  ? pci_match_device+0x690/0x690
 [ 1233.574985]  ? kernfs_create_link+0x16f/0x230
 [ 1233.575460]  ? do_raw_spin_unlock+0x54/0x220
 [ 1233.575933]  ? kernfs_put+0x18/0x30
 [ 1233.576342]  ? sysfs_do_create_link_sd+0x8c/0xf0
 [ 1233.576844]  really_probe+0x1e4/0x920
 [ 1233.577269]  __driver_probe_device+0x261/0x3e0
 [ 1233.577766]  driver_probe_device+0x49/0x130
 [ 1233.578232]  __device_attach_driver+0x187/0x290
 [ 1233.578723]  ? driver_probe_device+0x130/0x130
 [ 1233.579202]  bus_for_each_drv+0x103/0x190
 [ 1233.579644]  ? mark_held_locks+0x9f/0xe0
 [ 1233.580090]  ? bus_for_each_dev+0x170/0x170
 [ 1233.580553]  ? lockdep_hardirqs_on_prepare+0x284/0x400
 [ 1233.581108]  __device_attach+0x1a3/0x3f0
 [ 1233.581544]  ? device_driver_attach+0x1e0/0x1e0
 [ 1233.582047]  ? pci_bridge_d3_possible+0x1e0/0x1e0
 [ 1233.582558]  ? pci_create_resource_files+0xeb/0x170
 [ 1233.583081]  pci_bus_add_device+0x9f/0xf0
 [ 1233.583531]  pci_iov_add_virtfn+0x9c6/0xde0
 [ 1233.583994]  ? __wake_up+0x40/0x50
 [ 1233.584396]  sriov_enable+0x658/0xd10
 [ 1233.584813]  ? pcibios_sriov_disable+0x10/0x10
 [ 1233.585317]  mlx5_core_sriov_configure+0xb1/0x310 [mlx5_core]
 [ 1233.586030]  sriov_numvfs_store+0x209/0x370
 [ 1233.586497]  ? sriov_totalvfs_show+0xc0/0xc0
 [ 1233.586972]  ? sysfs_file_ops+0x170/0x170
 [ 1233.587430]  ? sysfs_file_ops+0x11a/0x170
 [ 1233.587883]  ? sysfs_file_ops+0x170/0x170
 [ 1233.588334]  kernfs_fop_write_iter+0x347/0x520
 [ 1233.588817]  vfs_write+0x96f/0xf30
 [ 1233.589234]  ? do_user_addr_fault+0x812/0xd20
 [ 1233.589714]  ? kernel_write+0x530/0x530
 [ 1233.590168]  ? __fget_light+0x53/0x1d0
 [ 1233.590600]  ksys_write+0xf1/0x1d0
 [ 1233.591003]  ? __x64_sys_read+0xb0/0xb0
 [ 1233.591437]  ? do_user_addr_fault+0x895/0xd20
 [ 1233.591924]  do_syscall_64+0x6d/0x140
 [ 1233.592348]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
 [ 1233.592888] RIP: 0033:0x7f8d663018b7
 [ 1233.593312] Code: 0f 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24
 [ 1233.595070] RSP: 002b:00007ffdbae78c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
 [ 1233.595853] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f8d663018b7
 [ 1233.596551] RDX: 0000000000000002 RSI: 000055e696bc2070 RDI: 0000000000000001
 [ 1233.597270] RBP: 000055e696bc2070 R08: 0000000000000000 R09: 00007f8d663b64e0
 [ 1233.598001] R10: 00007f8d663b63e0 R11: 0000000000000246 R12: 0000000000000002
 [ 1233.598716] R13: 00007f8d663fb5a0 R14: 0000000000000002 R15: 00007f8d663fb7a0
 [ 1233.599431]  </TASK>
 [ 1233.599735] 
 [ 1233.599979] Allocated by task 9589:
 [ 1233.600382]  kasan_save_stack+0x20/0x40
 [ 1233.600828]  kasan_save_track+0x10/0x30
 [ 1233.601265]  __kasan_kmalloc+0x77/0x90
 [ 1233.601696]  kernfs_iop_get_link+0x61/0x5a0
 [ 1233.602181]  vfs_readlink+0x1ab/0x320
 [ 1233.602582] mlx5_core 0000:08:00.6 enp8s0f1v0: renamed from eth4
 [ 1233.602605]  do_readlinkat+0x1cb/0x290
 [ 1233.602610]  __x64_sys_readlinkat+0x92/0xf0
 [ 1233.602612]  do_syscall_64+0x6d/0x140
 [ 1233.605196]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
 [ 1233.605731] 
 [ 1233.605986] Freed by task 9589:
 [ 1233.606373]  kasan_save_stack+0x20/0x40
 [ 1233.606801]  kasan_save_track+0x10/0x30
 [ 1233.607232]  kasan_save_free_info+0x37/0x50
 [ 1233.607695]  poison_slab_object+0x10c/0x190
 [ 1233.608161]  __kasan_slab_free+0x11/0x30
 [ 1233.608604]  kfree+0x11b/0x340
 [ 1233.608970]  vfs_readlink+0x120/0x320
 [ 1233.609413]  do_readlinkat+0x1cb/0x290
 [ 1233.609849]  __x64_sys_readlinkat+0x92/0xf0
 [ 1233.610308]  do_syscall_64+0x6d/0x140
 [ 1233.610741]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
 [ 1233.611276] 
 [ 1233.611524] The buggy address belongs to the object at ffff88811f1d8000
 [ 1233.611524]  which belongs to the cache kmalloc-4k of size 4096
 [ 1233.612714] The buggy address is located 2344 bytes inside of
 [ 1233.612714]  freed 4096-byte region [ffff88811f1d8000, ffff88811f1d9000)
 [ 1233.613920] 
 [ 1233.614168] The buggy address belongs to the physical page:
 [ 1233.614743] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11f1d8
 [ 1233.615556] head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
 [ 1233.616338] flags: 0x200000000000040(head|node=0|zone=2)
 [ 1233.616894] page_type: 0xffffefff(slab)
 [ 1233.617340] raw: 0200000000000040 ffff888100043040 ffffea00051bd800 dead000000000002
 [ 1233.618151] raw: 0000000000000000 0000000000040004 00000001ffffefff 0000000000000000
 [ 1233.618958] head: 0200000000000040 ffff888100043040 ffffea00051bd800 dead000000000002
 [ 1233.619763] head: 0000000000000000 0000000000040004 00000001ffffefff 0000000000000000
 [ 1233.620573] head: 0200000000000003 ffffea00047c7601 ffffffffffffffff 0000000000000000
 [ 1233.621386] head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000
 [ 1233.622201] page dumped because: kasan: bad access detected
 [ 1233.622778] 
 [ 1233.623027] Memory state around the buggy address:
 [ 1233.623532]  ffff88811f1d8800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 [ 1233.624280]  ffff88811f1d8880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 [ 1233.625029] >ffff88811f1d8900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 [ 1233.625803]                                   ^
 [ 1233.626296]  ffff88811f1d8980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 [ 1233.627047]  ffff88811f1d8a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 [ 1233.627801] ==================================================================

> 
> Thanks.
> 
> -- 
> tejun
Tejun Heo May 31, 2024, 5:45 p.m. UTC | #3
Hello,

On Fri, May 31, 2024 at 06:48:51AM +0300, Leon Romanovsky wrote:
> We have similar issues but with different workqueue.

So, the problem with the proposed patch is that pwq may still be in use by
then (due to async freeing) and thus can't be freed immediately. I still
don't understand why KASAN is triggering there. I tried to repro by
introducing a pwq alloc failure but couldn't. Can you please share the
repro?

Thanks.
Leon Romanovsky June 2, 2024, 6:56 a.m. UTC | #4
On Fri, May 31, 2024 at 07:45:35AM -1000, Tejun Heo wrote:
> Hello,
> 
> On Fri, May 31, 2024 at 06:48:51AM +0300, Leon Romanovsky wrote:
> > We have similar issues but with different workqueue.
> 
> So, the problem with the proposed patch is that pwq may still be in use by
> then (due to async freeing) and thus can't be freed immediately. I still
> don't understand why KASAN is triggering there. I tried to repro by
> introducing a pwq alloc failure but couldn't. Can you please share the
> repro?

The repro is part of our CI test, where we run traffic tests and at the
end perform a "restart driver" stage. At that stage, this lockdep warning is
printed.

The test is:
1. Create bond interfaces
2. Run traffic
3. Stop traffic and check that everything is OK
4. Restart driver:
modprobe -r -a bonding
modprobe -r -a vfio-pci ip_gre ip6_gre bonding geneve ipip ip6_tunnel
modprobe -r -a ib_umad ib_ipoib rdma_ucm mlx5_vfio_pci mlx5_ib mlx5_core mlx5-vfio-pci nf_tables

Thanks

> 
> Thanks.
> 
> -- 
> tejun
>
Tejun Heo June 3, 2024, 8:10 p.m. UTC | #5
Hello, again, Leon.

Re-reading the warning, I'm not sure this is a bug on workqueue side.

On Fri, May 31, 2024 at 06:48:51AM +0300, Leon Romanovsky wrote:
>  [ 1233.554381] ==================================================================
>  [ 1233.555215] BUG: KASAN: slab-use-after-free in lockdep_register_key+0x707/0x810
>  [ 1233.555983] Read of size 8 at addr ffff88811f1d8928 by task test-ovs-bond-m/10149
>  [ 1233.556774] 
>  [ 1233.557020] CPU: 0 PID: 10149 Comm: test-ovs-bond-m Not tainted 6.10.0-rc1_external_1613e604df0c #1
>  [ 1233.557951] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
>  [ 1233.559044] Call Trace:
>  [ 1233.559367]  <TASK>
>  [ 1233.559653]  dump_stack_lvl+0x7e/0xc0
>  [ 1233.560078]  print_report+0xc1/0x600
>  [ 1233.561975]  kasan_report+0xb9/0xf0
>  [ 1233.562872]  lockdep_register_key+0x707/0x810
>  [ 1233.564799]  alloc_workqueue+0x466/0x1800
>  [ 1233.567627]  mlx5_pagealloc_init+0x7d/0x180 [mlx5_core]
>  [ 1233.568322]  mlx5_mdev_init+0x482/0xad0 [mlx5_core]
>  [ 1233.569387]  probe_one+0x11d/0xc80 [mlx5_core]

So, this is saying that alloc_workqueue() allocated a name during lockdep
initialization. This is before pwq init or anything else complicated
happening. It just allocated the workqueue struct and called into
lockdep_register_key(&wq->key).
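
For reference, a simplified sketch of that path, paraphrased from
kernel/workqueue.c (the exact body differs between versions and depends on
CONFIG_LOCKDEP); wq_init_lockdep() is what alloc_workqueue() calls right
after allocating the wq struct, before any pwq exists:

	/* Sketch only, not a verbatim listing of kernel/workqueue.c. */
	static void wq_init_lockdep(struct workqueue_struct *wq)
	{
		char *lock_name;

		/* Registers the per-wq dynamic key in the global
		 * lock_keys_hash; this is the call the KASAN report
		 * points at via lockdep_register_key(). */
		lockdep_register_key(&wq->key);

		lock_name = kasprintf(GFP_KERNEL, "%s%s",
				      "(wq_completion)", wq->name);
		if (!lock_name)
			lock_name = wq->name;

		wq->lock_name = lock_name;
		lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0);
	}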

>  [ 1233.599979] Allocated by task 9589:
>  [ 1233.600382]  kasan_save_stack+0x20/0x40
>  [ 1233.600828]  kasan_save_track+0x10/0x30
>  [ 1233.601265]  __kasan_kmalloc+0x77/0x90
>  [ 1233.601696]  kernfs_iop_get_link+0x61/0x5a0
>  [ 1233.602181]  vfs_readlink+0x1ab/0x320
>  [ 1233.602605]  do_readlinkat+0x1cb/0x290
>  [ 1233.602610]  __x64_sys_readlinkat+0x92/0xf0
>  [ 1233.602612]  do_syscall_64+0x6d/0x140
>  [ 1233.605196]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
>  [ 1233.605731] 
>  [ 1233.605986] Freed by task 9589:
>  [ 1233.606373]  kasan_save_stack+0x20/0x40
>  [ 1233.606801]  kasan_save_track+0x10/0x30
>  [ 1233.607232]  kasan_save_free_info+0x37/0x50
>  [ 1233.607695]  poison_slab_object+0x10c/0x190
>  [ 1233.608161]  __kasan_slab_free+0x11/0x30
>  [ 1233.608604]  kfree+0x11b/0x340
>  [ 1233.608970]  vfs_readlink+0x120/0x320
>  [ 1233.609413]  do_readlinkat+0x1cb/0x290
>  [ 1233.609849]  __x64_sys_readlinkat+0x92/0xf0
>  [ 1233.610308]  do_syscall_64+0x6d/0x140
>  [ 1233.610741]  entry_SYSCALL_64_after_hwframe+0x4b/0x53

And KASAN is reporting use-after-free on a completely unrelated VFS object.
I can't tell for sure from the logs alone but lockdep_register_key()
iterates entries in the hashtable trying to find whether the key is a
duplicate and it could be that that walk is triggering the use-after-free
warning. If so, it doesn't really have much to do with workqueue. The
corruption happened elsewhere and workqueue just happens to traverse the
hashtable afterwards.
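
For reference, a sketch of the walk being described, paraphrased from
kernel/locking/lockdep.c (abridged: the graph lock, irq save/restore and
most sanity checks are trimmed, and helper names may differ by version):

	void lockdep_register_key(struct lock_class_key *key)
	{
		struct hlist_head *hash_head = keyhashentry(key);
		struct lock_class_key *k;

		/* Walk the global lock_keys_hash bucket looking for a
		 * duplicate registration.  A stale key left linked here by
		 * an already-freed object gets dereferenced during this
		 * walk, which is where KASAN (or the later oops) fires. */
		hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
			if (WARN_ON_ONCE(k == key))
				return;
		}

		/* No duplicate found: link the new key into the hash. */
		hlist_add_head_rcu(&key->hash_entry, hash_head);
	}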

Thanks.
Leon Romanovsky June 4, 2024, 8:09 a.m. UTC | #6
On Mon, Jun 03, 2024 at 10:10:36AM -1000, Tejun Heo wrote:
> Hello, again, Leon.
> 
> Re-reading the warning, I'm not sure this is a bug on workqueue side.
> 
> On Fri, May 31, 2024 at 06:48:51AM +0300, Leon Romanovsky wrote:
> >  [ 1233.554381] ==================================================================
> >  [ 1233.555215] BUG: KASAN: slab-use-after-free in lockdep_register_key+0x707/0x810
> >  [ 1233.555983] Read of size 8 at addr ffff88811f1d8928 by task test-ovs-bond-m/10149
> >  [ 1233.556774] 
> >  [ 1233.557020] CPU: 0 PID: 10149 Comm: test-ovs-bond-m Not tainted 6.10.0-rc1_external_1613e604df0c #1
> >  [ 1233.557951] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
> >  [ 1233.559044] Call Trace:
> >  [ 1233.559367]  <TASK>
> >  [ 1233.559653]  dump_stack_lvl+0x7e/0xc0
> >  [ 1233.560078]  print_report+0xc1/0x600
> >  [ 1233.561975]  kasan_report+0xb9/0xf0
> >  [ 1233.562872]  lockdep_register_key+0x707/0x810
> >  [ 1233.564799]  alloc_workqueue+0x466/0x1800
> >  [ 1233.567627]  mlx5_pagealloc_init+0x7d/0x180 [mlx5_core]
> >  [ 1233.568322]  mlx5_mdev_init+0x482/0xad0 [mlx5_core]
> >  [ 1233.569387]  probe_one+0x11d/0xc80 [mlx5_core]
> 
> So, this is saying that alloc_workqueue() allocated a name during lockdep
> initialization. This is before pwq init or anything else complicated
> happening. It just allocated the workqueue struct and called into
> lockdep_register_key(&wq->key).
> 
> >  [ 1233.599979] Allocated by task 9589:
> >  [ 1233.600382]  kasan_save_stack+0x20/0x40
> >  [ 1233.600828]  kasan_save_track+0x10/0x30
> >  [ 1233.601265]  __kasan_kmalloc+0x77/0x90
> >  [ 1233.601696]  kernfs_iop_get_link+0x61/0x5a0
> >  [ 1233.602181]  vfs_readlink+0x1ab/0x320
> >  [ 1233.602605]  do_readlinkat+0x1cb/0x290
> >  [ 1233.602610]  __x64_sys_readlinkat+0x92/0xf0
> >  [ 1233.602612]  do_syscall_64+0x6d/0x140
> >  [ 1233.605196]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
> >  [ 1233.605731] 
> >  [ 1233.605986] Freed by task 9589:
> >  [ 1233.606373]  kasan_save_stack+0x20/0x40
> >  [ 1233.606801]  kasan_save_track+0x10/0x30
> >  [ 1233.607232]  kasan_save_free_info+0x37/0x50
> >  [ 1233.607695]  poison_slab_object+0x10c/0x190
> >  [ 1233.608161]  __kasan_slab_free+0x11/0x30
> >  [ 1233.608604]  kfree+0x11b/0x340
> >  [ 1233.608970]  vfs_readlink+0x120/0x320
> >  [ 1233.609413]  do_readlinkat+0x1cb/0x290
> >  [ 1233.609849]  __x64_sys_readlinkat+0x92/0xf0
> >  [ 1233.610308]  do_syscall_64+0x6d/0x140
> >  [ 1233.610741]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
> 
> And KASAN is reporting use-after-free on a completely unrelated VFS object.
> I can't tell for sure from the logs alone but lockdep_register_key()
> iterates entries in the hashtable trying to find whether the key is a
> duplicate and it could be that that walk is triggering the use-after-free
> warning. If so, it doesn't really have much to do with workqueue. The
> corruption happened elsewhere and workqueue just happens to traverse the
> hashtable afterwards.

The problem is that revert of commit 643445531829 ("workqueue: Fix UAF report by KASAN in pwq_release_workfn()")
fixed these use-after-free reports. In addition, we didn't get any other failures (related to memory corruption)
except these lockdep reports.

Tariq showed me another lockdep report. I don't think that it is
related, but I'm adding it here to make sure.

Also let's add Peter to this report; maybe he can guide us to find the root cause.
➜  kernel git:(rdma-next) ./scripts/faddr2line vmlinux lockdep_register_key+0x8f
lockdep_register_key+0x8f/0x800:
lockdep_lock at kernel/locking/lockdep.c:141
(inlined by) graph_lock at kernel/locking/lockdep.c:170
(inlined by) lockdep_register_key at kernel/locking/lockdep.c:1223

 [ 1078.101755] ====================================================================================================
 [ 1078.584085] mlx5_core 0000:08:00.0: firmware version: 16.35.1012
 [ 1078.585182] mlx5_core 0000:08:00.0: 126.016 Gb/s available PCIe bandwidth, limited by 8.0 GT/s PCIe x16 link at 0000:00:02.7 (capable of 252.048 Gb/s with 16.0 GT/s PCIe x16 link)
 [ 1079.013341] mlx5_core 0000:08:00.0: Rate limit: 127 rates are supported, range: 0Mbps to 97656Mbps
 [ 1079.015912] mlx5_core 0000:08:00.0: E-Switch: Total vports 6, per vport: max uc(128) max mc(2048)
 [ 1079.048423] mlx5_core 0000:08:00.0: Port module event: module 0, Cable plugged
 [ 1079.053400] mlx5_core 0000:08:00.0: mlx5_pcie_event:301:(pid 10308): PCIe slot advertised sufficient power (75W).
 [ 1079.283669] mlx5_core 0000:08:00.0: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0 basic)
 [ 1079.288969] BUG: unable to handle page fault for address: 000000330000000a
 [ 1079.290036] #PF: supervisor read access in kernel mode
 [ 1079.290902] #PF: error_code(0x0000) - not-present page
 [ 1079.292016] PGD 0 P4D 0
 [ 1079.292808] Oops: Oops: 0000 [#1] SMP
 [ 1079.293561] CPU: 8 PID: 31777 Comm: modprobe Not tainted 6.10.0-rc2_internal_net_next_60d8b51 #1
 [ 1079.295087] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
 [ 1079.296804] RIP: 0010:lockdep_register_key+0x8f/0x220
 [ 1079.297600] Code: bf 31 fa 01 65 4c 8b 25 7f 56 e5 7e 45 85 c0 4c 89 25 3d 33 e0 03 74 25 48 8b 04 ed 00 37 e2 84 48 85 c0 75 11 e9 c2 00 00 00 <48> 8b 00 48 85 c0 0f 84 b6 00 00 00 48 39 d8 75 ef 0f 0b 8b 3d a0
 [ 1079.300226] RSP: 0018:ffff888108e97a80 EFLAGS: 00010003
 [ 1079.301034] RAX: 000000330000000a RBX: ffff8881095c6278 RCX: 000000000000000a
 [ 1079.302076] RDX: 0000000000000001 RSI: 000000000000000a RDI: ffffffff826a4700
 [ 1079.303148] RBP: 00000000000001ae R08: 0000000000000001 R09: 0000000000000000
 [ 1079.304215] R10: 0000000000000000 R11: 0000000000000001 R12: ffff888122ff8000
 [ 1079.305281] R13: 0000000000000202 R14: ffffffff84e24470 R15: ffffffff84ffdf18
 [ 1079.306338] FS:  00007fb8b7277740(0000) GS:ffff88852cc00000(0000) knlGS:0000000000000000
 [ 1079.307593] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [ 1079.308482] CR2: 000000330000000a CR3: 00000001167d7004 CR4: 0000000000370eb0
 [ 1079.309540] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 [ 1079.310628] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 [ 1079.311713] Call Trace:
 [ 1079.312195]  <TASK>
 [ 1079.312623]  ? __die+0x20/0x60
 [ 1079.313178]  ? page_fault_oops+0x150/0x400
 [ 1079.313852]  ? exc_page_fault+0x79/0x240
 [ 1079.314522]  ? asm_exc_page_fault+0x22/0x30
 [ 1079.315213]  ? lockdep_register_key+0x8f/0x220
 [ 1079.315963]  devlink_alloc_ns+0x306/0x390
 [ 1079.316623]  probe_one+0x24/0x4e0 [mlx5_core]
 [ 1079.317432]  local_pci_probe+0x3e/0x90
 [ 1079.318071]  pci_device_probe+0xbf/0x240
 [ 1079.318735]  ? sysfs_do_create_link_sd+0x69/0xd0
 [ 1079.319491]  really_probe+0xd4/0x3b0
 [ 1079.320104]  __driver_probe_device+0x8c/0x160
 [ 1079.320811]  driver_probe_device+0x1e/0xb0
 [ 1079.321488]  __driver_attach+0xff/0x1e0
 [ 1079.322128]  ? __device_attach_driver+0x130/0x130
 [ 1079.322886]  bus_for_each_dev+0x74/0xc0
 [ 1079.323555]  bus_add_driver+0xf0/0x250
 [ 1079.324188]  driver_register+0x58/0x100
 [ 1079.324837]  ? 0xffffffffa0242000
 [ 1079.325420]  mlx5_init+0x65/0x1000 [mlx5_core]
 [ 1079.326226]  do_one_initcall+0x61/0x2b0
 [ 1079.326887]  ? kmalloc_trace_noprof+0x1a9/0x400
 [ 1079.327647]  do_init_module+0x8a/0x270
 [ 1079.328293]  init_module_from_file+0x8b/0xd0
 [ 1079.328999]  idempotent_init_module+0x17d/0x230
 [ 1079.329735]  __x64_sys_finit_module+0x61/0xb0
 [ 1079.330459]  do_syscall_64+0x6d/0x140
 [ 1079.331084]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
 [ 1079.331900] RIP: 0033:0x7fb8b6b0af3d
 [ 1079.332507] Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d bb ee 0e 00 f7 d8 64 89 01 48
 [ 1079.335157] RSP: 002b:00007ffe04497818 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
 [ 1079.336362] RAX: ffffffffffffffda RBX: 000055b05c326c10 RCX: 00007fb8b6b0af3d
 [ 1079.337407] RDX: 0000000000000000 RSI: 000055b049840a2a RDI: 0000000000000003
 [ 1079.338466] RBP: 0000000000040000 R08: 0000000000000000 R09: 00007ffe04497950
 [ 1079.339533] R10: 0000000000000003 R11: 0000000000000246 R12: 000055b049840a2a
 [ 1079.340581] R13: 000055b05c326d80 R14: 000055b05c326c10 R15: 000055b05c329600
 [ 1079.341634]  </TASK>
 [ 1079.342076] Modules linked in: mlx5_core(+) ip6_tunnel tunnel6 nf_tables vfio_pci vfio_pci_core act_tunnel_key vxlan act_mirred act_skbedit cls_matchall act_gact cls_flower sch_ingress nfnetlink_cttimeout vfio_iommu_type1 vfio iptable_raw openvswitch nsh xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm ib_core zram zsmalloc fuse [last unloaded: nf_tables]
 [ 1079.352250] CR2: 000000330000000a
 [ 1079.352838] ---[ end trace 0000000000000000 ]---
 [ 1079.353575] RIP: 0010:lockdep_register_key+0x8f/0x220
 [ 1079.354380] Code: bf 31 fa 01 65 4c 8b 25 7f 56 e5 7e 45 85 c0 4c 89 25 3d 33 e0 03 74 25 48 8b 04 ed 00 37 e2 84 48 85 c0 75 11 e9 c2 00 00 00 <48> 8b 00 48 85 c0 0f 84 b6 00 00 00 48 39 d8 75 ef 0f 0b 8b 3d a0
 [ 1079.357025] RSP: 0018:ffff888108e97a80 EFLAGS: 00010003
 [ 1079.357832] RAX: 000000330000000a RBX: ffff8881095c6278 RCX: 000000000000000a
 [ 1079.358899] RDX: 0000000000000001 RSI: 000000000000000a RDI: ffffffff826a4700
 [ 1079.359967] RBP: 00000000000001ae R08: 0000000000000001 R09: 0000000000000000
 [ 1079.361005] R10: 0000000000000000 R11: 0000000000000001 R12: ffff888122ff8000
 [ 1079.362044] R13: 0000000000000202 R14: ffffffff84e24470 R15: ffffffff84ffdf18
 [ 1079.363090] FS:  00007fb8b7277740(0000) GS:ffff88852cc00000(0000) knlGS:0000000000000000
 [ 1079.364347] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [ 1079.365217] CR2: 000000330000000a CR3: 00000001167d7004 CR4: 0000000000370eb0
 [ 1079.366259] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 [ 1079.367304] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 [ 1079.368353] note: modprobe[31777] exited with irqs disabled
 [ 1080.829334] ------------[ cut here ]------------
 [ 1080.830879] WARNING: CPU: 8 PID: 31850 at kernel/locking/lockdep.c:467 lockdep_set_lock_cmp_fn+0x7e/0xa0
 [ 1080.833713] Modules linked in: mlx5_core(+) ip6_tunnel tunnel6 nf_tables vfio_pci vfio_pci_core act_tunnel_key vxlan act_mirred act_skbedit cls_matchall act_gact cls_flower sch_ingress nfnetlink_cttimeout vfio_iommu_type1 vfio iptable_raw openvswitch nsh xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm ib_core zram zsmalloc fuse [last unloaded: nf_tables]
 [ 1080.842428] CPU: 8 PID: 31850 Comm: bash Tainted: G      D            6.10.0-rc2_internal_net_next_60d8b51 #1
 [ 1080.843911] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
 [ 1080.845516] RIP: 0010:lockdep_set_lock_cmp_fn+0x7e/0xa0
 [ 1080.846331] Code: f8 01 75 26 9c 58 f6 c4 02 75 2e 41 f7 c4 00 02 00 00 74 01 fb 5b 5d 41 5c c3 31 d2 31 f6 e8 49 f8 ff ff 48 85 c0 74 ca eb a0 <0f> 0b 65 c7 05 0d ce e3 7e 00 00 00 00 eb cb e8 6e b7 c9 00 eb cb
 [ 1080.848956] RSP: 0018:ffff8881c8f5fe68 EFLAGS: 00010002
 [ 1080.849774] RAX: 0000000000000002 RBX: ffffffff81467940 RCX: 0000000000000000
 [ 1080.850839] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff888101959068
 [ 1080.851911] RBP: 0000000000000000 R08: 0000000000000003 R09: 0000000000000000
 [ 1080.852962] R10: ffff8881c8f5fe88 R11: 0000000000000000 R12: 0000000000000246
 [ 1080.854017] R13: 0000000000000360 R14: 0000000000000000 R15: 0000000000000000
 [ 1080.855077] FS:  00007f2434925740(0000) GS:ffff88852cc00000(0000) knlGS:0000000000000000
 [ 1080.856344] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [ 1080.857222] CR2: 0000559940a36088 CR3: 00000001ba6a0004 CR4: 0000000000370eb0
 [ 1080.858273] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 [ 1080.859329] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 [ 1080.860389] Call Trace:
 [ 1080.860860]  <TASK>
 [ 1080.861293]  ? __warn+0x88/0x190
 [ 1080.861867]  ? lockdep_set_lock_cmp_fn+0x7e/0xa0
 [ 1080.862631]  ? report_bug+0x160/0x170
 [ 1080.863260]  ? handle_bug+0x3c/0x60
 [ 1080.863874]  ? exc_invalid_op+0x14/0x70
 [ 1080.864512]  ? asm_exc_invalid_op+0x16/0x20
 [ 1080.865201]  ? begin_new_exec+0xd30/0xd30
 [ 1080.865867]  ? lockdep_set_lock_cmp_fn+0x7e/0xa0
 [ 1080.866609]  ? lockdep_set_lock_cmp_fn+0x77/0xa0
 [ 1080.867354]  alloc_pipe_info+0x17a/0x240
 [ 1080.868015]  create_pipe_files+0x40/0x220
 [ 1080.868680]  do_pipe2+0x3a/0xf0
 [ 1080.869248]  ? trace_hardirqs_on_prepare+0x3f/0xb0
 [ 1080.870022]  __x64_sys_pipe+0x10/0x20
 [ 1080.870661]  do_syscall_64+0x6d/0x140
 [ 1080.871285]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
 [ 1080.872103] RIP: 0033:0x7f24347020ab
 [ 1080.872723] Code: 73 01 c3 48 8b 0d 7d 7d 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 16 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 4d 7d 0f 00 f7 d8 64 89 01 48
 [ 1080.875408] RSP: 002b:00007ffdcba2e4e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000016
 [ 1080.876594] RAX: ffffffffffffffda RBX: 00007ffdcba2ea5c RCX: 00007f24347020ab
 [ 1080.877649] RDX: 0000559c1936fbaf RSI: 0000559940a19010 RDI: 00007ffdcba2e5c0
 [ 1080.878710] RBP: 0000559940a35a90 R08: 0000559940a2f0d0 R09: 00746f6f723d5245
 [ 1080.879772] R10: 0000000000000048 R11: 0000000000000246 R12: 0000000000000001
 [ 1080.880812] R13: 0000559940a35bc0 R14: 0000000000000000 R15: 0000000000000022
 [ 1080.881859]  </TASK>
 [ 1080.882299] irq event stamp: 0
 [ 1080.882859] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
 [ 1080.883818] hardirqs last disabled at (0): [<ffffffff81174b38>] copy_process+0x928/0x2a80
 [ 1080.885058] softirqs last  enabled at (0): [<ffffffff81174b38>] copy_process+0x928/0x2a80
 [ 1080.886325] softirqs last disabled at (0): [<0000000000000000>] 0x0
 [ 1080.887273] ---[ end trace 0000000000000000 ]---
 [ 1083.381768] ====================================================================================================

Thanks

> 
> Thanks.
> 
> -- 
> tejun
Hillf Danton June 4, 2024, 10:54 a.m. UTC | #7
On Tue, 4 Jun 2024 11:09:58 +0300 Leon Romanovsky <leon@kernel.org>
> On Mon, Jun 03, 2024 at 10:10:36AM -1000, Tejun Heo wrote:
> > 
> > And KASAN is reporting use-after-free on a completely unrelated VFS object.
> > I can't tell for sure from the logs alone but lockdep_register_key()
> > iterates entries in the hashtable trying to find whether the key is a
> > duplicate and it could be that that walk is triggering the use-after-free
> > warning. If so, it doesn't really have much to do with workqueue. The
> > corruption happened elsewhere and workqueue just happens to traverse the
> > hashtable afterwards.
> 
> The problem is that revert of commit 643445531829
> ("workqueue: Fix UAF report by KASAN in pwq_release_workfn()")
> fixed these use-after-free reports.
> 
Given revert makes sense,

	if (alloc_and_link_pwqs(wq) < 0)
		goto err_unreg_lockdep;

err_unreg_lockdep:
	wq_unregister_lockdep(wq);
	wq_free_lockdep(wq);
err_free_wq:
	free_workqueue_attrs(wq->unbound_attrs);
	kfree(wq);	<-- freed
	return NULL;

the difference 643445531829 makes is a double free.

	alloc_and_link_pwqs(struct workqueue_struct *wq)
	if (ret)
		kthread_flush_worker(pwq_release_worker);
		  pwq_release_workfn()
		  if (is_last) {
			wq_unregister_lockdep(wq);
			call_rcu(&wq->rcu, rcu_free_wq); <-- freed again
		  }
Leon Romanovsky June 4, 2024, 11:38 a.m. UTC | #8
On Tue, Jun 04, 2024 at 06:54:56PM +0800, Hillf Danton wrote:
> On Tue, 4 Jun 2024 11:09:58 +0300 Leon Romanovsky <leon@kernel.org>
> > On Mon, Jun 03, 2024 at 10:10:36AM -1000, Tejun Heo wrote:
> > > 
> > > And KASAN is reporting use-after-free on a completely unrelated VFS object.
> > > I can't tell for sure from the logs alone but lockdep_register_key()
> > > iterates entries in the hashtable trying to find whether the key is a
> > > duplicate and it could be that that walk is triggering the use-after-free
> > > warning. If so, it doesn't really have much to do with workqueue. The
> > > corruption happened elsewhere and workqueue just happens to traverse the
> > > hashtable afterwards.
> > 
> > The problem is that revert of commit 643445531829
> > ("workqueue: Fix UAF report by KASAN in pwq_release_workfn()")
> > fixed these use-after-free reports.
> > 
> Given revert makes sense,

Thanks, it is very rare situation where call to flush/drain queue
(in our case kthread_flush_worker) in the middle of the allocation
flow can be correct. I can't remember any such case.

So even we don't fully understand the root cause, the reimplementation
is still valid and improves existing code.

Thanks
Leon Romanovsky June 4, 2024, 11:40 a.m. UTC | #9
On Tue, May 28, 2024 at 11:39:58AM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> The commit 643445531829 ("workqueue: Fix UAF report by KASAN in
> pwq_release_workfn()") causes to the following lockdep warning.

<...>

> As a solution, let's rewrite the commit mentioned in Fixes line to
> properly unwind error without need to flash anything.

Tejun,

If you decide to take this patch, can you please fix typo in the commit message? "flash" -> "flush"

Thanks
Tariq Toukan June 4, 2024, 1:16 p.m. UTC | #10
On 04/06/2024 14:40, Leon Romanovsky wrote:
> On Tue, May 28, 2024 at 11:39:58AM +0300, Leon Romanovsky wrote:
>> From: Leon Romanovsky <leonro@nvidia.com>
>>
>> The commit 643445531829 ("workqueue: Fix UAF report by KASAN in
>> pwq_release_workfn()") causes to the following lockdep warning.
> 
> <...>
> 
>> As a solution, let's rewrite the commit mentioned in Fixes line to
>> properly unwind error without need to flash anything.
> 
> Tejun,
> 
> If you decide to take this patch, can you please fix typo in the commit message? "flash" -> "flush"
> 
> Thanks
> 

Also "worning" -> "warning" in subject.
Imre Deak June 4, 2024, 2:30 p.m. UTC | #11
Hi,

[Sorry for the previous message, resending it now
 with proper In-reply-to: header added.]

I see a similar issue, a corruption in the lock_keys_hash while
alloc_workqueue()->lockdep_register_key() iterates it, see [1] for the
stacktrace.

Not sure if related or even will solve [1], but [2] will revert

commit 7e89efc6e9e4 ("PCI: Lock upstream bridge for pci_reset_function()")

which does

lockdep_register_key(&dev->cfg_access_key);

in pci_device_add() and doesn't unregister the key when the pci device is
removed (and potentially freed); so basically 7e89efc6e9e4 was missing a

lockdep_unregister_key();

in pci_destroy_dev().

Based on the above I wonder if 7e89efc6e9e4 could also lead to the
corruption of lock_keys_hash after a pci device is removed.
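
To make the imbalance concrete, a hypothetical sketch of the pairing
described above, using only the identifiers already named
(dev->cfg_access_key, pci_device_add(), pci_destroy_dev()); this is
illustrative only, since the series in [2] simply reverts the commit:

	/* pci_device_add(): registration added by 7e89efc6e9e4. */
	lockdep_register_key(&dev->cfg_access_key);

	/* pci_destroy_dev(): the matching teardown that is missing, so the
	 * key can stay linked into lock_keys_hash after the pci_dev (and
	 * the key embedded in it) is freed. */
	lockdep_unregister_key(&dev->cfg_access_key);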

--Imre

[1] https://intel-gfx-ci.01.org/tree/drm-tip/IGT_7875/bat-atsm-1/dmesg0.txt
[2] https://lore.kernel.org/all/171711746402.1628941.14575335981264103013.stgit@dwillia2-xfh.jf.intel.com/
Dan Williams June 4, 2024, 3:20 p.m. UTC | #12
Imre Deak wrote:
> Hi,
> 
> [Sorry for the previous message, resending it now
>  with proper In-reply-to: header added.]
> 
> I see a similar issue, a corruption in the lock_keys_hash while
> alloc_workqueue()->lockdep_register_key() iterates it, see [1] for the
> stacktrace.
> 
> Not sure if related or even will solve [1], but [2] will revert
> 
> commit 7e89efc6e9e4 ("PCI: Lock upstream bridge for pci_reset_function()")
> 
> which does
> 
> lockdep_register_key(&dev->cfg_access_key);
> 
> in pci_device_add() and doesn't unregister the key when the pci device is
> removed (and potentially freed); so basically 7e89efc6e9e4 was missing a
> 
> lockdep_unregister_key();
> 
> in pci_destroy_dev().
> 
> Based on the above I wonder if 7e89efc6e9e4 could also lead to the
> corruption of lock_keys_hash after a pci device is removed.o

Are you running with the revert applied and still seeing issues?
Imre Deak June 4, 2024, 3:45 p.m. UTC | #13
On Tue, Jun 04, 2024 at 08:20:53AM -0700, Dan Williams wrote:
> Imre Deak wrote:
> > Hi,
> > 
> > [Sorry for the previous message, resending it now
> >  with proper In-reply-to: header added.]
> > 
> > I see a similar issue, a corruption in the lock_keys_hash while
> > alloc_workqueue()->lockdep_register_key() iterates it, see [1] for the
> > stacktrace.
> > 
> > Not sure if related or even will solve [1], but [2] will revert
> > 
> > commit 7e89efc6e9e4 ("PCI: Lock upstream bridge for pci_reset_function()")
> > 
> > which does
> > 
> > lockdep_register_key(&dev->cfg_access_key);
> > 
> > in pci_device_add() and doesn't unregister the key when the pci device is
> > removed (and potentially freed); so basically 7e89efc6e9e4 was missing a
> > 
> > lockdep_unregister_key();
> > 
> > in pci_destroy_dev().
> > 
> > Based on the above I wonder if 7e89efc6e9e4 could also lead to the
> > corruption of lock_keys_hash after a pci device is removed.o
> 
> Are you running with the revert applied and still seeing issues?

The revert is not yet applied and so [1] happened with a kernel
containing 7e89efc6e9e4.

[1] https://intel-gfx-ci.01.org/tree/drm-tip/IGT_7875/bat-atsm-1/dmesg0.txt
Tejun Heo June 4, 2024, 4:30 p.m. UTC | #14
Hello, Leon.

On Tue, Jun 04, 2024 at 02:38:34PM +0300, Leon Romanovsky wrote:
> Thanks, it is very rare situation where call to flush/drain queue
> (in our case kthread_flush_worker) in the middle of the allocation
> flow can be correct. I can't remember any such case.
>
> So even we don't fully understand the root cause, the reimplementation
> is still valid and improves existing code.

It's not valid. pwq release is async while wq free in the error path
isn't. The flush is there so that we finish the async part before the
synchronous error handling. The patch you posted can lead to a double
free after a pwq allocation failure. We can make the error path synchronous
but the pwq free path should be updated first so that it stays synchronous
in the error path. Note that it *needs* to be asynchronous in non-error
paths, so it's going to be a bit subtle one way or the other.
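
For reference, a sketch of the two paths being contrasted, pieced together
from the snippets quoted elsewhere in this thread and the hunk removed by
the patch below (not a complete listing):

	/* Async side: pwq_release_workfn(), run from pwq_release_worker,
	 * drops the last pwq reference and frees the wq via RCU. */
	if (is_last) {
		wq_unregister_lockdep(wq);
		call_rcu(&wq->rcu, rcu_free_wq);
	}

	/* Sync side: alloc_and_link_pwqs() error path.  The flush makes
	 * sure the async release above has finished before the error path
	 * in alloc_workqueue() kfree()s the wq synchronously. */
	if (ret)
		kthread_flush_worker(pwq_release_worker);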

Thanks.
Leon Romanovsky June 4, 2024, 6:58 p.m. UTC | #15
On Tue, Jun 04, 2024 at 06:30:49AM -1000, Tejun Heo wrote:
> Hello, Leon.
> 
> On Tue, Jun 04, 2024 at 02:38:34PM +0300, Leon Romanovsky wrote:
> > Thanks, it is very rare situation where call to flush/drain queue
> > (in our case kthread_flush_worker) in the middle of the allocation
> > flow can be correct. I can't remember any such case.
> >
> > So even we don't fully understand the root cause, the reimplementation
> > is still valid and improves existing code.
> 
> It's not valid. pwq release is async while wq free in the error path
> isn't. The flush is there so that we finish the async part before the
> synchronous error handling. The patch you posted can lead to a double
> free after a pwq allocation failure. We can make the error path synchronous
> but the pwq free path should be updated first so that it stays synchronous
> in the error path. Note that it *needs* to be asynchronous in non-error
> paths, so it's going to be a bit subtle one way or the other.

But at that point, we didn't add newly created WQ to any list which will execute
that asynchronous release. Did I miss something?

Anyway, I understand that the lockdep_register_key() corruption comes
from something else. Do you have any idea what can cause it? How can we
help debug this issue?

Thanks

> 
> Thanks.
> 
> -- 
> tejun
>
Tejun Heo June 4, 2024, 8:04 p.m. UTC | #16
Hello,

On Tue, Jun 04, 2024 at 09:58:04PM +0300, Leon Romanovsky wrote:
> But at that point, we didn't add newly created WQ to any list which will execute
> that asynchronous release. Did I miss something?

So, wq itself is not the problem. There are multiple pwq's that get attached
to a wq and each pwq is refcnt'd and released asynchronously. Over time,
during wq init, how the error paths behave diverged - pwq's still take the
async path while wq error path stayed synchronous. The flush is there to
match them. A cleaner solution would be either turning everything async or
sync.

> Anyway, I understand that the lockdep_register_key() corruption comes
> from something else. Do you have any idea what can cause it? How can we
> help debug this issue?

It looks like other guys are already looking at another commit, but focusing
on the backtrace which prematurely freed the reported object (rather than
the backtrace which stumbled upon it while walking a shared data structure)
should help find the actual culprit.

Thanks.
Hillf Danton June 5, 2024, 11:10 a.m. UTC | #17
On Tue, 4 Jun 2024 21:58:04 +0300 Leon Romanovsky <leon@kernel.org>
> On Tue, Jun 04, 2024 at 06:30:49AM -1000, Tejun Heo wrote:
> > On Tue, Jun 04, 2024 at 02:38:34PM +0300, Leon Romanovsky wrote:
> > > Thanks, it is very rare situation where call to flush/drain queue
> > > (in our case kthread_flush_worker) in the middle of the allocation
> > > flow can be correct. I can't remember any such case.
> > >
> > > So even we don't fully understand the root cause, the reimplementation
> > > is still valid and improves existing code.
> > 
> > It's not valid. pwq release is async while wq free in the error path
> > isn't. The flush is there so that we finish the async part before the
> > synchronous error handling. The patch you posted can lead to a double
> > free after a pwq allocation failure. We can make the error path synchronous
> > but the pwq free path should be updated first so that it stays synchronous
> > in the error path. Note that it *needs* to be asynchronous in non-error
> > paths, so it's going to be a bit subtle one way or the other.
> 
> But at that point, we didn't add newly created WQ to any list which will execute
> that asynchronous release. Did I miss something?
> 
Maybe it is more subtle than thought, but not difficult to make the wq
allocation path sync. See if the patch could survive your test.

--- x/include/linux/workqueue.h
+++ y/include/linux/workqueue.h
@@ -402,6 +402,7 @@ enum wq_flags {
 	 */
 	WQ_POWER_EFFICIENT	= 1 << 7,
 
+	__WQ_INITIALIZING 	= 1 << 14, /* internal: workqueue is initializing */
 	__WQ_DESTROYING		= 1 << 15, /* internal: workqueue is destroying */
 	__WQ_DRAINING		= 1 << 16, /* internal: workqueue is draining */
 	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
--- x/kernel/workqueue.c
+++ y/kernel/workqueue.c
@@ -5080,6 +5080,8 @@ static void pwq_release_workfn(struct kt
 	 * is gonna access it anymore.  Schedule RCU free.
 	 */
 	if (is_last) {
+		if (wq->flags & __WQ_INITIALIZING)
+			return;
 		wq_unregister_lockdep(wq);
 		call_rcu(&wq->rcu, rcu_free_wq);
 	}
@@ -5714,8 +5716,10 @@ struct workqueue_struct *alloc_workqueue
 			goto err_unreg_lockdep;
 	}
 
+	wq->flags |= __WQ_INITIALIZING;
 	if (alloc_and_link_pwqs(wq) < 0)
 		goto err_free_node_nr_active;
+	wq->flags &= ~__WQ_INITIALIZING;
 
 	if (wq_online && init_rescuer(wq) < 0)
 		goto err_destroy;
--
Leon Romanovsky June 6, 2024, 7:38 a.m. UTC | #18
On Wed, Jun 05, 2024 at 07:10:55PM +0800, Hillf Danton wrote:
> On Tue, 4 Jun 2024 21:58:04 +0300 Leon Romanovsky <leon@kernel.org>
> > On Tue, Jun 04, 2024 at 06:30:49AM -1000, Tejun Heo wrote:
> > > On Tue, Jun 04, 2024 at 02:38:34PM +0300, Leon Romanovsky wrote:
> > > > Thanks, it is very rare situation where call to flush/drain queue
> > > > (in our case kthread_flush_worker) in the middle of the allocation
> > > > flow can be correct. I can't remember any such case.
> > > >
> > > > So even we don't fully understand the root cause, the reimplementation
> > > > is still valid and improves existing code.
> > > 
> > > It's not valid. pwq release is async while wq free in the error path
> > > isn't. The flush is there so that we finish the async part before the
> > > synchronous error handling. The patch you posted can lead to a double
> > > free after a pwq allocation failure. We can make the error path synchronous
> > > but the pwq free path should be updated first so that it stays synchronous
> > > in the error path. Note that it *needs* to be asynchronous in non-error
> > > paths, so it's going to be a bit subtle one way or the other.
> > 
> > But at that point, we didn't add newly created WQ to any list which will execute
> > that asynchronous release. Did I miss something?
> > 
> Maybe it is more subtle than thought, but not difficult to make the wq
> allocation path sync. See if the patch could survive your test.

Thanks, I started to run our tests with Dan's revert.
https://lore.kernel.org/all/171711745834.1628941.5259278474013108507.stgit@dwillia2-xfh.jf.intel.com/

As preliminary results, it fixed my lockdep warnings, but it will take time till I get full confidence.
If not, I will try your patch.

Thanks

> 
> --- x/include/linux/workqueue.h
> +++ y/include/linux/workqueue.h
> @@ -402,6 +402,7 @@ enum wq_flags {
>  	 */
>  	WQ_POWER_EFFICIENT	= 1 << 7,
>  
> +	__WQ_INITIALIZING 	= 1 << 14, /* internal: workqueue is initializing */
>  	__WQ_DESTROYING		= 1 << 15, /* internal: workqueue is destroying */
>  	__WQ_DRAINING		= 1 << 16, /* internal: workqueue is draining */
>  	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
> --- x/kernel/workqueue.c
> +++ y/kernel/workqueue.c
> @@ -5080,6 +5080,8 @@ static void pwq_release_workfn(struct kt
>  	 * is gonna access it anymore.  Schedule RCU free.
>  	 */
>  	if (is_last) {
> +		if (wq->flags & __WQ_INITIALIZING)
> +			return;
>  		wq_unregister_lockdep(wq);
>  		call_rcu(&wq->rcu, rcu_free_wq);
>  	}
> @@ -5714,8 +5716,10 @@ struct workqueue_struct *alloc_workqueue
>  			goto err_unreg_lockdep;
>  	}
>  
> +	wq->flags |= __WQ_INITIALIZING;
>  	if (alloc_and_link_pwqs(wq) < 0)
>  		goto err_free_node_nr_active;
> +	wq->flags &= ~__WQ_INITIALIZING;
>  
>  	if (wq_online && init_rescuer(wq) < 0)
>  		goto err_destroy;
> --
>
Leon Romanovsky June 6, 2024, 10:29 a.m. UTC | #19
On Thu, Jun 06, 2024 at 10:38:01AM +0300, Leon Romanovsky wrote:
> On Wed, Jun 05, 2024 at 07:10:55PM +0800, Hillf Danton wrote:
> > On Tue, 4 Jun 2024 21:58:04 +0300 Leon Romanovsky <leon@kernel.org>
> > > On Tue, Jun 04, 2024 at 06:30:49AM -1000, Tejun Heo wrote:
> > > > On Tue, Jun 04, 2024 at 02:38:34PM +0300, Leon Romanovsky wrote:
> > > > > Thanks, it is very rare situation where call to flush/drain queue
> > > > > (in our case kthread_flush_worker) in the middle of the allocation
> > > > > flow can be correct. I can't remember any such case.
> > > > >
> > > > > So even we don't fully understand the root cause, the reimplementation
> > > > > is still valid and improves existing code.
> > > > 
> > > > It's not valid. pwq release is async while wq free in the error path
> > > > isn't. The flush is there so that we finish the async part before the
> > > > synchronous error handling. The patch you posted can lead to a double
> > > > free after a pwq allocation failure. We can make the error path synchronous
> > > > but the pwq free path should be updated first so that it stays synchronous
> > > > in the error path. Note that it *needs* to be asynchronous in non-error
> > > > paths, so it's going to be a bit subtle one way or the other.
> > > 
> > > But at that point, we didn't add newly created WQ to any list which will execute
> > > that asynchronous release. Did I miss something?
> > > 
> > Maybe it is more subtle than thought, but not difficult to make the wq
> > allocation path sync. See if the patch could survive your test.
> 
> Thanks, I started to run our tests with Dan's revert.
> https://lore.kernel.org/all/171711745834.1628941.5259278474013108507.stgit@dwillia2-xfh.jf.intel.com/
> 
> As preliminary results, it fixed my lockdep warnings, but it will take time till I get full confidence.

Dan's series fixed the reported issue.

Thanks
Hillf Danton June 7, 2024, 11:04 a.m. UTC | #20
On Thu, 6 Jun 2024 13:29:21 +0300 Leon Romanovsky wrote:
> On Thu, Jun 06, 2024 at 10:38:01AM +0300, Leon Romanovsky wrote:
> > 
> > Thanks, I started to run our tests with Dan's revert.
> > https://lore.kernel.org/all/171711745834.1628941.5259278474013108507.stgit@dwillia2-xfh.jf.intel.com/
> > 
> > As preliminary results, it fixed my lockdep warnings, but it will take time till I get full confidence.
> 
> Dan's series fixed the reported issue.
> 
I am happy to nominate you for the 2024 Kidding Lore Award ($2) if the
uaf in the subject line got fixed with Dan's revert.

Patch

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 003474c9a77d..c3df0a1f908b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5454,7 +5454,7 @@  static void wq_update_pod(struct workqueue_struct *wq, int cpu,
 static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 {
 	bool highpri = wq->flags & WQ_HIGHPRI;
-	int cpu, ret;
+	int cpu, ret = -ENOMEM;
 
 	wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
 	if (!wq->cpu_pwq)
@@ -5502,14 +5502,10 @@  static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 		ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
 	}
 	cpus_read_unlock();
-
-	/* for unbound pwq, flush the pwq_release_worker ensures that the
-	 * pwq_release_workfn() completes before calling kfree(wq).
-	 */
 	if (ret)
-		kthread_flush_worker(pwq_release_worker);
+		goto enomem;
 
-	return ret;
+	return 0;
 
 enomem:
 	if (wq->cpu_pwq) {
@@ -5522,7 +5518,7 @@  static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 		free_percpu(wq->cpu_pwq);
 		wq->cpu_pwq = NULL;
 	}
-	return -ENOMEM;
+	return ret;
 }
 
 static int wq_clamp_max_active(int max_active, unsigned int flags,
@@ -5714,7 +5710,7 @@  struct workqueue_struct *alloc_workqueue(const char *fmt,
 			goto err_unreg_lockdep;
 	}
 
-	if (alloc_and_link_pwqs(wq) < 0)
+	if (alloc_and_link_pwqs(wq))
 		goto err_free_node_nr_active;
 
 	if (wq_online && init_rescuer(wq) < 0)