Message ID | 20240426195429.28547-2-lkarpins@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | fs/namespace: defer RCU sync for MNT_DETACH umount | expand |
On Fri, Apr 26, 2024 at 03:53:48PM -0400, Lucas Karpinski wrote: > -static void namespace_unlock(void) > +static void free_mounts(struct hlist_head *mount_list) > { > - struct hlist_head head; > struct hlist_node *p; > struct mount *m; > + > + hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) { > + hlist_del(&m->mnt_umount); > + mntput(&m->mnt); ... which may block in quite a few ways. > + } > +} > + > +static void delayed_mount_release(struct rcu_head *head) > +{ > + struct mount_delayed_release *drelease = > + container_of(head, struct mount_delayed_release, rcu); > + > + free_mounts(&drelease->release_list); ... and therefore so can this. > + kfree(drelease); > +} > + call_rcu(&drelease->rcu, delayed_mount_release); ... which is a bad idea, since call_rcu() callbacks are run from interrupt context. Which makes blocking in them a problem.
On Fri, Apr 26, 2024 at 09:09:41PM +0100, Al Viro wrote: > > + call_rcu(&drelease->rcu, delayed_mount_release); > > ... which is a bad idea, since call_rcu() callbacks are run > from interrupt context. Which makes blocking in them a problem. > Thanks for the quick review. Documentation/RCU/checklist.rst suggests switching to queue_rcu_work() function in scenarios where the callback function can block. This seems like it would fix the issue you found, while still providing similar performance improvements. workqueue: perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt 0.003066 +- 0.000307 seconds time elapsed ( +- 10.02% ) callrcu: perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt 0.0030812 +- 0.0000524 seconds time elapsed ( +- 1.70% ) Regards, Lucas
Hello, kernel test robot noticed "WARNING:inconsistent_lock_state" on: commit: 9a4131bb20fbeb5980c3e21709488b9c4ef2eee5 ("[RFC v2 1/1] fs/namespace: defer RCU sync for MNT_DETACH umount") url: https://github.com/intel-lab-lkp/linux/commits/Lucas-Karpinski/fs-namespace-defer-RCU-sync-for-MNT_DETACH-umount/20240427-035724 base: https://git.kernel.org/cgit/linux/kernel/git/vfs/vfs.git vfs.all patch link: https://lore.kernel.org/all/20240426195429.28547-2-lkarpins@redhat.com/ patch subject: [RFC v2 1/1] fs/namespace: defer RCU sync for MNT_DETACH umount in testcase: boot compiler: gcc-7 test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G (please refer to attached dmesg/kmsg for entire log/backtrace) +---------------------------------------------------+------------+------------+ | | eea3260250 | 9a4131bb20 | +---------------------------------------------------+------------+------------+ | WARNING:inconsistent_lock_state | 0 | 12 | | inconsistent{SOFTIRQ-ON-W}->{IN-SOFTIRQ-W}usage | 0 | 12 | +---------------------------------------------------+------------+------------+ If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <oliver.sang@intel.com> | Closes: https://lore.kernel.org/oe-lkp/202404302220.f8959d4c-oliver.sang@intel.com [ 19.407878][ C0] [ 19.408164][ C0] ================================ [ 19.408629][ C0] WARNING: inconsistent lock state [ 19.409083][ C0] 6.9.0-rc3-00075-g9a4131bb20fb #2 Tainted: G W TN [ 19.409757][ C0] -------------------------------- [ 19.410209][ C0] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. [ 19.410777][ C0] ksoftirqd/0/10 [HC0[0]:SC1[3]:HE1:SE0] takes: [ 19.411306][ C0] ffffffff83066cd0 (mount_lock){+.?.}-{2:2}, at: mntput_no_expire (include/linux/seqlock.h:469 include/linux/seqlock.h:495 include/linux/seqlock.h:823 fs/namespace.c:114 fs/namespace.c:1314) [ 19.412072][ C0] {SOFTIRQ-ON-W} state was registered at: [ 19.412574][ C0] lock_acquire (kernel/locking/lockdep.c:467 kernel/locking/lockdep.c:5756) [ 19.412999][ C0] _raw_spin_lock (include/linux/spinlock_api_smp.h:134 kernel/locking/spinlock.c:154) [ 19.413421][ C0] vfs_create_mount (include/linux/seqlock.h:469 include/linux/seqlock.h:495 include/linux/seqlock.h:823 fs/namespace.c:114 fs/namespace.c:1122) [ 19.413862][ C0] fc_mount (fs/namespace.c:1134) [ 19.414246][ C0] vfs_kern_mount (fs/namespace.c:1148 fs/namespace.c:1181) [ 19.414713][ C0] kern_mount (fs/namespace.c:5248) [ 19.415106][ C0] shmem_init (mm/shmem.c:4684) [ 19.415508][ C0] mnt_init (fs/namespace.c:5232) [ 19.415907][ C0] vfs_caches_init (fs/dcache.c:3187) [ 19.416333][ C0] start_kernel (init/main.c:1058) [ 19.416760][ C0] x86_64_start_reservations (arch/x86/kernel/head64.c:495) [ 19.417245][ C0] x86_64_start_kernel (arch/x86/kernel/head64.c:488) [ 19.417694][ C0] common_startup_64 (arch/x86/kernel/head_64.S:421) [ 19.418133][ C0] irq event stamp: 868370 [ 19.418530][ C0] hardirqs last enabled at (868370): __local_bh_enable_ip (arch/x86/include/asm/paravirt.h:698 kernel/softirq.c:387) [ 19.419370][ C0] hardirqs last disabled at (868369): __local_bh_enable_ip (kernel/softirq.c:364 (discriminator 1)) [ 19.420216][ C0] softirqs last enabled at (868292): __do_softirq (arch/x86/include/asm/preempt.h:26 kernel/softirq.c:401 kernel/softirq.c:583) [ 19.421025][ C0] softirqs last disabled at (868297): run_ksoftirqd (kernel/softirq.c:411 kernel/softirq.c:925) [ 19.421823][ C0] [ 19.421823][ C0] other info that might help us debug this: [ 19.422522][ C0] Possible unsafe locking scenario: [ 19.422522][ C0] [ 19.423184][ C0] CPU0 [ 19.423510][ C0] ---- [ 19.423838][ C0] lock(mount_lock); [ 19.424225][ C0] <Interrupt> [ 19.424567][ C0] lock(mount_lock); [ 19.425115][ C0] [ 19.425115][ C0] *** DEADLOCK *** [ 19.425115][ C0] [ 19.425858][ C0] 2 locks held by ksoftirqd/0/10: [ 19.426297][ C0] #0: ffffffff8301d700 (rcu_callback){....}-{0:0}, at: rcu_process_callbacks (include/linux/bottom_half.h:20 kernel/rcu/tiny.c:133) [ 19.427122][ C0] #1: ffffffff8301d7c0 (rcu_read_lock){....}-{1:2}, at: mntput_no_expire (include/linux/rcupdate.h:329 include/linux/rcupdate.h:781 fs/namespace.c:1299) [ 19.427921][ C0] [ 19.427921][ C0] stack backtrace: [ 19.428459][ C0] CPU: 0 PID: 10 Comm: ksoftirqd/0 Tainted: G W TN 6.9.0-rc3-00075-g9a4131bb20fb #2 [ 19.429328][ C0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [ 19.430179][ C0] Call Trace: [ 19.430505][ C0] <TASK> [ 19.430805][ C0] dump_stack_lvl (arch/x86/include/asm/paravirt.h:687 (discriminator 3) arch/x86/include/asm/irqflags.h:127 (discriminator 3) lib/dump_stack.c:117 (discriminator 3)) [ 19.431221][ C0] mark_lock+0x473/0x4c0 [ 19.431661][ C0] ? __lock_acquire (kernel/locking/lockdep.c:4599 kernel/locking/lockdep.c:5091) [ 19.432102][ C0] ? __lock_acquire (include/linux/hash.h:78 kernel/locking/lockdep.c:3759 kernel/locking/lockdep.c:3782 kernel/locking/lockdep.c:3837 kernel/locking/lockdep.c:5137) [ 19.432550][ C0] ? __lock_acquire (kernel/locking/lockdep.c:4567 kernel/locking/lockdep.c:5091) [ 19.432989][ C0] __lock_acquire (kernel/locking/lockdep.c:4567 kernel/locking/lockdep.c:5091) [ 19.433415][ C0] ? __lock_acquire (kernel/locking/lockdep.c:5134 (discriminator 9)) [ 19.433856][ C0] lock_acquire (kernel/locking/lockdep.c:467 kernel/locking/lockdep.c:5756) [ 19.434269][ C0] ? mntput_no_expire (include/linux/seqlock.h:469 include/linux/seqlock.h:495 include/linux/seqlock.h:823 fs/namespace.c:114 fs/namespace.c:1314) [ 19.434708][ C0] ? mntput_no_expire (include/linux/rcupdate.h:329 include/linux/rcupdate.h:781 fs/namespace.c:1299) [ 19.435144][ C0] _raw_spin_lock (include/linux/spinlock_api_smp.h:134 kernel/locking/spinlock.c:154) [ 19.435555][ C0] ? mntput_no_expire (include/linux/seqlock.h:469 include/linux/seqlock.h:495 include/linux/seqlock.h:823 fs/namespace.c:114 fs/namespace.c:1314) [ 19.435994][ C0] mntput_no_expire (include/linux/seqlock.h:469 include/linux/seqlock.h:495 include/linux/seqlock.h:823 fs/namespace.c:114 fs/namespace.c:1314) [ 19.436421][ C0] ? rcu_process_callbacks (include/linux/bottom_half.h:20 kernel/rcu/tiny.c:133) [ 19.436900][ C0] free_mounts (fs/namespace.c:1571) [ 19.437291][ C0] ? __x64_sys_mount_setattr (fs/namespace.c:1574) [ 19.437772][ C0] ? __x64_sys_mount_setattr (fs/namespace.c:1574) [ 19.438253][ C0] delayed_mount_release (fs/namespace.c:1579) [ 19.438699][ C0] rcu_process_callbacks (include/linux/rcupdate.h:339 kernel/rcu/tiny.c:103 kernel/rcu/tiny.c:134) [ 19.439151][ C0] __do_softirq (arch/x86/include/asm/atomic.h:23 include/linux/atomic/atomic-arch-fallback.h:457 include/linux/jump_label.h:260 include/linux/jump_label.h:270 include/trace/events/irq.h:142 kernel/softirq.c:555) [ 19.439556][ C0] ? smpboot_thread_fn (kernel/smpboot.c:112) [ 19.439997][ C0] ? sort_range (kernel/smpboot.c:107) [ 19.440389][ C0] run_ksoftirqd (kernel/softirq.c:411 kernel/softirq.c:925) [ 19.443032][ C0] smpboot_thread_fn (kernel/smpboot.c:164 (discriminator 3)) [ 19.443510][ C0] kthread (kernel/kthread.c:388) [ 19.443887][ C0] ? kthread_complete_and_exit (kernel/kthread.c:341) [ 19.444367][ C0] ret_from_fork (arch/x86/kernel/process.c:153) [ 19.444788][ C0] ? kthread_complete_and_exit (kernel/kthread.c:341) [ 19.445269][ C0] ret_from_fork_asm (arch/x86/entry/entry_64.S:253) [ 19.445696][ C0] </TASK> The kernel config and materials to reproduce are available at: https://download.01.org/0day-ci/archive/20240430/202404302220.f8959d4c-oliver.sang@intel.com
On 30/4/24 21:25, Lucas Karpinski wrote: > On Fri, Apr 26, 2024 at 09:09:41PM +0100, Al Viro wrote: >>> + call_rcu(&drelease->rcu, delayed_mount_release); >> ... which is a bad idea, since call_rcu() callbacks are run >> from interrupt context. Which makes blocking in them a problem. >> > Thanks for the quick review. > > Documentation/RCU/checklist.rst suggests switching to queue_rcu_work() > function in scenarios where the callback function can block. This seems > like it would fix the issue you found, while still providing similar > performance improvements. You know I've been looking at this and you can see that mntput() will just call mntput_no_expire() which queues work to do the bulk of the work and returns. So I'm wondering what would happen to the timing if you simply didn't call the rcu wait for the lazy umount case and left everything else as it is. Ian
diff --git a/fs/namespace.c b/fs/namespace.c index 5a51315c6678..df03fc0d1990 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init; static unsigned int mp_hash_mask __ro_after_init; static unsigned int mp_hash_shift __ro_after_init; +struct mount_delayed_release { + struct rcu_head rcu; + struct hlist_head release_list; +}; + static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) { @@ -78,6 +83,7 @@ static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +static bool lazy_unlock = false; /* protected by namespace_sem */ struct mount_kattr { unsigned int attr_set; @@ -1553,16 +1559,39 @@ int may_umount(struct vfsmount *mnt) EXPORT_SYMBOL(may_umount); -static void namespace_unlock(void) +static void free_mounts(struct hlist_head *mount_list) { - struct hlist_head head; struct hlist_node *p; struct mount *m; + + hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) { + hlist_del(&m->mnt_umount); + mntput(&m->mnt); + } +} + +static void delayed_mount_release(struct rcu_head *head) +{ + struct mount_delayed_release *drelease = + container_of(head, struct mount_delayed_release, rcu); + + free_mounts(&drelease->release_list); + kfree(drelease); +} + +static void namespace_unlock(void) +{ + HLIST_HEAD(head); LIST_HEAD(list); + bool lazy; + hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); + lazy = lazy_unlock; + lazy_unlock = false; + up_write(&namespace_sem); shrink_dentry_list(&list); @@ -1570,12 +1599,21 @@ static void namespace_unlock(void) if (likely(hlist_empty(&head))) return; - synchronize_rcu_expedited(); + if (lazy) { + struct mount_delayed_release *drelease = + kmalloc(sizeof(*drelease), GFP_KERNEL); - hlist_for_each_entry_safe(m, p, &head, mnt_umount) { - hlist_del(&m->mnt_umount); - mntput(&m->mnt); + if (unlikely(!drelease)) + goto out; + + hlist_move_list(&head, &drelease->release_list); + call_rcu(&drelease->rcu, delayed_mount_release); + return; } + +out: + synchronize_rcu_expedited(); + free_mounts(&head); } static inline void namespace_lock(void) @@ -1798,6 +1836,7 @@ static int do_umount(struct mount *mnt, int flags) } out: unlock_mount_hash(); + lazy_unlock = flags & MNT_DETACH ? true : false; namespace_unlock(); return retval; }