diff mbox series

[RFC,v2] fs/namespace: use percpu_rw_semaphore for writer holding

Message ID 20200702154646.qkrzchuttrywvuud@linutronix.de (mailing list archive)
State New, archived
Headers show
Series [RFC,v2] fs/namespace: use percpu_rw_semaphore for writer holding | expand

Commit Message

Sebastian Sewior July 2, 2020, 3:46 p.m. UTC
From: John Ogness <john.ogness@linutronix.de>

The MNT_WRITE_HOLD flag is used to manually implement a rwsem. Remove
that flag and instead use a percpu_rw_semaphore, which effectively
provides the same functionality. This allows lockdep to be used for
the writer holding, allows CONFIG_PREEMPT to preempt held writers,
and simplifies the writer holding code.

Note that writer holding is now performed on a per super block basis
rather than a per mount basis.

Co-developed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
v1…v2:
  - Switch two percpu rwsem

TEST COMMAND
lat_mmap -P $pval -W 32 -N 50 64m file

On a 32 core Xeon.

OUTPUT FORMAT
pval: avg std

Preemptible Kernel
           BASE           RWS4
 1:   289.40  2.97   286.00  2.35
 2:   307.60  3.58   292.40  3.65
 4:   323.20  3.56   313.20  2.28
 8:   375.00  6.08   361.20  1.30
16:   412.40  1.34   413.00  2.83
32:   611.80  4.32   613.40  4.28
64:  1239.00  7.84  1251.00  9.30

Preemptible Kernel, no PTI
           BASE           RWS4
 1:   286.00  1.22   289.20  3.35
 2:   312.00  9.06   311.00  3.87
 4:   321.20  5.17   394.40  2.30
 8:   368.60  2.07   447.80  3.35
16:   411.40  2.41   476.40  1.14
32:   623.40  6.47   651.00  3.39
64:  1264.20  9.55  1292.60  2.51

Voluntary Kernel Preemption
           BASE           RWS4
 1:   285.80  1.48   283.40  5.94
 2:   303.40  3.78   303.60  3.21
 4:   315.20  7.29   309.80  3.49
 8:   361.40  2.30   361.00  7.00
16:   415.40  3.36   419.80  6.14
32:   606.60  5.13   612.80  5.40
64:  1211.40  7.27  1205.20  8.04

Voluntary Kernel Preemption, no PTI
           BASE           RWS4
 1:   281.60  2.51   293.80  3.03
 2:   295.00  3.81   321.00  5.96
 4:   312.20  2.77   372.40  8.73
 8:   359.20  0.45   424.40  1.82
16:   413.20  3.96   449.40  2.30
32:   623.20  5.36   635.80  1.64
64:  1212.00  6.63  1229.00  5.15

No Forced Preemption
           BASE           RWS4
 1:   273.80  0.45   275.40  0.55
 2:   291.60  7.20   285.60  1.82
 4:   310.60  4.22   302.20  1.79
 8:   362.40  3.05   350.20  1.30
16:   416.40  3.78   405.00  2.92
32:   626.20  2.28   611.80  1.30
64:  1215.80  3.83  1185.60 11.67

No Forced Preemption, no PTI
           BASE           RWS4
 1:   274.80  2.68   274.00  3.39
 2:   291.00  5.66   289.60  3.05
 4:   304.00  2.74   303.20  1.48
 8:   352.40  1.82   350.60  2.30
16:   408.40  0.89   406.20  2.49
32:   605.20  3.27   607.20  3.96
64:  1189.80  4.09  1194.40  1.82

 fs/mount.h            |  2 +
 fs/namespace.c        | 99 ++++++++++++++++---------------------------
 fs/super.c            |  2 +
 include/linux/fs.h    |  1 +
 include/linux/mount.h |  6 +--
 5 files changed, 44 insertions(+), 66 deletions(-)

Comments

Al Viro July 19, 2020, 1:49 a.m. UTC | #1
On Thu, Jul 02, 2020 at 05:46:46PM +0200, Sebastian Andrzej Siewior wrote:

> The MNT_WRITE_HOLD flag is used to manually implement a rwsem.

Could you show me where does it currently sleep?  Your version does,
unless I'm misreading it...
John Ogness July 20, 2020, 3:35 p.m. UTC | #2
On 2020-07-19, Al Viro <viro@zeniv.linux.org.uk> wrote:
>> The MNT_WRITE_HOLD flag is used to manually implement a rwsem.
>
> Could you show me where does it currently sleep?  Your version does,
> unless I'm misreading it...

You are reading it correctly. This patch introduces new possible
sleeping for__mnt_want_write() when writers for a superblock are being
held.

The RFCv1 [0] is a variant that does not introduce sleeping, but instead
reverts back to per-cpu spinlocks.

Sebastian and I are requesting comments on these two possible
solutions. Or perhaps you have an idea how to solve the potential live
lock situation.

John Ogness

[0] https://lkml.kernel.org/r/20200617104058.14902-2-john.ogness@linutronix.de
diff mbox series

Patch

diff --git a/fs/mount.h b/fs/mount.h
index c7abb7b394d8e..67ef1f87b89fe 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -4,6 +4,7 @@ 
 #include <linux/poll.h>
 #include <linux/ns_common.h>
 #include <linux/fs_pin.h>
+#include <linux/percpu-rwsem.h>
 
 struct mnt_namespace {
 	atomic_t		count;
@@ -46,6 +47,7 @@  struct mount {
 		struct rcu_head mnt_rcu;
 		struct llist_node mnt_llist;
 	};
+	struct percpu_rw_semaphore mnt_writers_rws;
 #ifdef CONFIG_SMP
 	struct mnt_pcp __percpu *mnt_pcp;
 #else
diff --git a/fs/namespace.c b/fs/namespace.c
index f30ed401cc6d7..5275f73ce7443 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -311,30 +311,17 @@  static int mnt_is_readonly(struct vfsmount *mnt)
 int __mnt_want_write(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
+	struct super_block *sb = m->mnt_sb;
 	int ret = 0;
 
-	preempt_disable();
-	mnt_inc_writers(mnt);
-	/*
-	 * The store to mnt_inc_writers must be visible before we pass
-	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
-	 * incremented count after it has set MNT_WRITE_HOLD.
-	 */
-	smp_mb();
-	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
-		cpu_relax();
-	/*
-	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
-	 * be set to match its requirements. So we must not load that until
-	 * MNT_WRITE_HOLD is cleared.
-	 */
-	smp_rmb();
-	if (mnt_is_readonly(m)) {
-		mnt_dec_writers(mnt);
-		ret = -EROFS;
-	}
-	preempt_enable();
+	percpu_down_read(&sb->mnt_writers_rws);
 
+	if (mnt_is_readonly(m))
+		ret = -EROFS;
+	else
+		mnt_inc_writers(mnt);
+
+	percpu_up_read(&sb->mnt_writers_rws);
 	return ret;
 }
 
@@ -373,12 +360,14 @@  EXPORT_SYMBOL_GPL(mnt_want_write);
  */
 int mnt_clone_write(struct vfsmount *mnt)
 {
+	struct super_block *sb = mnt->mnt_sb;
+
 	/* superblock may be r/o */
 	if (__mnt_is_readonly(mnt))
 		return -EROFS;
-	preempt_disable();
+	percpu_down_read(&sb->mnt_writers_rws);
 	mnt_inc_writers(real_mount(mnt));
-	preempt_enable();
+	percpu_up_read(&sb->mnt_writers_rws);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mnt_clone_write);
@@ -427,9 +416,11 @@  EXPORT_SYMBOL_GPL(mnt_want_write_file);
  */
 void __mnt_drop_write(struct vfsmount *mnt)
 {
-	preempt_disable();
+	struct super_block *sb = mnt->mnt_sb;
+
+	percpu_down_read(&sb->mnt_writers_rws);
 	mnt_dec_writers(real_mount(mnt));
-	preempt_enable();
+	percpu_up_read(&sb->mnt_writers_rws);
 }
 
 /**
@@ -459,53 +450,38 @@  void mnt_drop_write_file(struct file *file)
 }
 EXPORT_SYMBOL(mnt_drop_write_file);
 
-static int mnt_make_readonly(struct mount *mnt)
+static int mnt_make_readonly(struct super_block *sb, struct mount *mnt)
 {
 	int ret = 0;
 
+	percpu_down_write(&sb->mnt_writers_rws);
 	lock_mount_hash();
-	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
-	/*
-	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
-	 * should be visible before we do.
-	 */
-	smp_mb();
 
 	/*
 	 * With writers on hold, if this value is zero, then there are
-	 * definitely no active writers (although held writers may subsequently
-	 * increment the count, they'll have to wait, and decrement it after
-	 * seeing MNT_READONLY).
+	 * definitely no active writers.
 	 *
 	 * It is OK to have counter incremented on one CPU and decremented on
-	 * another: the sum will add up correctly. The danger would be when we
-	 * sum up each counter, if we read a counter before it is incremented,
-	 * but then read another CPU's count which it has been subsequently
-	 * decremented from -- we would see more decrements than we should.
-	 * MNT_WRITE_HOLD protects against this scenario, because
-	 * mnt_want_write first increments count, then smp_mb, then spins on
-	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
-	 * we're counting up here.
+	 * another: the sum will add up correctly. The rwsem ensures that the
+	 * counters are not modified once the writer lock is acquired.
 	 */
 	if (mnt_get_writers(mnt) > 0)
 		ret = -EBUSY;
 	else
 		mnt->mnt.mnt_flags |= MNT_READONLY;
-	/*
-	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
-	 * that become unheld will see MNT_READONLY.
-	 */
-	smp_wmb();
-	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+
 	unlock_mount_hash();
+	percpu_up_write(&sb->mnt_writers_rws);
 	return ret;
 }
 
-static int __mnt_unmake_readonly(struct mount *mnt)
+static int __mnt_unmake_readonly(struct super_block *sb, struct mount *mnt)
 {
+	percpu_down_write(&sb->mnt_writers_rws);
 	lock_mount_hash();
 	mnt->mnt.mnt_flags &= ~MNT_READONLY;
 	unlock_mount_hash();
+	percpu_up_write(&sb->mnt_writers_rws);
 	return 0;
 }
 
@@ -514,15 +490,15 @@  int sb_prepare_remount_readonly(struct super_block *sb)
 	struct mount *mnt;
 	int err = 0;
 
-	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
+	/* Racy optimization.  Recheck the counter under mnt_writers_rws. */
 	if (atomic_long_read(&sb->s_remove_count))
 		return -EBUSY;
 
+	percpu_down_write(&sb->mnt_writers_rws);
 	lock_mount_hash();
+
 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
 		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
-			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
-			smp_mb();
 			if (mnt_get_writers(mnt) > 0) {
 				err = -EBUSY;
 				break;
@@ -536,11 +512,9 @@  int sb_prepare_remount_readonly(struct super_block *sb)
 		sb->s_readonly_remount = 1;
 		smp_wmb();
 	}
-	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
-		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
-			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
-	}
+
 	unlock_mount_hash();
+	percpu_up_write(&sb->mnt_writers_rws);
 
 	return err;
 }
@@ -1052,7 +1026,7 @@  static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	}
 
 	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
-	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
+	mnt->mnt.mnt_flags &= ~(MNT_MARKED|MNT_INTERNAL);
 
 	atomic_inc(&sb->s_active);
 	mnt->mnt.mnt_sb = sb;
@@ -2498,7 +2472,8 @@  static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
 	return true;
 }
 
-static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
+static int change_mount_ro_state(struct super_block *sb,
+				 struct mount *mnt, unsigned int mnt_flags)
 {
 	bool readonly_request = (mnt_flags & MNT_READONLY);
 
@@ -2506,9 +2481,9 @@  static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
 		return 0;
 
 	if (readonly_request)
-		return mnt_make_readonly(mnt);
+		return mnt_make_readonly(sb, mnt);
 
-	return __mnt_unmake_readonly(mnt);
+	return __mnt_unmake_readonly(sb, mnt);
 }
 
 /*
@@ -2567,7 +2542,7 @@  static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
 		return -EPERM;
 
 	down_write(&sb->s_umount);
-	ret = change_mount_ro_state(mnt, mnt_flags);
+	ret = change_mount_ro_state(sb, mnt, mnt_flags);
 	if (ret == 0)
 		set_mount_attributes(mnt, mnt_flags);
 	up_write(&sb->s_umount);
diff --git a/fs/super.c b/fs/super.c
index 904459b351199..5c70bc135ba27 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -210,6 +210,8 @@  static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	INIT_LIST_HEAD(&s->s_mounts);
 	s->s_user_ns = get_user_ns(user_ns);
 	init_rwsem(&s->s_umount);
+	percpu_init_rwsem(&s->mnt_writers_rws);
+
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
 	/*
 	 * sget() can have s_umount recursion.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3f881a892ea74..2406e823a37df 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1464,6 +1464,7 @@  struct super_block {
 #endif
 	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
+	struct percpu_rw_semaphore mnt_writers_rws;
 	struct block_device	*s_bdev;
 	struct backing_dev_info *s_bdi;
 	struct mtd_info		*s_mtd;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index de657bd211fa6..dce9bda5eeaa1 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -32,7 +32,6 @@  struct fs_context;
 #define MNT_READONLY	0x40	/* does the user want this to be r/o? */
 
 #define MNT_SHRINKABLE	0x100
-#define MNT_WRITE_HOLD	0x200
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -49,9 +48,8 @@  struct fs_context;
 				 | MNT_READONLY)
 #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
 
-#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
-			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \
-			    MNT_CURSOR)
+#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_INTERNAL | MNT_CURSOR | \
+			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
 
 #define MNT_INTERNAL	0x4000