diff mbox series

[RFC] vfs: make rename_lock per-sb

Message ID 20191128154858.GA16668@miu.piliscsaba.redhat.com (mailing list archive)
State New, archived
Headers show
Series [RFC] vfs: make rename_lock per-sb | expand

Commit Message

Miklos Szeredi Nov. 28, 2019, 3:48 p.m. UTC
Overlayfs uses rename to atomically move copied up files into place.  This
can result in heavier than usual rename load, which in turn might result in
cross-filesystem serialization of tree walking and rename operations.

One notable case where this seems to have been manifesting itself is
shrink_dcache_for_umount(), where otherwise there would be no contention at
all.

Turning rename_lock into a per-sb lock mitigates the above issues.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/autofs/waitq.c      |    6 ++---
 fs/ceph/mds_client.c   |    4 +--
 fs/cifs/dir.c          |    8 ++++---
 fs/d_path.c            |   14 +++++++-----
 fs/dcache.c            |   55 +++++++++++++++++++++++++++----------------------
 fs/nfs/namespace.c     |    7 +++---
 fs/super.c             |    1 
 include/linux/dcache.h |    2 -
 include/linux/fs.h     |    2 +
 kernel/auditsc.c       |    5 ++--
 10 files changed, 59 insertions(+), 45 deletions(-)

Comments

Al Viro Dec. 8, 2019, 5:54 a.m. UTC | #1
On Thu, Nov 28, 2019 at 04:48:58PM +0100, Miklos Szeredi wrote:

> Turning rename_lock into a per-sb lock mitigates the above issues.

... and completely fucks d_lookup(), since false negative can be
caused by rename of *ANYTHING* you've encountered while walking
a hash chain.

NAK.
Miklos Szeredi Dec. 11, 2019, 11:33 a.m. UTC | #2
On Sun, Dec 8, 2019 at 6:54 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Thu, Nov 28, 2019 at 04:48:58PM +0100, Miklos Szeredi wrote:
>
> > Turning rename_lock into a per-sb lock mitigates the above issues.
>
> ... and completely fucks d_lookup(), since false negative can be
> caused by rename of *ANYTHING* you've encountered while walking
> a hash chain.

Drat.

Could we use the bitlock in the hash chain head for verification of a
negative?  Seems like much finer grained than having to rely on a
global seqlock for hash integrity.

Thanks,
Miklos
diff mbox series

Patch

--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -189,7 +189,7 @@  static int autofs_getpath(struct autofs_
 	buf = name;
 	len = 0;
 
-	seq = read_seqbegin(&rename_lock);
+	seq = read_seqbegin(&sbi->sb->rename_lock);
 	rcu_read_lock();
 	spin_lock(&sbi->fs_lock);
 	for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
@@ -198,7 +198,7 @@  static int autofs_getpath(struct autofs_
 	if (!len || --len > NAME_MAX) {
 		spin_unlock(&sbi->fs_lock);
 		rcu_read_unlock();
-		if (read_seqretry(&rename_lock, seq))
+		if (read_seqretry(&sbi->sb->rename_lock, seq))
 			goto rename_retry;
 		return 0;
 	}
@@ -214,7 +214,7 @@  static int autofs_getpath(struct autofs_
 	}
 	spin_unlock(&sbi->fs_lock);
 	rcu_read_unlock();
-	if (read_seqretry(&rename_lock, seq))
+	if (read_seqretry(&sbi->sb->rename_lock, seq))
 		goto rename_retry;
 
 	return len;
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2143,7 +2143,7 @@  char *ceph_mdsc_build_path(struct dentry
 	pos = PATH_MAX - 1;
 	path[pos] = '\0';
 
-	seq = read_seqbegin(&rename_lock);
+	seq = read_seqbegin(&dentry->d_sb->rename_lock);
 	rcu_read_lock();
 	temp = dentry;
 	for (;;) {
@@ -2182,7 +2182,7 @@  char *ceph_mdsc_build_path(struct dentry
 	}
 	base = ceph_ino(d_inode(temp));
 	rcu_read_unlock();
-	if (pos < 0 || read_seqretry(&rename_lock, seq)) {
+	if (pos < 0 || read_seqretry(&dentry->d_sb->rename_lock, seq)) {
 		pr_err("build_path did not end path lookup where "
 		       "expected, pos is %d\n", pos);
 		/* presumably this is only possible if racing with a
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -97,7 +97,8 @@  build_path_from_dentry_optional_prefix(s
 	int pplen = 0;
 	char *full_path;
 	char dirsep;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
+	struct super_block *sb = direntry->d_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
 	unsigned seq;
 
@@ -112,7 +113,7 @@  build_path_from_dentry_optional_prefix(s
 
 cifs_bp_rename_retry:
 	namelen = dfsplen + pplen;
-	seq = read_seqbegin(&rename_lock);
+	seq = read_seqbegin(&sb->rename_lock);
 	rcu_read_lock();
 	for (temp = direntry; !IS_ROOT(temp);) {
 		namelen += (1 + temp->d_name.len);
@@ -152,7 +153,8 @@  build_path_from_dentry_optional_prefix(s
 		}
 	}
 	rcu_read_unlock();
-	if (namelen != dfsplen + pplen || read_seqretry(&rename_lock, seq)) {
+	if (namelen != dfsplen + pplen ||
+	    read_seqretry(&sb->rename_lock, seq)) {
 		cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n",
 			 namelen, dfsplen);
 		/* presumably this is only possible if racing with a rename
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -76,6 +76,7 @@  static int prepend_path(const struct pat
 			const struct path *root,
 			char **buffer, int *buflen)
 {
+	struct super_block *sb = root->mnt->mnt_sb;
 	struct dentry *dentry;
 	struct vfsmount *vfsmnt;
 	struct mount *mnt;
@@ -96,7 +97,7 @@  static int prepend_path(const struct pat
 	dentry = path->dentry;
 	vfsmnt = path->mnt;
 	mnt = real_mount(vfsmnt);
-	read_seqbegin_or_lock(&rename_lock, &seq);
+	read_seqbegin_or_lock(&sb->rename_lock, &seq);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
@@ -132,11 +133,11 @@  static int prepend_path(const struct pat
 	}
 	if (!(seq & 1))
 		rcu_read_unlock();
-	if (need_seqretry(&rename_lock, seq)) {
+	if (need_seqretry(&sb->rename_lock, seq)) {
 		seq = 1;
 		goto restart;
 	}
-	done_seqretry(&rename_lock, seq);
+	done_seqretry(&sb->rename_lock, seq);
 
 	if (!(m_seq & 1))
 		rcu_read_unlock();
@@ -324,6 +325,7 @@  char *simple_dname(struct dentry *dentry
  */
 static char *__dentry_path(struct dentry *d, char *buf, int buflen)
 {
+	struct super_block *sb = d->d_sb;
 	struct dentry *dentry;
 	char *end, *retval;
 	int len, seq = 0;
@@ -341,7 +343,7 @@  static char *__dentry_path(struct dentry
 	/* Get '/' right */
 	retval = end-1;
 	*retval = '/';
-	read_seqbegin_or_lock(&rename_lock, &seq);
+	read_seqbegin_or_lock(&sb->rename_lock, &seq);
 	while (!IS_ROOT(dentry)) {
 		struct dentry *parent = dentry->d_parent;
 
@@ -355,11 +357,11 @@  static char *__dentry_path(struct dentry
 	}
 	if (!(seq & 1))
 		rcu_read_unlock();
-	if (need_seqretry(&rename_lock, seq)) {
+	if (need_seqretry(&sb->rename_lock, seq)) {
 		seq = 1;
 		goto restart;
 	}
-	done_seqretry(&rename_lock, seq);
+	done_seqretry(&sb->rename_lock, seq);
 	if (error)
 		goto Elong;
 	return retval;
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -74,10 +74,6 @@ 
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
-
-EXPORT_SYMBOL(rename_lock);
-
 static struct kmem_cache *dentry_cache __read_mostly;
 
 const struct qstr empty_name = QSTR_INIT("", 0);
@@ -1267,6 +1263,7 @@  enum d_walk_ret {
 static void d_walk(struct dentry *parent, void *data,
 		   enum d_walk_ret (*enter)(void *, struct dentry *))
 {
+	struct super_block *sb = parent->d_sb;
 	struct dentry *this_parent;
 	struct list_head *next;
 	unsigned seq = 0;
@@ -1274,7 +1271,7 @@  static void d_walk(struct dentry *parent
 	bool retry = true;
 
 again:
-	read_seqbegin_or_lock(&rename_lock, &seq);
+	read_seqbegin_or_lock(&sb->rename_lock, &seq);
 	this_parent = parent;
 	spin_lock(&this_parent->d_lock);
 
@@ -1339,7 +1336,7 @@  static void d_walk(struct dentry *parent
 		spin_lock(&this_parent->d_lock);
 
 		/* might go back up the wrong parent if we have had a rename. */
-		if (need_seqretry(&rename_lock, seq))
+		if (need_seqretry(&sb->rename_lock, seq))
 			goto rename_retry;
 		/* go into the first sibling still alive */
 		do {
@@ -1351,13 +1348,13 @@  static void d_walk(struct dentry *parent
 		rcu_read_unlock();
 		goto resume;
 	}
-	if (need_seqretry(&rename_lock, seq))
+	if (need_seqretry(&sb->rename_lock, seq))
 		goto rename_retry;
 	rcu_read_unlock();
 
 out_unlock:
 	spin_unlock(&this_parent->d_lock);
-	done_seqretry(&rename_lock, seq);
+	done_seqretry(&sb->rename_lock, seq);
 	return;
 
 rename_retry:
@@ -1419,9 +1416,10 @@  EXPORT_SYMBOL(path_has_submounts);
  */
 int d_set_mounted(struct dentry *dentry)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct dentry *p;
 	int ret = -ENOENT;
-	write_seqlock(&rename_lock);
+	write_seqlock(&sb->rename_lock);
 	for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
 		/* Need exclusion wrt. d_invalidate() */
 		spin_lock(&p->d_lock);
@@ -1441,7 +1439,7 @@  int d_set_mounted(struct dentry *dentry)
 	}
  	spin_unlock(&dentry->d_lock);
 out:
-	write_sequnlock(&rename_lock);
+	write_sequnlock(&sb->rename_lock);
 	return ret;
 }
 
@@ -2306,15 +2304,16 @@  struct dentry *__d_lookup_rcu(const stru
  */
 struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
 {
+	struct super_block *sb = parent->d_sb;
 	struct dentry *dentry;
 	unsigned seq;
 
 	do {
-		seq = read_seqbegin(&rename_lock);
+		seq = read_seqbegin(&sb->rename_lock);
 		dentry = __d_lookup(parent, name);
 		if (dentry)
 			break;
-	} while (read_seqretry(&rename_lock, seq));
+	} while (read_seqretry(&sb->rename_lock, seq));
 	return dentry;
 }
 EXPORT_SYMBOL(d_lookup);
@@ -2513,6 +2512,7 @@  struct dentry *d_alloc_parallel(struct d
 				const struct qstr *name,
 				wait_queue_head_t *wq)
 {
+	struct super_block *sb = parent->d_sb;
 	unsigned int hash = name->hash;
 	struct hlist_bl_head *b = in_lookup_hash(parent, hash);
 	struct hlist_bl_node *node;
@@ -2526,7 +2526,7 @@  struct dentry *d_alloc_parallel(struct d
 retry:
 	rcu_read_lock();
 	seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
-	r_seq = read_seqbegin(&rename_lock);
+	r_seq = read_seqbegin(&sb->rename_lock);
 	dentry = __d_lookup_rcu(parent, name, &d_seq);
 	if (unlikely(dentry)) {
 		if (!lockref_get_not_dead(&dentry->d_lockref)) {
@@ -2542,7 +2542,7 @@  struct dentry *d_alloc_parallel(struct d
 		dput(new);
 		return dentry;
 	}
-	if (unlikely(read_seqretry(&rename_lock, r_seq))) {
+	if (unlikely(read_seqretry(&sb->rename_lock, r_seq))) {
 		rcu_read_unlock();
 		goto retry;
 	}
@@ -2890,9 +2890,11 @@  static void __d_move(struct dentry *dent
  */
 void d_move(struct dentry *dentry, struct dentry *target)
 {
-	write_seqlock(&rename_lock);
+	struct super_block *sb = dentry->d_sb;
+
+	write_seqlock(&sb->rename_lock);
 	__d_move(dentry, target, false);
-	write_sequnlock(&rename_lock);
+	write_sequnlock(&sb->rename_lock);
 }
 EXPORT_SYMBOL(d_move);
 
@@ -2903,7 +2905,9 @@  EXPORT_SYMBOL(d_move);
  */
 void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
 {
-	write_seqlock(&rename_lock);
+	struct super_block *sb = dentry1->d_sb;
+
+	write_seqlock(&sb->rename_lock);
 
 	WARN_ON(!dentry1->d_inode);
 	WARN_ON(!dentry2->d_inode);
@@ -2912,7 +2916,7 @@  void d_exchange(struct dentry *dentry1,
 
 	__d_move(dentry1, dentry2, true);
 
-	write_sequnlock(&rename_lock);
+	write_sequnlock(&sb->rename_lock);
 }
 
 /**
@@ -2997,6 +3001,8 @@  static int __d_unalias(struct inode *ino
  */
 struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
@@ -3012,9 +3018,9 @@  struct dentry *d_splice_alias(struct ino
 		if (unlikely(new)) {
 			/* The reference to new ensures it remains an alias */
 			spin_unlock(&inode->i_lock);
-			write_seqlock(&rename_lock);
+			write_seqlock(&sb->rename_lock);
 			if (unlikely(d_ancestor(new, dentry))) {
-				write_sequnlock(&rename_lock);
+				write_sequnlock(&sb->rename_lock);
 				dput(new);
 				new = ERR_PTR(-ELOOP);
 				pr_warn_ratelimited(
@@ -3026,7 +3032,7 @@  struct dentry *d_splice_alias(struct ino
 			} else if (!IS_ROOT(new)) {
 				struct dentry *old_parent = dget(new->d_parent);
 				int err = __d_unalias(inode, dentry, new);
-				write_sequnlock(&rename_lock);
+				write_sequnlock(&sb->rename_lock);
 				if (err) {
 					dput(new);
 					new = ERR_PTR(err);
@@ -3034,7 +3040,7 @@  struct dentry *d_splice_alias(struct ino
 				dput(old_parent);
 			} else {
 				__d_move(new, dentry, false);
-				write_sequnlock(&rename_lock);
+				write_sequnlock(&sb->rename_lock);
 			}
 			iput(inode);
 			return new;
@@ -3064,6 +3070,7 @@  EXPORT_SYMBOL(d_splice_alias);
   
 bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 {
+	struct super_block *sb = new_dentry->d_sb;
 	bool result;
 	unsigned seq;
 
@@ -3072,7 +3079,7 @@  bool is_subdir(struct dentry *new_dentry
 
 	do {
 		/* for restarting inner loop in case of seq retry */
-		seq = read_seqbegin(&rename_lock);
+		seq = read_seqbegin(&sb->rename_lock);
 		/*
 		 * Need rcu_readlock to protect against the d_parent trashing
 		 * due to d_move
@@ -3083,7 +3090,7 @@  bool is_subdir(struct dentry *new_dentry
 		else
 			result = false;
 		rcu_read_unlock();
-	} while (read_seqretry(&rename_lock, seq));
+	} while (read_seqretry(&sb->rename_lock, seq));
 
 	return result;
 }
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -51,6 +51,7 @@  int nfs_mountpoint_expiry_timeout = 500
 char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen,
 	       unsigned flags)
 {
+	struct super_block *sb = dentry->d_sb;
 	char *end;
 	int namelen;
 	unsigned seq;
@@ -61,7 +62,7 @@  char *nfs_path(char **p, struct dentry *
 	*--end = '\0';
 	buflen--;
 
-	seq = read_seqbegin(&rename_lock);
+	seq = read_seqbegin(&sb->rename_lock);
 	rcu_read_lock();
 	while (1) {
 		spin_lock(&dentry->d_lock);
@@ -77,7 +78,7 @@  char *nfs_path(char **p, struct dentry *
 		spin_unlock(&dentry->d_lock);
 		dentry = dentry->d_parent;
 	}
-	if (read_seqretry(&rename_lock, seq)) {
+	if (read_seqretry(&sb->rename_lock, seq)) {
 		spin_unlock(&dentry->d_lock);
 		rcu_read_unlock();
 		goto rename_retry;
@@ -118,7 +119,7 @@  char *nfs_path(char **p, struct dentry *
 Elong_unlock:
 	spin_unlock(&dentry->d_lock);
 	rcu_read_unlock();
-	if (read_seqretry(&rename_lock, seq))
+	if (read_seqretry(&sb->rename_lock, seq))
 		goto rename_retry;
 Elong:
 	return ERR_PTR(-ENAMETOOLONG);
--- a/fs/super.c
+++ b/fs/super.c
@@ -249,6 +249,7 @@  static struct super_block *alloc_super(s
 	spin_lock_init(&s->s_inode_list_lock);
 	INIT_LIST_HEAD(&s->s_inodes_wb);
 	spin_lock_init(&s->s_inode_wblist_lock);
+	seqlock_init(&s->rename_lock);
 
 	s->s_count = 1;
 	atomic_set(&s->s_active, 1);
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -218,8 +218,6 @@  struct dentry_operations {
 #define DCACHE_DENTRY_CURSOR		0x20000000
 #define DCACHE_NORCU			0x40000000 /* No RCU delay for freeing */
 
-extern seqlock_t rename_lock;
-
 /*
  * These are the low-level FS interfaces to the dcache..
  */
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1548,6 +1548,8 @@  struct super_block {
 
 	spinlock_t		s_inode_wblist_lock;
 	struct list_head	s_inodes_wb;	/* writeback inodes */
+
+	seqlock_t		rename_lock ____cacheline_aligned_in_smp;
 } __randomize_layout;
 
 /* Helper functions so that in most cases filesystems will
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1763,6 +1763,7 @@  static inline void handle_one(const stru
 
 static void handle_path(const struct dentry *dentry)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct audit_context *context;
 	struct audit_tree_refs *p;
 	const struct dentry *d, *parent;
@@ -1777,7 +1778,7 @@  static void handle_path(const struct den
 	drop = NULL;
 	d = dentry;
 	rcu_read_lock();
-	seq = read_seqbegin(&rename_lock);
+	seq = read_seqbegin(&sb->rename_lock);
 	for(;;) {
 		struct inode *inode = d_backing_inode(d);
 		if (inode && unlikely(inode->i_fsnotify_marks)) {
@@ -1795,7 +1796,7 @@  static void handle_path(const struct den
 			break;
 		d = parent;
 	}
-	if (unlikely(read_seqretry(&rename_lock, seq) || drop)) {  /* in this order */
+	if (unlikely(read_seqretry(&sb->rename_lock, seq) || drop)) {  /* in this order */
 		rcu_read_unlock();
 		if (!drop) {
 			/* just a race with rename */