Message ID | 20191130020225.20239-2-yukuai3@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [V2,1/3] dcache: add a new enum type for 'dentry_d_lock_class' | expand |
On Sat, Nov 30, 2019 at 10:02:23AM +0800, yu kuai wrote: > However, a single 'DENTRY_D_LOCK_NESTED' may not be enough if more than > two dentry are involed. So, add in 'DENTRY_D_LOCK_NESTED_TWICE'. No. These need meaningful names. Indeed, I think D_LOCK_NESTED is a terrible name. Looking at the calls: $ git grep -n nested.*d_lock fs fs/autofs/expire.c:82: spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); fs/dcache.c:619: spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); fs/dcache.c:1086: spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); fs/dcache.c:1303: spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); fs/dcache.c:2822: spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); fs/dcache.c:2827: spin_lock_nested(&target->d_parent->d_lock, fs/dcache.c:2830: spin_lock_nested(&dentry->d_lock, 2); fs/dcache.c:2831: spin_lock_nested(&target->d_lock, 3); fs/dcache.c:3121: spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); fs/libfs.c:112: spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); fs/libfs.c:341: spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); fs/notify/fsnotify.c:129: spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); Most of these would be well-expressed by DENTRY_D_LOCK_CHILD. The exception is __d_move() where I think we should actually name the different lock classes instead of using a bare '2' and '3'. Something like this, perhaps: diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 2866fabf497f..f604175243eb 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -79,7 +79,7 @@ static struct dentry *positive_after(struct dentry *p, struct dentry *child) child = list_first_entry(&p->d_subdirs, struct dentry, d_child); list_for_each_entry_from(child, &p->d_subdirs, d_child) { - spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_CHILD); if (simple_positive(child)) { dget_dlock(child); spin_unlock(&child->d_lock); diff --git a/fs/dcache.c b/fs/dcache.c index e88cf0554e65..c73b7d7bc785 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -616,7 +616,7 @@ static struct dentry *__lock_parent(struct dentry *dentry) } rcu_read_unlock(); if (parent != dentry) - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_CHILD); else parent = NULL; return parent; @@ -1083,7 +1083,7 @@ static bool shrink_lock_dentry(struct dentry *dentry) spin_lock(&dentry->d_lock); goto out; } - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_CHILD); if (likely(!dentry->d_lockref.count)) return true; spin_unlock(&parent->d_lock); @@ -1300,7 +1300,7 @@ static void d_walk(struct dentry *parent, void *data, if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) continue; - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_CHILD); ret = enter(data, dentry); switch (ret) { @@ -2819,16 +2819,16 @@ static void __d_move(struct dentry *dentry, struct dentry *target, } else if (!p) { /* target is not a descendent of dentry->d_parent */ spin_lock(&target->d_parent->d_lock); - spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); + spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_PARENT_2); } else { BUG_ON(p == dentry); spin_lock(&old_parent->d_lock); if (p != target) spin_lock_nested(&target->d_parent->d_lock, - DENTRY_D_LOCK_NESTED); + DENTRY_D_LOCK_PARENT_2); } - spin_lock_nested(&dentry->d_lock, 2); - spin_lock_nested(&target->d_lock, 3); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_CHILD); + spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_CHILD_2); if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; @@ -2837,7 +2837,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, } write_seqcount_begin(&dentry->d_seq); - write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); + write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_CHILD); /* unhash both */ if (!d_unhashed(dentry)) @@ -3118,7 +3118,7 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode) !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_CHILD); dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); diff --git a/fs/libfs.c b/fs/libfs.c index 1463b038ffc4..c68dedbf4ad2 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -109,7 +109,7 @@ static struct dentry *scan_positives(struct dentry *cursor, if (d->d_flags & DCACHE_DENTRY_CURSOR) continue; if (simple_positive(d) && !--count) { - spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_CHILD); if (simple_positive(d)) found = dget_dlock(d); spin_unlock(&d->d_lock); @@ -336,9 +336,9 @@ int simple_empty(struct dentry *dentry) struct dentry *child; int ret = 0; - spin_lock(&dentry->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_PARENT_2); list_for_each_entry(child, &dentry->d_subdirs, d_child) { - spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_CHILD_2); if (simple_positive(child)) { spin_unlock(&child->d_lock); goto out; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 2ecef6155fc0..23492f2e4915 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -126,7 +126,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) if (!child->d_inode) continue; - spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_CHILD); if (watched) child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 10090f11ab95..6a497c63bd38 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -121,15 +121,20 @@ struct dentry { } __randomize_layout; /* - * dentry->d_lock spinlock nesting subclasses: + * dentry->d_lock spinlock nesting subclasses. Always taken in increasing + * order although some subclasses may be skipped. * * 0: normal - * 1: nested + * 1: either a descendent of "normal" or a cousin. + * 2: child of the "normal" dentry + * 3: child of the "parent2" dentry */ enum dentry_d_lock_class { - DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */ - DENTRY_D_LOCK_NESTED + DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs */ + DENTRY_D_LOCK_PARENT_2, /* not an ancestor of parent */ + DENTRY_D_LOCK_CHILD, /* nests under parent's lock */ + DENTRY_D_LOCK_CHILD_2, /* PARENT_2's child */ }; struct dentry_operations {
On 2019/11/30 11:43, Matthew Wilcox wrote: > On Sat, Nov 30, 2019 at 10:02:23AM +0800, yu kuai wrote: >> However, a single 'DENTRY_D_LOCK_NESTED' may not be enough if more than >> two dentry are involed. So, add in 'DENTRY_D_LOCK_NESTED_TWICE'. > > No. These need meaningful names. Indeed, I think D_LOCK_NESTED is > a terrible name. > > Looking at the calls: > > $ git grep -n nested.*d_lock fs > fs/autofs/expire.c:82: spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); > fs/dcache.c:619: spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); > fs/dcache.c:1086: spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); > fs/dcache.c:1303: spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); > fs/dcache.c:2822: spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); > fs/dcache.c:2827: spin_lock_nested(&target->d_parent->d_lock, > fs/dcache.c:2830: spin_lock_nested(&dentry->d_lock, 2); > fs/dcache.c:2831: spin_lock_nested(&target->d_lock, 3); > fs/dcache.c:3121: spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); > fs/libfs.c:112: spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); > fs/libfs.c:341: spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); > fs/notify/fsnotify.c:129: spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); > > Most of these would be well-expressed by DENTRY_D_LOCK_CHILD. > > The exception is __d_move() where I think we should actually name the > different lock classes instead of using a bare '2' and '3'. Something > like this, perhaps: > Thanks for looking into this, do you mind if I replace your patch with the first two patches in the patchset? Yu Kuai
On Sat, Nov 30, 2019 at 03:53:10PM +0800, yukuai (C) wrote: > On 2019/11/30 11:43, Matthew Wilcox wrote: > > On Sat, Nov 30, 2019 at 10:02:23AM +0800, yu kuai wrote: > > > However, a single 'DENTRY_D_LOCK_NESTED' may not be enough if more than > > > two dentry are involed. So, add in 'DENTRY_D_LOCK_NESTED_TWICE'. > > > > No. These need meaningful names. Indeed, I think D_LOCK_NESTED is > > a terrible name. > > > > The exception is __d_move() where I think we should actually name the > > different lock classes instead of using a bare '2' and '3'. Something > > like this, perhaps: > > Thanks for looking into this, do you mind if I replace your patch with the > first two patches in the patchset? That's fine by me, but I think we should wait for Al to give his approval before submitting a new version. I'm also not entirely content with the explanation I wrote last night. Maybe this instead ... /* - * dentry->d_lock spinlock nesting subclasses: + * dentry->d_lock spinlock nesting subclasses. Always taken in increasing + * order although some subclasses may be skipped. If one dentry is the + * ancestor of another, then the ancestor's d_lock is taken before the + * descendent. If NORMAL and PARENT_2 do not have a hierarchical relationship + * then you must hold the s_vfs_rename_mutex to prevent another thread taking + * the locks in the opposite order, or NORMAL and PARENT_2 becoming + * hierarchical through a rename operation. * * 0: normal - * 1: nested + * 1: either a descendent of "normal" or a cousin. + * 2: child of the "normal" dentry + * 3: child of the "parent2" dentry */ enum dentry_d_lock_class { - DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */ - DENTRY_D_LOCK_NESTED + DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs */ + DENTRY_D_LOCK_PARENT_2, /* not an ancestor of normal */ + DENTRY_D_LOCK_CHILD, /* nests under parent's lock */ + DENTRY_D_LOCK_CHILD_2, /* PARENT_2's child */ };
On Sat, Nov 30, 2019 at 11:36:15AM -0800, Matthew Wilcox wrote: > On Sat, Nov 30, 2019 at 03:53:10PM +0800, yukuai (C) wrote: > > On 2019/11/30 11:43, Matthew Wilcox wrote: > > > On Sat, Nov 30, 2019 at 10:02:23AM +0800, yu kuai wrote: > > > > However, a single 'DENTRY_D_LOCK_NESTED' may not be enough if more than > > > > two dentry are involed. So, add in 'DENTRY_D_LOCK_NESTED_TWICE'. > > > > > > No. These need meaningful names. Indeed, I think D_LOCK_NESTED is > > > a terrible name. > > > > > > The exception is __d_move() where I think we should actually name the > > > different lock classes instead of using a bare '2' and '3'. Something > > > like this, perhaps: > > > > Thanks for looking into this, do you mind if I replace your patch with the > > first two patches in the patchset? > > That's fine by me, but I think we should wait for Al to give his approval > before submitting a new version. IMO this is a wrong approach. It's papering over a confused code in debugfs recursive removal and it would be better to get rid of _that_, rather than try and slap bandaids on it. I suspect that the following would be a better way to deal with it; it adds a new primitive and converts debugfs and tracefs to that. There are followups converting other such places, still not finished. commit 7e9c8a08889bf42bbe64e80e456d2eca824e5db2 Author: Al Viro <viro@zeniv.linux.org.uk> Date: Mon Nov 18 09:43:10 2019 -0500 simple_recursive_removal(): kernel-side rm -rf for ramfs-style filesystems two requirements: no file creations in IS_DEADDIR and no cross-directory renames whatsoever. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 042b688ed124..da936c54d879 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -309,7 +309,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) parent = debugfs_mount->mnt_root; inode_lock(d_inode(parent)); - dentry = lookup_one_len(name, parent, strlen(name)); + if (unlikely(IS_DEADDIR(d_inode(parent)))) + dentry = ERR_PTR(-ENOENT); + else + dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { if (d_is_dir(dentry)) pr_err("Directory '%s' with parent '%s' already present!\n", @@ -657,62 +660,15 @@ static void __debugfs_file_removed(struct dentry *dentry) wait_for_completion(&fsd->active_users_drained); } -static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) -{ - int ret = 0; - - if (simple_positive(dentry)) { - dget(dentry); - if (d_is_dir(dentry)) { - ret = simple_rmdir(d_inode(parent), dentry); - if (!ret) - fsnotify_rmdir(d_inode(parent), dentry); - } else { - simple_unlink(d_inode(parent), dentry); - fsnotify_unlink(d_inode(parent), dentry); - } - if (!ret) - d_delete(dentry); - if (d_is_reg(dentry)) - __debugfs_file_removed(dentry); - dput(dentry); - } - return ret; -} - -/** - * debugfs_remove - removes a file or directory from the debugfs filesystem - * @dentry: a pointer to a the dentry of the file or directory to be - * removed. If this parameter is NULL or an error value, nothing - * will be done. - * - * This function removes a file or directory in debugfs that was previously - * created with a call to another debugfs function (like - * debugfs_create_file() or variants thereof.) - * - * This function is required to be called in order for the file to be - * removed, no automatic cleanup of files will happen when a module is - * removed, you are responsible here. - */ -void debugfs_remove(struct dentry *dentry) +static void remove_one(struct dentry *victim) { - struct dentry *parent; - int ret; - - if (IS_ERR_OR_NULL(dentry)) - return; - - parent = dentry->d_parent; - inode_lock(d_inode(parent)); - ret = __debugfs_remove(dentry, parent); - inode_unlock(d_inode(parent)); - if (!ret) - simple_release_fs(&debugfs_mount, &debugfs_mount_count); + if (d_is_reg(victim)) + __debugfs_file_removed(victim); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } -EXPORT_SYMBOL_GPL(debugfs_remove); /** - * debugfs_remove_recursive - recursively removes a directory + * debugfs_remove - recursively removes a directory * @dentry: a pointer to a the dentry of the directory to be removed. If this * parameter is NULL or an error value, nothing will be done. * @@ -724,65 +680,16 @@ EXPORT_SYMBOL_GPL(debugfs_remove); * removed, no automatic cleanup of files will happen when a module is * removed, you are responsible here. */ -void debugfs_remove_recursive(struct dentry *dentry) +void debugfs_remove(struct dentry *dentry) { - struct dentry *child, *parent; - if (IS_ERR_OR_NULL(dentry)) return; - parent = dentry; - down: - inode_lock(d_inode(parent)); - loop: - /* - * The parent->d_subdirs is protected by the d_lock. Outside that - * lock, the child can be unlinked and set to be freed which can - * use the d_u.d_child as the rcu head and corrupt this list. - */ - spin_lock(&parent->d_lock); - list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!simple_positive(child)) - continue; - - /* perhaps simple_empty(child) makes more sense */ - if (!list_empty(&child->d_subdirs)) { - spin_unlock(&parent->d_lock); - inode_unlock(d_inode(parent)); - parent = child; - goto down; - } - - spin_unlock(&parent->d_lock); - - if (!__debugfs_remove(child, parent)) - simple_release_fs(&debugfs_mount, &debugfs_mount_count); - - /* - * The parent->d_lock protects agaist child from unlinking - * from d_subdirs. When releasing the parent->d_lock we can - * no longer trust that the next pointer is valid. - * Restart the loop. We'll skip this one with the - * simple_positive() check. - */ - goto loop; - } - spin_unlock(&parent->d_lock); - - inode_unlock(d_inode(parent)); - child = parent; - parent = parent->d_parent; - inode_lock(d_inode(parent)); - - if (child != dentry) - /* go up */ - goto loop; - - if (!__debugfs_remove(child, parent)) - simple_release_fs(&debugfs_mount, &debugfs_mount_count); - inode_unlock(d_inode(parent)); + simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); + simple_recursive_removal(dentry, remove_one); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } -EXPORT_SYMBOL_GPL(debugfs_remove_recursive); +EXPORT_SYMBOL_GPL(debugfs_remove); /** * debugfs_rename - rename a file/directory in the debugfs filesystem diff --git a/fs/libfs.c b/fs/libfs.c index 540611b99b9a..b67003a948ed 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -19,6 +19,7 @@ #include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <linux/fs_context.h> #include <linux/pseudo_fs.h> +#include <linux/fsnotify.h> #include <linux/uaccess.h> @@ -239,6 +240,75 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); +static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) +{ + struct dentry *child = NULL; + struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs; + + spin_lock(&parent->d_lock); + while ((p = p->next) != &parent->d_subdirs) { + struct dentry *d = container_of(p, struct dentry, d_child); + if (simple_positive(d)) { + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(d)) + child = dget_dlock(d); + spin_unlock(&d->d_lock); + if (likely(child)) + break; + } + } + spin_unlock(&parent->d_lock); + dput(prev); + return child; +} + +void simple_recursive_removal(struct dentry *dentry, + void (*callback)(struct dentry *)) +{ + struct dentry *this = dentry; + while (true) { + struct dentry *victim = NULL, *child; + struct inode *inode = this->d_inode; + + inode_lock(inode); + if (d_is_dir(this)) + inode->i_flags |= S_DEAD; + while ((child = find_next_child(this, victim)) == NULL) { + // kill and ascend + // update metadata while it's still locked + inode->i_ctime = current_time(inode); + clear_nlink(inode); + inode_unlock(inode); + victim = this; + this = this->d_parent; + inode = this->d_inode; + inode_lock(inode); + if (simple_positive(victim)) { + d_invalidate(victim); // avoid lost mounts + if (d_is_dir(victim)) + fsnotify_rmdir(inode, victim); + else + fsnotify_unlink(inode, victim); + if (callback) + callback(victim); + dput(victim); // unpin it + } + if (victim == dentry) { + inode->i_ctime = inode->i_mtime = + current_time(inode); + if (d_is_dir(dentry)) + drop_nlink(inode); + inode_unlock(inode); + dput(dentry); + return; + } + } + inode_unlock(inode); + this = child; + } +} +EXPORT_SYMBOL(simple_recursive_removal); + static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index eeeae0475da9..2a16c0eb97e4 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -329,7 +329,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) parent = tracefs_mount->mnt_root; inode_lock(parent->d_inode); - dentry = lookup_one_len(name, parent, strlen(name)); + if (unlikely(IS_DEADDIR(parent->d_inode))) + dentry = ERR_PTR(-ENOENT); + else + dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && dentry->d_inode) { dput(dentry); dentry = ERR_PTR(-EEXIST); @@ -495,122 +498,27 @@ __init struct dentry *tracefs_create_instance_dir(const char *name, return dentry; } -static int __tracefs_remove(struct dentry *dentry, struct dentry *parent) +static void remove_one(struct dentry *victim) { - int ret = 0; - - if (simple_positive(dentry)) { - if (dentry->d_inode) { - dget(dentry); - switch (dentry->d_inode->i_mode & S_IFMT) { - case S_IFDIR: - ret = simple_rmdir(parent->d_inode, dentry); - if (!ret) - fsnotify_rmdir(parent->d_inode, dentry); - break; - default: - simple_unlink(parent->d_inode, dentry); - fsnotify_unlink(parent->d_inode, dentry); - break; - } - if (!ret) - d_delete(dentry); - dput(dentry); - } - } - return ret; -} - -/** - * tracefs_remove - removes a file or directory from the tracefs filesystem - * @dentry: a pointer to a the dentry of the file or directory to be - * removed. - * - * This function removes a file or directory in tracefs that was previously - * created with a call to another tracefs function (like - * tracefs_create_file() or variants thereof.) - */ -void tracefs_remove(struct dentry *dentry) -{ - struct dentry *parent; - int ret; - - if (IS_ERR_OR_NULL(dentry)) - return; - - parent = dentry->d_parent; - inode_lock(parent->d_inode); - ret = __tracefs_remove(dentry, parent); - inode_unlock(parent->d_inode); - if (!ret) - simple_release_fs(&tracefs_mount, &tracefs_mount_count); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); } /** - * tracefs_remove_recursive - recursively removes a directory + * tracefs_remove - recursively removes a directory * @dentry: a pointer to a the dentry of the directory to be removed. * * This function recursively removes a directory tree in tracefs that * was previously created with a call to another tracefs function * (like tracefs_create_file() or variants thereof.) */ -void tracefs_remove_recursive(struct dentry *dentry) +void tracefs_remove(struct dentry *dentry) { - struct dentry *child, *parent; - if (IS_ERR_OR_NULL(dentry)) return; - parent = dentry; - down: - inode_lock(parent->d_inode); - loop: - /* - * The parent->d_subdirs is protected by the d_lock. Outside that - * lock, the child can be unlinked and set to be freed which can - * use the d_u.d_child as the rcu head and corrupt this list. - */ - spin_lock(&parent->d_lock); - list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!simple_positive(child)) - continue; - - /* perhaps simple_empty(child) makes more sense */ - if (!list_empty(&child->d_subdirs)) { - spin_unlock(&parent->d_lock); - inode_unlock(parent->d_inode); - parent = child; - goto down; - } - - spin_unlock(&parent->d_lock); - - if (!__tracefs_remove(child, parent)) - simple_release_fs(&tracefs_mount, &tracefs_mount_count); - - /* - * The parent->d_lock protects agaist child from unlinking - * from d_subdirs. When releasing the parent->d_lock we can - * no longer trust that the next pointer is valid. - * Restart the loop. We'll skip this one with the - * simple_positive() check. - */ - goto loop; - } - spin_unlock(&parent->d_lock); - - inode_unlock(parent->d_inode); - child = parent; - parent = parent->d_parent; - inode_lock(parent->d_inode); - - if (child != dentry) - /* go up */ - goto loop; - - if (!__tracefs_remove(child, parent)) - simple_release_fs(&tracefs_mount, &tracefs_mount_count); - inode_unlock(parent->d_inode); + simple_pin_fs(&trace_fs_type, &tracefs_mount, &tracefs_mount_count); + simple_recursive_removal(dentry, remove_one); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); } /** diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 58424eb3b329..0a817d763f0f 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -82,7 +82,7 @@ struct dentry *debugfs_create_automount(const char *name, void *data); void debugfs_remove(struct dentry *dentry); -void debugfs_remove_recursive(struct dentry *dentry); +#define debugfs_remove_recursive debugfs_remove const struct file_operations *debugfs_real_fops(const struct file *filp); diff --git a/include/linux/fs.h b/include/linux/fs.h index 997a530ff4e9..73ffc8654987 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3242,6 +3242,8 @@ extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); +extern void simple_recursive_removal(struct dentry *, + void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); extern int noop_set_page_dirty(struct page *page); extern void noop_invalidatepage(struct page *page, unsigned int offset, diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 88d279c1b863..99912445974c 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -28,7 +28,6 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *tracefs_create_dir(const char *name, struct dentry *parent); void tracefs_remove(struct dentry *dentry); -void tracefs_remove_recursive(struct dentry *dentry); struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent, int (*mkdir)(const char *name), diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 563e80f9006a..88d94dc3ed37 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8366,7 +8366,7 @@ struct trace_array *trace_array_create(const char *name) ret = event_trace_add_tracer(tr->dir, tr); if (ret) { - tracefs_remove_recursive(tr->dir); + tracefs_remove(tr->dir); goto out_free_tr; } @@ -8422,7 +8422,7 @@ static int __remove_instance(struct trace_array *tr) event_trace_del_tracer(tr); ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); - tracefs_remove_recursive(tr->dir); + tracefs_remove(tr->dir); free_trace_buffers(tr); for (i = 0; i < tr->nr_topts; i++) { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 648930823b57..25bb3e8fb170 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -696,7 +696,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) return; if (!--dir->nr_events) { - tracefs_remove_recursive(dir->entry); + tracefs_remove(dir->entry); list_del(&dir->list); __put_system_dir(dir); } @@ -715,7 +715,7 @@ static void remove_event_file_dir(struct trace_event_file *file) } spin_unlock(&dir->d_lock); - tracefs_remove_recursive(dir); + tracefs_remove(dir); } list_del(&file->list); @@ -3032,7 +3032,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - tracefs_remove_recursive(tr->event_dir); + tracefs_remove(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index fa95139445b2..fa45a031848c 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -551,7 +551,7 @@ static int init_tracefs(void) return 0; err: - tracefs_remove_recursive(top_dir); + tracefs_remove(top_dir); return -ENOMEM; }
On 08.12.19 20:11, Al Viro wrote: > On Sat, Nov 30, 2019 at 11:36:15AM -0800, Matthew Wilcox wrote: >> On Sat, Nov 30, 2019 at 03:53:10PM +0800, yukuai (C) wrote: >>> On 2019/11/30 11:43, Matthew Wilcox wrote: >>>> On Sat, Nov 30, 2019 at 10:02:23AM +0800, yu kuai wrote: >>>>> However, a single 'DENTRY_D_LOCK_NESTED' may not be enough if more than >>>>> two dentry are involed. So, add in 'DENTRY_D_LOCK_NESTED_TWICE'. >>>> >>>> No. These need meaningful names. Indeed, I think D_LOCK_NESTED is >>>> a terrible name. >>>> >>>> The exception is __d_move() where I think we should actually name the >>>> different lock classes instead of using a bare '2' and '3'. Something >>>> like this, perhaps: >>> >>> Thanks for looking into this, do you mind if I replace your patch with the >>> first two patches in the patchset? >> >> That's fine by me, but I think we should wait for Al to give his approval >> before submitting a new version. > > IMO this is a wrong approach. It's papering over a confused code in > debugfs recursive removal and it would be better to get rid of _that_, > rather than try and slap bandaids on it. > > I suspect that the following would be a better way to deal with it; it adds > a new primitive and converts debugfs and tracefs to that. There are > followups converting other such places, still not finished. > > commit 7e9c8a08889bf42bbe64e80e456d2eca824e5db2 > Author: Al Viro <viro@zeniv.linux.org.uk> > Date: Mon Nov 18 09:43:10 2019 -0500 > > simple_recursive_removal(): kernel-side rm -rf for ramfs-style filesystems > > two requirements: no file creations in IS_DEADDIR and no cross-directory > renames whatsoever. > > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> > > diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c > index 042b688ed124..da936c54d879 100644 > --- a/fs/debugfs/inode.c > +++ b/fs/debugfs/inode.c > @@ -309,7 +309,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) > parent = debugfs_mount->mnt_root; > > inode_lock(d_inode(parent)); > - dentry = lookup_one_len(name, parent, strlen(name)); > + if (unlikely(IS_DEADDIR(d_inode(parent)))) > + dentry = ERR_PTR(-ENOENT); > + else > + dentry = lookup_one_len(name, parent, strlen(name)); > if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { > if (d_is_dir(dentry)) > pr_err("Directory '%s' with parent '%s' already present!\n", > @@ -657,62 +660,15 @@ static void __debugfs_file_removed(struct dentry *dentry) > wait_for_completion(&fsd->active_users_drained); > } > > -static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) > -{ > - int ret = 0; > - > - if (simple_positive(dentry)) { > - dget(dentry); > - if (d_is_dir(dentry)) { > - ret = simple_rmdir(d_inode(parent), dentry); > - if (!ret) > - fsnotify_rmdir(d_inode(parent), dentry); > - } else { > - simple_unlink(d_inode(parent), dentry); > - fsnotify_unlink(d_inode(parent), dentry); > - } > - if (!ret) > - d_delete(dentry); > - if (d_is_reg(dentry)) > - __debugfs_file_removed(dentry); > - dput(dentry); > - } > - return ret; > -} > - > -/** > - * debugfs_remove - removes a file or directory from the debugfs filesystem > - * @dentry: a pointer to a the dentry of the file or directory to be > - * removed. If this parameter is NULL or an error value, nothing > - * will be done. > - * > - * This function removes a file or directory in debugfs that was previously > - * created with a call to another debugfs function (like > - * debugfs_create_file() or variants thereof.) > - * > - * This function is required to be called in order for the file to be > - * removed, no automatic cleanup of files will happen when a module is > - * removed, you are responsible here. > - */ > -void debugfs_remove(struct dentry *dentry) > +static void remove_one(struct dentry *victim) > { > - struct dentry *parent; > - int ret; > - > - if (IS_ERR_OR_NULL(dentry)) > - return; > - > - parent = dentry->d_parent; > - inode_lock(d_inode(parent)); > - ret = __debugfs_remove(dentry, parent); > - inode_unlock(d_inode(parent)); > - if (!ret) > - simple_release_fs(&debugfs_mount, &debugfs_mount_count); > + if (d_is_reg(victim)) > + __debugfs_file_removed(victim); > + simple_release_fs(&debugfs_mount, &debugfs_mount_count); > } > -EXPORT_SYMBOL_GPL(debugfs_remove); > > /** > - * debugfs_remove_recursive - recursively removes a directory > + * debugfs_remove - recursively removes a directory > * @dentry: a pointer to a the dentry of the directory to be removed. If this > * parameter is NULL or an error value, nothing will be done. > * > @@ -724,65 +680,16 @@ EXPORT_SYMBOL_GPL(debugfs_remove); > * removed, no automatic cleanup of files will happen when a module is > * removed, you are responsible here. > */ > -void debugfs_remove_recursive(struct dentry *dentry) > +void debugfs_remove(struct dentry *dentry) > { > - struct dentry *child, *parent; > - > if (IS_ERR_OR_NULL(dentry)) > return; > > - parent = dentry; > - down: > - inode_lock(d_inode(parent)); > - loop: > - /* > - * The parent->d_subdirs is protected by the d_lock. Outside that > - * lock, the child can be unlinked and set to be freed which can > - * use the d_u.d_child as the rcu head and corrupt this list. > - */ > - spin_lock(&parent->d_lock); > - list_for_each_entry(child, &parent->d_subdirs, d_child) { > - if (!simple_positive(child)) > - continue; > - > - /* perhaps simple_empty(child) makes more sense */ > - if (!list_empty(&child->d_subdirs)) { > - spin_unlock(&parent->d_lock); > - inode_unlock(d_inode(parent)); > - parent = child; > - goto down; > - } > - > - spin_unlock(&parent->d_lock); > - > - if (!__debugfs_remove(child, parent)) > - simple_release_fs(&debugfs_mount, &debugfs_mount_count); > - > - /* > - * The parent->d_lock protects agaist child from unlinking > - * from d_subdirs. When releasing the parent->d_lock we can > - * no longer trust that the next pointer is valid. > - * Restart the loop. We'll skip this one with the > - * simple_positive() check. > - */ > - goto loop; > - } > - spin_unlock(&parent->d_lock); > - > - inode_unlock(d_inode(parent)); > - child = parent; > - parent = parent->d_parent; > - inode_lock(d_inode(parent)); > - > - if (child != dentry) > - /* go up */ > - goto loop; > - > - if (!__debugfs_remove(child, parent)) > - simple_release_fs(&debugfs_mount, &debugfs_mount_count); > - inode_unlock(d_inode(parent)); > + simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); > + simple_recursive_removal(dentry, remove_one); > + simple_release_fs(&debugfs_mount, &debugfs_mount_count); > } > -EXPORT_SYMBOL_GPL(debugfs_remove_recursive); > +EXPORT_SYMBOL_GPL(debugfs_remove); > > /** > * debugfs_rename - rename a file/directory in the debugfs filesystem > diff --git a/fs/libfs.c b/fs/libfs.c > index 540611b99b9a..b67003a948ed 100644 > --- a/fs/libfs.c > +++ b/fs/libfs.c > @@ -19,6 +19,7 @@ > #include <linux/buffer_head.h> /* sync_mapping_buffers */ > #include <linux/fs_context.h> > #include <linux/pseudo_fs.h> > +#include <linux/fsnotify.h> > > #include <linux/uaccess.h> > > @@ -239,6 +240,75 @@ const struct inode_operations simple_dir_inode_operations = { > }; > EXPORT_SYMBOL(simple_dir_inode_operations); > > +static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) > +{ > + struct dentry *child = NULL; > + struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs; > + > + spin_lock(&parent->d_lock); > + while ((p = p->next) != &parent->d_subdirs) { > + struct dentry *d = container_of(p, struct dentry, d_child); > + if (simple_positive(d)) { > + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); > + if (simple_positive(d)) > + child = dget_dlock(d); > + spin_unlock(&d->d_lock); > + if (likely(child)) > + break; > + } > + } > + spin_unlock(&parent->d_lock); > + dput(prev); > + return child; > +} > + > +void simple_recursive_removal(struct dentry *dentry, > + void (*callback)(struct dentry *)) > +{ > + struct dentry *this = dentry; > + while (true) { > + struct dentry *victim = NULL, *child; > + struct inode *inode = this->d_inode; > + > + inode_lock(inode); > + if (d_is_dir(this)) > + inode->i_flags |= S_DEAD; > + while ((child = find_next_child(this, victim)) == NULL) { > + // kill and ascend > + // update metadata while it's still locked > + inode->i_ctime = current_time(inode); > + clear_nlink(inode); > + inode_unlock(inode); > + victim = this; > + this = this->d_parent; > + inode = this->d_inode; > + inode_lock(inode); > + if (simple_positive(victim)) { > + d_invalidate(victim); // avoid lost mounts > + if (d_is_dir(victim)) > + fsnotify_rmdir(inode, victim); > + else > + fsnotify_unlink(inode, victim); > + if (callback) > + callback(victim); > + dput(victim); // unpin it > + } > + if (victim == dentry) { > + inode->i_ctime = inode->i_mtime = > + current_time(inode); > + if (d_is_dir(dentry)) > + drop_nlink(inode); > + inode_unlock(inode); > + dput(dentry); > + return; > + } > + } > + inode_unlock(inode); > + this = child; > + } > +} > +EXPORT_SYMBOL(simple_recursive_removal); > + > static const struct super_operations simple_super_operations = { > .statfs = simple_statfs, > }; > diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c > index eeeae0475da9..2a16c0eb97e4 100644 > --- a/fs/tracefs/inode.c > +++ b/fs/tracefs/inode.c > @@ -329,7 +329,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) > parent = tracefs_mount->mnt_root; > > inode_lock(parent->d_inode); > - dentry = lookup_one_len(name, parent, strlen(name)); > + if (unlikely(IS_DEADDIR(parent->d_inode))) > + dentry = ERR_PTR(-ENOENT); > + else > + dentry = lookup_one_len(name, parent, strlen(name)); > if (!IS_ERR(dentry) && dentry->d_inode) { > dput(dentry); > dentry = ERR_PTR(-EEXIST); > @@ -495,122 +498,27 @@ __init struct dentry *tracefs_create_instance_dir(const char *name, > return dentry; > } > > -static int __tracefs_remove(struct dentry *dentry, struct dentry *parent) > +static void remove_one(struct dentry *victim) > { > - int ret = 0; > - > - if (simple_positive(dentry)) { > - if (dentry->d_inode) { > - dget(dentry); > - switch (dentry->d_inode->i_mode & S_IFMT) { > - case S_IFDIR: > - ret = simple_rmdir(parent->d_inode, dentry); > - if (!ret) > - fsnotify_rmdir(parent->d_inode, dentry); > - break; > - default: > - simple_unlink(parent->d_inode, dentry); > - fsnotify_unlink(parent->d_inode, dentry); > - break; > - } > - if (!ret) > - d_delete(dentry); > - dput(dentry); > - } > - } > - return ret; > -} > - > -/** > - * tracefs_remove - removes a file or directory from the tracefs filesystem > - * @dentry: a pointer to a the dentry of the file or directory to be > - * removed. > - * > - * This function removes a file or directory in tracefs that was previously > - * created with a call to another tracefs function (like > - * tracefs_create_file() or variants thereof.) > - */ > -void tracefs_remove(struct dentry *dentry) > -{ > - struct dentry *parent; > - int ret; > - > - if (IS_ERR_OR_NULL(dentry)) > - return; > - > - parent = dentry->d_parent; > - inode_lock(parent->d_inode); > - ret = __tracefs_remove(dentry, parent); > - inode_unlock(parent->d_inode); > - if (!ret) > - simple_release_fs(&tracefs_mount, &tracefs_mount_count); > + simple_release_fs(&tracefs_mount, &tracefs_mount_count); > } > > /** > - * tracefs_remove_recursive - recursively removes a directory > + * tracefs_remove - recursively removes a directory > * @dentry: a pointer to a the dentry of the directory to be removed. > * > * This function recursively removes a directory tree in tracefs that > * was previously created with a call to another tracefs function > * (like tracefs_create_file() or variants thereof.) > */ > -void tracefs_remove_recursive(struct dentry *dentry) > +void tracefs_remove(struct dentry *dentry) > { > - struct dentry *child, *parent; > - > if (IS_ERR_OR_NULL(dentry)) > return; > > - parent = dentry; > - down: > - inode_lock(parent->d_inode); > - loop: > - /* > - * The parent->d_subdirs is protected by the d_lock. Outside that > - * lock, the child can be unlinked and set to be freed which can > - * use the d_u.d_child as the rcu head and corrupt this list. > - */ > - spin_lock(&parent->d_lock); > - list_for_each_entry(child, &parent->d_subdirs, d_child) { > - if (!simple_positive(child)) > - continue; > - > - /* perhaps simple_empty(child) makes more sense */ > - if (!list_empty(&child->d_subdirs)) { > - spin_unlock(&parent->d_lock); > - inode_unlock(parent->d_inode); > - parent = child; > - goto down; > - } > - > - spin_unlock(&parent->d_lock); > - > - if (!__tracefs_remove(child, parent)) > - simple_release_fs(&tracefs_mount, &tracefs_mount_count); > - > - /* > - * The parent->d_lock protects agaist child from unlinking > - * from d_subdirs. When releasing the parent->d_lock we can > - * no longer trust that the next pointer is valid. > - * Restart the loop. We'll skip this one with the > - * simple_positive() check. > - */ > - goto loop; > - } > - spin_unlock(&parent->d_lock); > - > - inode_unlock(parent->d_inode); > - child = parent; > - parent = parent->d_parent; > - inode_lock(parent->d_inode); > - > - if (child != dentry) > - /* go up */ > - goto loop; > - > - if (!__tracefs_remove(child, parent)) > - simple_release_fs(&tracefs_mount, &tracefs_mount_count); > - inode_unlock(parent->d_inode); > + simple_pin_fs(&trace_fs_type, &tracefs_mount, &tracefs_mount_count); > + simple_recursive_removal(dentry, remove_one); > + simple_release_fs(&tracefs_mount, &tracefs_mount_count); > } > > /** > diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h > index 58424eb3b329..0a817d763f0f 100644 > --- a/include/linux/debugfs.h > +++ b/include/linux/debugfs.h > @@ -82,7 +82,7 @@ struct dentry *debugfs_create_automount(const char *name, > void *data); > > void debugfs_remove(struct dentry *dentry); > -void debugfs_remove_recursive(struct dentry *dentry); > +#define debugfs_remove_recursive debugfs_remove > > const struct file_operations *debugfs_real_fops(const struct file *filp); > > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 997a530ff4e9..73ffc8654987 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -3242,6 +3242,8 @@ extern int simple_unlink(struct inode *, struct dentry *); > extern int simple_rmdir(struct inode *, struct dentry *); > extern int simple_rename(struct inode *, struct dentry *, > struct inode *, struct dentry *, unsigned int); > +extern void simple_recursive_removal(struct dentry *, > + void (*callback)(struct dentry *)); > extern int noop_fsync(struct file *, loff_t, loff_t, int); > extern int noop_set_page_dirty(struct page *page); > extern void noop_invalidatepage(struct page *page, unsigned int offset, > diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h > index 88d279c1b863..99912445974c 100644 > --- a/include/linux/tracefs.h > +++ b/include/linux/tracefs.h > @@ -28,7 +28,6 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, > struct dentry *tracefs_create_dir(const char *name, struct dentry *parent); > > void tracefs_remove(struct dentry *dentry); > -void tracefs_remove_recursive(struct dentry *dentry); > > struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent, > int (*mkdir)(const char *name), > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c > index 563e80f9006a..88d94dc3ed37 100644 > --- a/kernel/trace/trace.c > +++ b/kernel/trace/trace.c > @@ -8366,7 +8366,7 @@ struct trace_array *trace_array_create(const char *name) > > ret = event_trace_add_tracer(tr->dir, tr); > if (ret) { > - tracefs_remove_recursive(tr->dir); > + tracefs_remove(tr->dir); > goto out_free_tr; > } > > @@ -8422,7 +8422,7 @@ static int __remove_instance(struct trace_array *tr) > event_trace_del_tracer(tr); > ftrace_clear_pids(tr); > ftrace_destroy_function_files(tr); > - tracefs_remove_recursive(tr->dir); > + tracefs_remove(tr->dir); > free_trace_buffers(tr); > > for (i = 0; i < tr->nr_topts; i++) { > diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c > index 648930823b57..25bb3e8fb170 100644 > --- a/kernel/trace/trace_events.c > +++ b/kernel/trace/trace_events.c > @@ -696,7 +696,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) > return; > > if (!--dir->nr_events) { > - tracefs_remove_recursive(dir->entry); > + tracefs_remove(dir->entry); > list_del(&dir->list); > __put_system_dir(dir); > } > @@ -715,7 +715,7 @@ static void remove_event_file_dir(struct trace_event_file *file) > } > spin_unlock(&dir->d_lock); > > - tracefs_remove_recursive(dir); > + tracefs_remove(dir); > } > > list_del(&file->list); > @@ -3032,7 +3032,7 @@ int event_trace_del_tracer(struct trace_array *tr) > > down_write(&trace_event_sem); > __trace_remove_event_dirs(tr); > - tracefs_remove_recursive(tr->event_dir); > + tracefs_remove(tr->event_dir); > up_write(&trace_event_sem); > > tr->event_dir = NULL; > diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c > index fa95139445b2..fa45a031848c 100644 > --- a/kernel/trace/trace_hwlat.c > +++ b/kernel/trace/trace_hwlat.c > @@ -551,7 +551,7 @@ static int init_tracefs(void) > return 0; > > err: > - tracefs_remove_recursive(top_dir); > + tracefs_remove(top_dir); > return -ENOMEM; > } > > The patch in linux-next commit 653f0d05be0948e7610bb786e6570bb6c48a4e75 (HEAD, refs/bisect/bad) Author: Al Viro <viro@zeniv.linux.org.uk> Date: Mon Nov 18 09:43:10 2019 -0500 simple_recursive_removal(): kernel-side rm -rf for ramfs-style filesystems two requirements: no file creations in IS_DEADDIR and no cross-directory renames whatsoever. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Makes my simple QEMU setup crash when booting [ 4.571181] list_del corruption. prev->next should be ffff8b75df3408d0, but was ffff8b75df340d50 [ 4.572064] ------------[ cut here ]------------ [ 4.572448] kernel BUG at lib/list_debug.c:51! [ 4.572838] invalid opcode: 0000 [#1] SMP NOPTI [ 4.573235] CPU: 0 PID: 479 Comm: systemd-udevd Not tainted 5.5.0-rc1+ #14 [ 4.573827] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu4 [ 4.574782] RIP: 0010:__list_del_entry_valid.cold+0x31/0x55 [ 4.575252] Code: 0d 3d a8 e8 14 dd bd ff 0f 0b 48 c7 c7 00 0e 3d a8 e8 06 dd bd ff 0f 0b 48 89 f2 48 89 fe 48 c7b [ 4.576829] RSP: 0018:ffffaef9401ebd30 EFLAGS: 00010246 [ 4.577283] RAX: 0000000000000054 RBX: ffff8b75df3416c0 RCX: 0000000000000000 [ 4.577879] RDX: 0000000000000000 RSI: ffff8b757fa1a248 RDI: ffff8b757fa1a248 [ 4.578479] RBP: ffff8b75df3407e0 R08: 0000000000000000 R09: 0000000000000000 [ 4.579055] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8b75df340860 [ 4.579660] R13: 0000000000000000 R14: ffff8b75df3416c0 R15: ffff8b75d6bda620 [ 4.580257] FS: 00007f3016d08940(0000) GS:ffff8b757fa00000(0000) knlGS:0000000000000000 [ 4.580941] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4.581425] CR2: 00005555eedf9cf3 CR3: 0000000197fa2000 CR4: 00000000000006f0 [ 4.582034] Call Trace: [ 4.582256] __dentry_kill+0x86/0x190 [ 4.582577] ? dput+0x20/0x460 [ 4.582839] dput+0x2a6/0x460 [ 4.583100] debugfs_remove+0x40/0x60 [ 4.583403] blk_mq_debugfs_unregister_sched+0x15/0x30 [ 4.583825] blk_mq_exit_sched+0x6b/0xa0 [ 4.584154] __elevator_exit+0x32/0x50 [ 4.584460] elevator_switch_mq+0x63/0x170 [ 4.584801] elevator_switch+0x33/0x70 [ 4.585114] elv_iosched_store+0x135/0x1b0 [ 4.585450] queue_attr_store+0x47/0x70 [ 4.585779] kernfs_fop_write+0xdc/0x1c0 [ 4.586128] vfs_write+0xdb/0x1d0 [ 4.586423] ksys_write+0x65/0xe0 [ 4.586716] do_syscall_64+0x5c/0xa0 [ 4.587029] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 4.587472] RIP: 0033:0x7f3017d4a467 [ 4.587782] Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 854 [ 4.589353] RSP: 002b:00007ffc7fa4f518 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 4.589994] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f3017d4a467 [ 4.590605] RDX: 0000000000000004 RSI: 00007ffc7fa4f600 RDI: 000000000000000f [ 4.591212] RBP: 00007ffc7fa4f600 R08: fefefefefefefeff R09: ffffffff00000000 [ 4.591820] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000004 [ 4.592423] R13: 000055cdeaffe190 R14: 0000000000000004 R15: 00007f3017e1b700 [ 4.593032] Modules linked in: [ 4.593307] ---[ end trace 42f66ce1e6e1c1fe ]--- [ 4.593694] RIP: 0010:__list_del_entry_valid.cold+0x31/0x55 [ 4.594173] Code: 0d 3d a8 e8 14 dd bd ff 0f 0b 48 c7 c7 00 0e 3d a8 e8 06 dd bd ff 0f 0b 48 89 f2 48 89 fe 48 c7b [ 4.595756] RSP: 0018:ffffaef9401ebd30 EFLAGS: 00010246 [ 4.596205] RAX: 0000000000000054 RBX: ffff8b75df3416c0 RCX: 0000000000000000 [ 4.596818] RDX: 0000000000000000 RSI: ffff8b757fa1a248 RDI: ffff8b757fa1a248 [ 4.597423] RBP: ffff8b75df3407e0 R08: 0000000000000000 R09: 0000000000000000 [ 4.598038] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8b75df340860 [ 4.598640] R13: 0000000000000000 R14: ffff8b75df3416c0 R15: ffff8b75d6bda620 [ 4.599241] FS: 00007f3016d08940(0000) GS:ffff8b757fa00000(0000) knlGS:0000000000000000 [ 4.599936] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4.600415] CR2: 00005555eedf9cf3 CR3: 0000000197fa2000 CR4: 00000000000006f0 [ 4.601034] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:38 [ 4.601789] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 479, name: systemd-udevd [ 4.602505] INFO: lockdep is turned off. [ 4.602837] CPU: 0 PID: 479 Comm: systemd-udevd Tainted: G D 5.5.0-rc1+ #14 [ 4.603549] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu4 [ 4.604520] Call Trace: [ 4.604736] dump_stack+0x8f/0xd0 [ 4.605020] ___might_sleep.cold+0xb3/0xc3 [ 4.605374] exit_signals+0x30/0x2d0 [ 4.605689] do_exit+0xb4/0xc40 [ 4.605961] ? ksys_write+0x65/0xe0 [ 4.606256] rewind_stack_do_exit+0x17/0x20 [ 4.606624] note: systemd-udevd[479] exited with preempt_count 2 [ 4.611186] list_del corruption. prev->next should be ffff8b75df3489f0, but was ffff8b75df3480f0 [ 4.611972] ------------[ cut here ]------------ [ 4.612371] kernel BUG at lib/list_debug.c:51! [ 4.612783] invalid opcode: 0000 [#2] SMP NOPTI [ 4.613161] CPU: 0 PID: 511 Comm: systemd-udevd Tainted: G D W 5.5.0-rc1+ #14 [ 4.613875] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu4 [ 4.614842] RIP: 0010:__list_del_entry_valid.cold+0x31/0x55 [ 4.615309] Code: 0d 3d a8 e8 14 dd bd ff 0f 0b 48 c7 c7 00 0e 3d a8 e8 06 dd bd ff 0f 0b 48 89 f2 48 89 fe 48 c7b [ 4.616880] RSP: 0018:ffffaef9402a3d30 EFLAGS: 00010246 [ 4.617327] RAX: 0000000000000054 RBX: ffff8b75df347240 RCX: 0000000000000000 [ 4.617930] RDX: 0000000000000000 RSI: ffff8b757fa1a248 RDI: ffff8b757fa1a248 [ 4.618541] RBP: ffff8b75df348900 R08: 0000000000000000 R09: 0000000000000001 [ 4.619140] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8b75df348980 [ 4.619743] R13: 0000000000000000 R14: ffff8b75df347240 R15: ffff8b75d6a25020 [ 4.620349] FS: 00007f3016d08940(0000) GS:ffff8b757fa00000(0000) knlGS:0000000000000000 [ 4.621030] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4.621508] CR2: 000055cdeb042e48 CR3: 000000019e202000 CR4: 00000000000006f0 [ 4.622099] Call Trace: [ 4.622322] __dentry_kill+0x86/0x190 [ 4.622645] ? dput+0x20/0x460 [ 4.622911] dput+0x2a6/0x460 [ 4.623170] debugfs_remove+0x40/0x60 [ 4.623495] blk_mq_debugfs_unregister_sched+0x15/0x30 [ 4.623929] blk_mq_exit_sched+0x6b/0xa0 [ 4.624264] __elevator_exit+0x32/0x50 [ 4.624593] elevator_switch_mq+0x63/0x170 [ 4.624945] elevator_switch+0x33/0x70 [ 4.625268] elv_iosched_store+0x135/0x1b0 [ 4.625619] queue_attr_store+0x47/0x70 [ 4.625951] kernfs_fop_write+0xdc/0x1c0 [ 4.626289] vfs_write+0xdb/0x1d0 [ 4.626583] ksys_write+0x65/0xe0 [ 4.626870] do_syscall_64+0x5c/0xa0 [ 4.627180] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 4.627616] RIP: 0033:0x7f3017d4a467 [ 4.627925] Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 854 [ 4.629499] RSP: 002b:00007ffc7fa4f578 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 4.630136] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f3017d4a467 [ 4.630735] RDX: 0000000000000004 RSI: 00007ffc7fa4f660 RDI: 000000000000000f [ 4.631334] RBP: 00007ffc7fa4f660 R08: fefefefefefefeff R09: ffffffff00000000 [ 4.631940] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000004 [ 4.632540] R13: 000055cdeaffe190 R14: 0000000000000004 R15: 00007f3017e1b700 [ 4.633141] Modules linked in: [ 4.633414] ---[ end trace 42f66ce1e6e1c1ff ]--- [ 4.633814] RIP: 0010:__list_del_entry_valid.cold+0x31/0x55 [ 4.634289] Code: 0d 3d a8 e8 14 dd bd ff 0f 0b 48 c7 c7 00 0e 3d a8 e8 06 dd bd ff 0f 0b 48 89 f2 48 89 fe 48 c7b [ 4.635881] RSP: 0018:ffffaef9401ebd30 EFLAGS: 00010246 [ 4.636329] RAX: 0000000000000054 RBX: ffff8b75df3416c0 RCX: 0000000000000000 [ 4.636940] RDX: 0000000000000000 RSI: ffff8b757fa1a248 RDI: ffff8b757fa1a248 [ 4.637544] RBP: ffff8b75df3407e0 R08: 0000000000000000 R09: 0000000000000000 [ 4.638149] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8b75df340860 [ 4.638750] R13: 0000000000000000 R14: ffff8b75df3416c0 R15: ffff8b75d6bda620 [ 4.639360] FS: 00007f3016d08940(0000) GS:ffff8b757fa00000(0000) knlGS:0000000000000000 [ 4.640047] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4.640537] CR2: 000055cdeb042e48 CR3: 000000019e202000 CR4: 00000000000006f0 [ 4.641128] note: systemd-udevd[511] exited with preempt_count 2 Reverting that commit makes it work again. How does that untested and unreviewed patch end up in linux-next? Took me 30min to bisect.
On Wed, Dec 11, 2019 at 04:55:56PM +0100, David Hildenbrand wrote: [snip] > The patch in linux-next > > commit 653f0d05be0948e7610bb786e6570bb6c48a4e75 (HEAD, refs/bisect/bad) ... is no longer there. commit a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672 is; could you check if it fixes your reproducer?
On 11.12.19 19:46, Al Viro wrote: > On Wed, Dec 11, 2019 at 04:55:56PM +0100, David Hildenbrand wrote: > > [snip] > >> The patch in linux-next >> >> commit 653f0d05be0948e7610bb786e6570bb6c48a4e75 (HEAD, refs/bisect/bad) > > ... is no longer there. commit a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672 > is; could you check if it fixes your reproducer? > desktop: ~/git/linux memory_holes $ git fetch next desktop: ~/git/linux memory_holes $ git show a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672 fatal: bad object a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672 I'll go hunt for that commit :) ... guess it will show up in -next soon.
On 11.12.19 20:18, David Hildenbrand wrote: > On 11.12.19 19:46, Al Viro wrote: >> On Wed, Dec 11, 2019 at 04:55:56PM +0100, David Hildenbrand wrote: >> >> [snip] >> >>> The patch in linux-next >>> >>> commit 653f0d05be0948e7610bb786e6570bb6c48a4e75 (HEAD, refs/bisect/bad) >> >> ... is no longer there. commit a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672 >> is; could you check if it fixes your reproducer? >> > > desktop: ~/git/linux memory_holes $ git fetch next > desktop: ~/git/linux memory_holes $ git show > a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672 > fatal: bad object a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672 > > I'll go hunt for that commit :) ... guess it will show up in -next soon. > Found it on vfs.git - seems to do its job in my setup (booting Fedora 31 with custom kernel in QEMU).
diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 10090f11ab95..830405b401ce 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -129,7 +129,16 @@ struct dentry { enum dentry_d_lock_class { DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */ - DENTRY_D_LOCK_NESTED + /* + * used by spin_lock_nested when two d_lock need to be taken + * at the same time + */ + DENTRY_D_LOCK_NESTED, + /* + * used by spin_lock_nested when three d_lock need to be taken + * at the same time + */ + DENTRY_D_LOCK_NESTED_TWICE }; struct dentry_operations {
'dentry_d_lock_class' can be used for spin_lock_nested in case lockdep confused about two different dentry take the 'd_lock'. However, a single 'DENTRY_D_LOCK_NESTED' may not be enough if more than two dentry are involed. So, add in 'DENTRY_D_LOCK_NESTED_TWICE'. Signed-off-by: yu kuai <yukuai3@huawei.com> --- include/linux/dcache.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-)