Message ID | 87vbcw9i8g.fsf_-_@x220.int.ebiederm.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 08/04/2015 12:26 AM, Eric W. Biederman wrote: > > This is needed infrastructure for better handling of when files > or directories are moved out from under the root of a bind mount. > > Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> > --- > fs/mount.h | 7 +++ > fs/namespace.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++-- > include/linux/dcache.h | 7 +++ > 3 files changed, 130 insertions(+), 4 deletions(-) > > diff --git a/fs/mount.h b/fs/mount.h > index 14db05d424f7..e8f22970fe59 100644 > --- a/fs/mount.h > +++ b/fs/mount.h > @@ -27,6 +27,12 @@ struct mountpoint { > int m_count; > }; > > +struct mountroot { > + struct hlist_node r_hash; > + struct dentry *r_dentry; > + struct hlist_head r_list; > +}; > + > struct mount { > struct hlist_node mnt_hash; > struct mount *mnt_parent; > @@ -55,6 +61,7 @@ struct mount { > struct mnt_namespace *mnt_ns; /* containing namespace */ > struct mountpoint *mnt_mp; /* where is it mounted */ > struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ > + struct hlist_node mnt_mr_list; /* list mounts with the same mountroot */ > #ifdef CONFIG_FSNOTIFY > struct hlist_head mnt_fsnotify_marks; > __u32 mnt_fsnotify_mask; > diff --git a/fs/namespace.c b/fs/namespace.c > index 2b8aa15fd6df..2ce987af9afa 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -31,6 +31,8 @@ static unsigned int m_hash_mask __read_mostly; > static unsigned int m_hash_shift __read_mostly; > static unsigned int mp_hash_mask __read_mostly; > static unsigned int mp_hash_shift __read_mostly; > +static unsigned int mr_hash_mask __read_mostly; > +static unsigned int mr_hash_shift __read_mostly; > > static __initdata unsigned long mhash_entries; > static int __init set_mhash_entries(char *str) > @@ -52,6 +54,16 @@ static int __init set_mphash_entries(char *str) > } > __setup("mphash_entries=", set_mphash_entries); > > +static __initdata unsigned long mrhash_entries; > +static int __init set_mrhash_entries(char *str) > +{ > + 
if (!str) > + return 0; > + mrhash_entries = simple_strtoul(str, &str, 0); Nit: Any particular reason for using simple_* rather than kstrto* family of functions? > + return 1; > +} > +__setup("mrhash_entries=", set_mrhash_entries); > + > static u64 event; > static DEFINE_IDA(mnt_id_ida); > static DEFINE_IDA(mnt_group_ida); > @@ -61,6 +73,7 @@ static int mnt_group_start = 1; > > static struct hlist_head *mount_hashtable __read_mostly; > static struct hlist_head *mountpoint_hashtable __read_mostly; > +static struct hlist_head *mountroot_hashtable __read_mostly; > static struct kmem_cache *mnt_cache __read_mostly; > static DECLARE_RWSEM(namespace_sem); > > @@ -93,6 +106,13 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry) > return &mountpoint_hashtable[tmp & mp_hash_mask]; > } > > +static inline struct hlist_head *mr_hash(struct dentry *dentry) > +{ > + unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); > + tmp = tmp + (tmp >> mr_hash_shift); > + return &mountroot_hashtable[tmp & mr_hash_mask]; > +} > + > /* > * allocation is serialized by namespace_sem, but we need the spinlock to > * serialize with freeing. 
> @@ -234,6 +254,7 @@ static struct mount *alloc_vfsmnt(const char *name) > INIT_LIST_HEAD(&mnt->mnt_slave_list); > INIT_LIST_HEAD(&mnt->mnt_slave); > INIT_HLIST_NODE(&mnt->mnt_mp_list); > + INIT_HLIST_NODE(&mnt->mnt_mr_list); > #ifdef CONFIG_FSNOTIFY > INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); > #endif > @@ -779,6 +800,77 @@ static void put_mountpoint(struct mountpoint *mp) > } > } > > +static struct mountroot *lookup_mountroot(struct dentry *dentry) > +{ > + struct hlist_head *chain = mr_hash(dentry); > + struct mountroot *mr; > + > + hlist_for_each_entry(mr, chain, r_hash) { > + if (mr->r_dentry == dentry) > + return mr; > + } > + return NULL; > +} > + > +static int mnt_set_root(struct mount *mnt, struct dentry *root) > +{ > + struct mountroot *mr = NULL; > + > + read_seqlock_excl(&mount_lock); > + if (d_mountroot(root)) > + mr = lookup_mountroot(root); > + if (!mr) { > + struct mountroot *new; > + read_sequnlock_excl(&mount_lock); > + > + new = kmalloc(sizeof(struct mountroot), GFP_KERNEL); > + if (!new) > + return -ENOMEM; > + > + read_seqlock_excl(&mount_lock); > + mr = lookup_mountroot(root); > + if (mr) { > + kfree(new); > + } else { > + struct hlist_head *chain = mr_hash(root); > + > + mr = new; > + mr->r_dentry = root; > + INIT_HLIST_HEAD(&mr->r_list); > + hlist_add_head(&mr->r_hash, chain); > + > + spin_lock(&root->d_lock); > + root->d_flags |= DCACHE_MOUNTROOT; > + spin_unlock(&root->d_lock); > + } > + } > + mnt->mnt.mnt_root = root; > + hlist_add_head(&mnt->mnt_mr_list, &mr->r_list); > + read_sequnlock_excl(&mount_lock); > + > + return 0; > +} > + > +static void mnt_put_root(struct mount *mnt) > +{ > + struct dentry *root = mnt->mnt.mnt_root; > + struct mountroot *mr; > + > + read_seqlock_excl(&mount_lock); > + mr = lookup_mountroot(root); > + BUG_ON(!mr); > + hlist_del(&mnt->mnt_mr_list); > + if (hlist_empty(&mr->r_list)) { > + hlist_del(&mr->r_hash); > + spin_lock(&root->d_lock); > + root->d_flags &= ~DCACHE_MOUNTROOT; > + 
spin_unlock(&root->d_lock); > + kfree(mr); > + } > + read_sequnlock_excl(&mount_lock); > + dput(root); > +} > + > static inline int check_mnt(struct mount *mnt) > { > return mnt->mnt_ns == current->nsproxy->mnt_ns; > @@ -934,6 +1026,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void > { > struct mount *mnt; > struct dentry *root; > + int err; > > if (!type) > return ERR_PTR(-ENODEV); > @@ -952,8 +1045,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void > return ERR_CAST(root); > } > > - mnt->mnt.mnt_root = root; > mnt->mnt.mnt_sb = root->d_sb; > + err = mnt_set_root(mnt, root); > + if (err) { > + dput(root); > + deactivate_super(mnt->mnt.mnt_sb); > + mnt_free_id(mnt); > + free_vfsmnt(mnt); > + return ERR_PTR(err); > + } > + > mnt->mnt_mountpoint = mnt->mnt.mnt_root; > mnt->mnt_parent = mnt; > lock_mount_hash(); > @@ -985,6 +1086,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, > goto out_free; > } > > + err = mnt_set_root(mnt, root); > + if (err) > + goto out_free; > + > mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); > /* Don't allow unprivileged users to change mount flags */ > if (flag & CL_UNPRIVILEGED) { > @@ -1010,7 +1115,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, > > atomic_inc(&sb->s_active); > mnt->mnt.mnt_sb = sb; > - mnt->mnt.mnt_root = dget(root); > + dget(root); > mnt->mnt_mountpoint = mnt->mnt.mnt_root; > mnt->mnt_parent = mnt; > lock_mount_hash(); > @@ -1063,7 +1168,7 @@ static void cleanup_mnt(struct mount *mnt) > if (unlikely(mnt->mnt_pins.first)) > mnt_pin_kill(mnt); > fsnotify_vfsmount_delete(&mnt->mnt); > - dput(mnt->mnt.mnt_root); > + mnt_put_root(mnt); > deactivate_super(mnt->mnt.mnt_sb); > mnt_free_id(mnt); > call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); > @@ -3120,14 +3225,21 @@ void __init mnt_init(void) > mphash_entries, 19, > 0, > &mp_hash_shift, &mp_hash_mask, 0, 0); > + 
mountroot_hashtable = alloc_large_system_hash("Mountroot-cache", > + sizeof(struct hlist_head), > + mrhash_entries, 19, > + 0, > + &mr_hash_shift, &mr_hash_mask, 0, 0); > > - if (!mount_hashtable || !mountpoint_hashtable) > + if (!mount_hashtable || !mountpoint_hashtable || !mountroot_hashtable) > panic("Failed to allocate mount hash table\n"); > > for (u = 0; u <= m_hash_mask; u++) > INIT_HLIST_HEAD(&mount_hashtable[u]); > for (u = 0; u <= mp_hash_mask; u++) > INIT_HLIST_HEAD(&mountpoint_hashtable[u]); > + for (u = 0; u <= mr_hash_mask; u++) > + INIT_HLIST_HEAD(&mountroot_hashtable[u]); > > kernfs_init(); > > diff --git a/include/linux/dcache.h b/include/linux/dcache.h > index d67ae119cf4e..52a5e6915f58 100644 > --- a/include/linux/dcache.h > +++ b/include/linux/dcache.h > @@ -228,6 +228,8 @@ struct dentry_operations { > #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ > #define DCACHE_OP_SELECT_INODE 0x02000000 /* Unioned entry: dcache op selects inode */ > > +#define DCACHE_MOUNTROOT 0x04000000 /* Root of a vfsmount */ > + > extern seqlock_t rename_lock; > > /* > @@ -404,6 +406,11 @@ static inline bool d_mountpoint(const struct dentry *dentry) > return dentry->d_flags & DCACHE_MOUNTED; > } > > +static inline bool d_mountroot(const struct dentry *dentry) > +{ > + return dentry->d_flags & DCACHE_MOUNTROOT; > +} > + > /* > * Directory cache entry type accessor functions. > */ > -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Nikolay Borisov <kernel@kyup.com> writes: >> diff --git a/fs/namespace.c b/fs/namespace.c >> index 2b8aa15fd6df..2ce987af9afa 100644 >> --- a/fs/namespace.c >> +++ b/fs/namespace.c >> @@ -31,6 +31,8 @@ static unsigned int m_hash_mask __read_mostly; >> static unsigned int m_hash_shift __read_mostly; >> static unsigned int mp_hash_mask __read_mostly; >> static unsigned int mp_hash_shift __read_mostly; >> +static unsigned int mr_hash_mask __read_mostly; >> +static unsigned int mr_hash_shift __read_mostly; >> >> static __initdata unsigned long mhash_entries; >> static int __init set_mhash_entries(char *str) >> @@ -52,6 +54,16 @@ static int __init set_mphash_entries(char *str) >> } >> __setup("mphash_entries=", set_mphash_entries); >> >> +static __initdata unsigned long mrhash_entries; >> +static int __init set_mrhash_entries(char *str) >> +{ >> + if (!str) >> + return 0; >> + mrhash_entries = simple_strtoul(str, &str, 0); > > Nit: Any particular reason for using simple_* rather than kstrto* family > of functions? That is what set_mhash_entries, and set_mphash_entries do, and I maintained the existing style in the code. It does look like a followup change to add error handling in the pathological cases might be worthwhile. Although it would probably be even better to convert these hash tables into rcu resizeable hash tables that can automatically grow to the size needed. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/mount.h b/fs/mount.h index 14db05d424f7..e8f22970fe59 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -27,6 +27,12 @@ struct mountpoint { int m_count; }; +struct mountroot { + struct hlist_node r_hash; + struct dentry *r_dentry; + struct hlist_head r_list; +}; + struct mount { struct hlist_node mnt_hash; struct mount *mnt_parent; @@ -55,6 +61,7 @@ struct mount { struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ + struct hlist_node mnt_mr_list; /* list mounts with the same mountroot */ #ifdef CONFIG_FSNOTIFY struct hlist_head mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; diff --git a/fs/namespace.c b/fs/namespace.c index 2b8aa15fd6df..2ce987af9afa 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -31,6 +31,8 @@ static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; static unsigned int mp_hash_mask __read_mostly; static unsigned int mp_hash_shift __read_mostly; +static unsigned int mr_hash_mask __read_mostly; +static unsigned int mr_hash_shift __read_mostly; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) @@ -52,6 +54,16 @@ static int __init set_mphash_entries(char *str) } __setup("mphash_entries=", set_mphash_entries); +static __initdata unsigned long mrhash_entries; +static int __init set_mrhash_entries(char *str) +{ + if (!str) + return 0; + mrhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mrhash_entries=", set_mrhash_entries); + static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); @@ -61,6 +73,7 @@ static int mnt_group_start = 1; static struct hlist_head *mount_hashtable __read_mostly; static struct hlist_head *mountpoint_hashtable __read_mostly; +static struct hlist_head *mountroot_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; static 
DECLARE_RWSEM(namespace_sem); @@ -93,6 +106,13 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry) return &mountpoint_hashtable[tmp & mp_hash_mask]; } +static inline struct hlist_head *mr_hash(struct dentry *dentry) +{ + unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); + tmp = tmp + (tmp >> mr_hash_shift); + return &mountroot_hashtable[tmp & mr_hash_mask]; +} + /* * allocation is serialized by namespace_sem, but we need the spinlock to * serialize with freeing. @@ -234,6 +254,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); + INIT_HLIST_NODE(&mnt->mnt_mr_list); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif @@ -779,6 +800,77 @@ static void put_mountpoint(struct mountpoint *mp) } } +static struct mountroot *lookup_mountroot(struct dentry *dentry) +{ + struct hlist_head *chain = mr_hash(dentry); + struct mountroot *mr; + + hlist_for_each_entry(mr, chain, r_hash) { + if (mr->r_dentry == dentry) + return mr; + } + return NULL; +} + +static int mnt_set_root(struct mount *mnt, struct dentry *root) +{ + struct mountroot *mr = NULL; + + read_seqlock_excl(&mount_lock); + if (d_mountroot(root)) + mr = lookup_mountroot(root); + if (!mr) { + struct mountroot *new; + read_sequnlock_excl(&mount_lock); + + new = kmalloc(sizeof(struct mountroot), GFP_KERNEL); + if (!new) + return -ENOMEM; + + read_seqlock_excl(&mount_lock); + mr = lookup_mountroot(root); + if (mr) { + kfree(new); + } else { + struct hlist_head *chain = mr_hash(root); + + mr = new; + mr->r_dentry = root; + INIT_HLIST_HEAD(&mr->r_list); + hlist_add_head(&mr->r_hash, chain); + + spin_lock(&root->d_lock); + root->d_flags |= DCACHE_MOUNTROOT; + spin_unlock(&root->d_lock); + } + } + mnt->mnt.mnt_root = root; + hlist_add_head(&mnt->mnt_mr_list, &mr->r_list); + read_sequnlock_excl(&mount_lock); + + return 0; +} + +static void mnt_put_root(struct mount 
*mnt) +{ + struct dentry *root = mnt->mnt.mnt_root; + struct mountroot *mr; + + read_seqlock_excl(&mount_lock); + mr = lookup_mountroot(root); + BUG_ON(!mr); + hlist_del(&mnt->mnt_mr_list); + if (hlist_empty(&mr->r_list)) { + hlist_del(&mr->r_hash); + spin_lock(&root->d_lock); + root->d_flags &= ~DCACHE_MOUNTROOT; + spin_unlock(&root->d_lock); + kfree(mr); + } + read_sequnlock_excl(&mount_lock); + dput(root); +} + static inline int check_mnt(struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; @@ -934,6 +1026,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void { struct mount *mnt; struct dentry *root; + int err; if (!type) return ERR_PTR(-ENODEV); @@ -952,8 +1045,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void return ERR_CAST(root); } - mnt->mnt.mnt_root = root; mnt->mnt.mnt_sb = root->d_sb; + err = mnt_set_root(mnt, root); + if (err) { + dput(root); + deactivate_super(mnt->mnt.mnt_sb); + mnt_free_id(mnt); + free_vfsmnt(mnt); + return ERR_PTR(err); + } + mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; lock_mount_hash(); @@ -985,6 +1086,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, goto out_free; } + err = mnt_set_root(mnt, root); + if (err) + goto out_free; + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); /* Don't allow unprivileged users to change mount flags */ if (flag & CL_UNPRIVILEGED) { @@ -1010,7 +1115,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; - mnt->mnt.mnt_root = dget(root); + dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; lock_mount_hash(); @@ -1063,7 +1168,7 @@ static void cleanup_mnt(struct mount *mnt) if (unlikely(mnt->mnt_pins.first)) mnt_pin_kill(mnt); fsnotify_vfsmount_delete(&mnt->mnt); - dput(mnt->mnt.mnt_root); + mnt_put_root(mnt); deactivate_super(mnt->mnt.mnt_sb); 
mnt_free_id(mnt); call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); @@ -3120,14 +3225,21 @@ void __init mnt_init(void) mphash_entries, 19, 0, &mp_hash_shift, &mp_hash_mask, 0, 0); + mountroot_hashtable = alloc_large_system_hash("Mountroot-cache", + sizeof(struct hlist_head), + mrhash_entries, 19, + 0, + &mr_hash_shift, &mr_hash_mask, 0, 0); - if (!mount_hashtable || !mountpoint_hashtable) + if (!mount_hashtable || !mountpoint_hashtable || !mountroot_hashtable) panic("Failed to allocate mount hash table\n"); for (u = 0; u <= m_hash_mask; u++) INIT_HLIST_HEAD(&mount_hashtable[u]); for (u = 0; u <= mp_hash_mask; u++) INIT_HLIST_HEAD(&mountpoint_hashtable[u]); + for (u = 0; u <= mr_hash_mask; u++) + INIT_HLIST_HEAD(&mountroot_hashtable[u]); kernfs_init(); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index d67ae119cf4e..52a5e6915f58 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -228,6 +228,8 @@ struct dentry_operations { #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ #define DCACHE_OP_SELECT_INODE 0x02000000 /* Unioned entry: dcache op selects inode */ +#define DCACHE_MOUNTROOT 0x04000000 /* Root of a vfsmount */ + extern seqlock_t rename_lock; /* @@ -404,6 +406,11 @@ static inline bool d_mountpoint(const struct dentry *dentry) return dentry->d_flags & DCACHE_MOUNTED; } +static inline bool d_mountroot(const struct dentry *dentry) +{ + return dentry->d_flags & DCACHE_MOUNTROOT; +} + /* * Directory cache entry type accessor functions. */
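This adds the infrastructure needed to better handle the case where files or directories are moved out from under the root of a bind mount. Each dentry that serves as the root of a mount gets a struct mountroot entry in a new hash table, listing the mounts rooted at that dentry, and is flagged with DCACHE_MOUNTROOT. Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> --- fs/mount.h | 7 +++ fs/namespace.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/dcache.h | 7 +++ 3 files changed, 130 insertions(+), 4 deletions(-)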