Message ID | YnEeuw6fd1A8usjj@miu.piliscsaba.redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [RFC] getting misc stats/attributes via xattr API | expand |
On Tue, May 3, 2022 at 3:23 PM Miklos Szeredi <miklos@szeredi.hu> wrote: > > This is a simplification of the getvalues(2) prototype and moving it to the > getxattr(2) interface, as suggested by Dave. > > The patch itself just adds the possibility to retrieve a single line of > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo > patchset grew out of). > > But this should be able to serve Amir's per-sb iostats, as well as a host of > other cases where some statistic needs to be retrieved from some object. Note: > a filesystem object often represents other kinds of objects (such as processes > in /proc) so this is not limited to fs attributes. > > This also opens up the interface to setting attributes via setxattr(2). > > After some pondering I made the namespace so: > > : - root > bar - an attribute > foo: - a folder (can contain attributes and/or folders) > > The contents of a folder is represented by a null separated list of names. > > Examples: > > $ getfattr -etext -n ":" . > # file: . > :="mnt:\000mntns:" > > $ getfattr -etext -n ":mnt:" . > # file: . > :mnt:="info" > > $ getfattr -etext -n ":mnt:info" . > # file: . > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" > > $ getfattr -etext -n ":mntns:" . > # file: . > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" > > $ getfattr -etext -n ":mntns:28:" . > # file: . > :mntns:28:="info" > > Comments? > I like that :) It should be noted that while this API mandates text keys, it does not mandate text values, so for example, sb iostats could be exported as text or as binary struct, or as individual text/binary records or all of the above. We can relive this sort of discussion for every property that we add. Fun! Folks interested in this discussion are welcome to join the Zoom discussion on LSFMM tomorrow, Wed at 11AM PDT: https://zoom.us/j/99394450657?pwd=ZHE2TzdXV2MzWE9yVnpLYzJNZDBuUT09 Folks who did not get an invite and would like to participate, please email me in private. Thanks, Amir. > > --- > fs/Makefile | 2 > fs/mount.h | 8 + > fs/namespace.c | 15 ++- > fs/pnode.h | 2 > fs/proc_namespace.c | 15 ++- > fs/values.c | 242 +++++++++++++++++++++++++++++++++++++++++++++++++ > fs/xattr.c | 16 ++- > include/linux/values.h | 11 ++ > 8 files changed, 295 insertions(+), 16 deletions(-) > > --- a/fs/Makefile > +++ b/fs/Makefile > @@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table. > pnode.o splice.o sync.o utimes.o d_path.o \ > stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ > fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ > - kernel_read_file.o remap_range.o > + kernel_read_file.o remap_range.o values.o > > ifeq ($(CONFIG_BLOCK),y) > obj-y += buffer.o direct-io.o mpage.o > --- a/fs/mount.h > +++ b/fs/mount.h > @@ -148,3 +148,11 @@ static inline bool is_anon_ns(struct mnt > } > > extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); > + > +struct mount *mnt_list_next(struct mnt_namespace *ns, struct list_head *p); > +extern void namespace_lock_read(void); > +extern void namespace_unlock_read(void); > +extern int show_mountinfo_root(struct seq_file *m, struct vfsmount *mnt, > + struct path *root); > +extern bool is_path_reachable(struct mount *, struct dentry *, > + const struct path *root); > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -1332,9 +1332,7 @@ struct vfsmount *mnt_clone_internal(cons > return &p->mnt; > } > > -#ifdef CONFIG_PROC_FS > -static struct mount *mnt_list_next(struct mnt_namespace *ns, > - struct list_head *p) > +struct mount *mnt_list_next(struct mnt_namespace *ns, struct list_head *p) > { > struct mount *mnt, *ret = NULL; > > @@ -1351,6 +1349,7 @@ static struct mount *mnt_list_next(struc > return ret; > } > > +#ifdef CONFIG_PROC_FS > /* iterator; we want it to have access to namespace_sem, thus here... */ > static void *m_start(struct seq_file *m, loff_t *pos) > { > @@ -1507,6 +1506,16 @@ static inline void namespace_lock(void) > down_write(&namespace_sem); > } > > +void namespace_lock_read(void) > +{ > + down_read(&namespace_sem); > +} > + > +void namespace_unlock_read(void) > +{ > + up_read(&namespace_sem); > +} > + > enum umount_tree_flags { > UMOUNT_SYNC = 1, > UMOUNT_PROPAGATE = 2, > --- a/fs/pnode.h > +++ b/fs/pnode.h > @@ -50,7 +50,5 @@ void mnt_set_mountpoint(struct mount *, > void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, > struct mount *mnt); > struct mount *copy_tree(struct mount *, struct dentry *, int); > -bool is_path_reachable(struct mount *, struct dentry *, > - const struct path *root); > int count_mounts(struct mnt_namespace *ns, struct mount *mnt); > #endif /* _LINUX_PNODE_H */ > --- a/fs/proc_namespace.c > +++ b/fs/proc_namespace.c > @@ -132,9 +132,9 @@ static int show_vfsmnt(struct seq_file * > return err; > } > > -static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) > +int show_mountinfo_root(struct seq_file *m, struct vfsmount *mnt, > + struct path *root) > { > - struct proc_mounts *p = m->private; > struct mount *r = real_mount(mnt); > struct super_block *sb = mnt->mnt_sb; > struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; > @@ -152,7 +152,7 @@ static int show_mountinfo(struct seq_fil > seq_putc(m, ' '); > > /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ > - err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); > + err = seq_path_root(m, &mnt_path, root, " \t\n\\"); > if (err) > goto out; > > @@ -164,7 +164,7 @@ static int show_mountinfo(struct seq_fil > seq_printf(m, " shared:%i", r->mnt_group_id); > if (IS_MNT_SLAVE(r)) { > int master = r->mnt_master->mnt_group_id; > - int dom = get_dominating_id(r, &p->root); > + int dom = get_dominating_id(r, root); > seq_printf(m, " master:%i", master); > if (dom && dom != master) > seq_printf(m, " propagate_from:%i", dom); > @@ -194,6 +194,13 @@ static int show_mountinfo(struct seq_fil > return err; > } > > +static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) > +{ > + struct proc_mounts *p = m->private; > + > + return show_mountinfo_root(m, mnt, &p->root); > +} > + > static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) > { > struct proc_mounts *p = m->private; > --- /dev/null > +++ b/fs/values.c > @@ -0,0 +1,242 @@ > +#include <linux/values.h> > +#include <linux/fs_struct.h> > +#include <linux/seq_file.h> > +#include <linux/nsproxy.h> > +#include "../lib/kstrtox.h" > +#include "mount.h" > + > +struct val_string { > + const char *str; > + size_t len; > +}; > + > +struct val_iter { > + struct val_string name; > + struct seq_file seq; > + int error; > +}; > + > +struct val_desc { > + struct val_string name; > + union { > + u64 idx; > + int (*get)(struct val_iter *vi, const struct path *path); > + }; > +}; > + > +#define VAL_STRING(x) { .str = x, .len = sizeof(x) - 1 } > +#define VD_NAME(x) .name = VAL_STRING(x) > + > +static int val_err(struct val_iter *vi, int err) > +{ > + vi->error = err; > + return 0; > +} > + > +static int val_end_seq(struct val_iter *vi) > +{ > + if (vi->seq.count == vi->seq.size) > + return -EOVERFLOW; > + > + return 0; > +} > + > +static inline void val_string_skip(struct val_string *s, size_t count) > +{ > + WARN_ON(s->len < count); > + s->str += count; > + s->len -= count; > +} > + > +static bool val_string_prefix(const struct val_string *p, > + const struct val_string *s) > +{ > + return s->len >= p->len && !memcmp(s->str, p->str, p->len); > +} > + > +static struct val_desc *val_lookup(struct val_iter *vi, struct val_desc *vd) > +{ > + for (; vd->name.len; vd++) { > + if (val_string_prefix(&vd->name, &vi->name)) { > + val_string_skip(&vi->name, vd->name.len); > + break; > + } > + } > + return vd; > +} > + > +static int val_get_group(struct val_iter *vi, struct val_desc *vd) > +{ > + for (; vd->name.len; vd++) > + seq_write(&vi->seq, vd->name.str, vd->name.len + 1); > + > + return val_end_seq(vi); > +} > + > +enum { > + VAL_MNT_INFO, > +}; > + > +static struct val_desc val_mnt_group[] = { > + { VD_NAME("info"), .idx = VAL_MNT_INFO }, > + { } > +}; > + > +static int val_mnt_show(struct val_iter *vi, struct vfsmount *mnt) > +{ > + struct val_desc *vd = val_lookup(vi, val_mnt_group); > + struct path root; > + > + if (!vd->name.str) > + return val_err(vi, -ENOENT); > + > + switch(vd->idx) { > + case VAL_MNT_INFO: > + get_fs_root(current->fs, &root); > + show_mountinfo_root(&vi->seq, mnt, &root); > + path_put(&root); > + break; > + } > + > + return 0; > +} > + > +static int val_mnt_get(struct val_iter *vi, const struct path *path) > +{ > + int err; > + > + if (!vi->name.len) > + return val_get_group(vi, val_mnt_group); > + > + namespace_lock_read(); > + err = val_mnt_show(vi, path->mnt); > + namespace_unlock_read(); > + > + return err; > +} > + > +/* called with namespace_sem held for read */ > +static struct vfsmount *mnt_lookup_by_id(struct mnt_namespace *ns, > + struct path *root, int id) > +{ > + struct mount *m; > + > + for (m = mnt_list_next(ns, &ns->list); m; m = mnt_list_next(ns, &m->mnt_list)) { > + if (m->mnt_id == id) { > + if (is_path_reachable(m, m->mnt.mnt_root, root)) > + return mntget(&m->mnt); > + else > + return NULL; > + } > + } > + return NULL; > +} > + > +static void seq_mnt_list(struct seq_file *seq, struct mnt_namespace *ns, > + struct path *root) > +{ > + struct mount *m; > + > + namespace_lock_read(); > + for (m = mnt_list_next(ns, &ns->list); m; m = mnt_list_next(ns, &m->mnt_list)) { > + if (is_path_reachable(m, m->mnt.mnt_root, root)) { > + seq_printf(seq, "%i:", m->mnt_id); > + seq_putc(seq, '\0'); > + } > + } > + namespace_unlock_read(); > +} > + > +static int val_mntns_get(struct val_iter *vi, const struct path *path) > +{ > + struct mnt_namespace *mnt_ns = current->nsproxy->mnt_ns; > + struct vfsmount *mnt; > + struct path root; > + unsigned long long mnt_id; > + unsigned int end; > + int err; > + > + if (!vi->name.len) { > + get_fs_root(current->fs, &root); > + seq_mnt_list(&vi->seq, mnt_ns, &root); > + path_put(&root); > + return val_end_seq(vi); > + } > + > + end = _parse_integer(vi->name.str, 10, &mnt_id); > + if (end & KSTRTOX_OVERFLOW) > + return val_err(vi, -ENOENT); > + if (vi->name.str[end] != VAL_SEP) > + return val_err(vi, -ENOENT); > + val_string_skip(&vi->name, end + 1); > + > + namespace_lock_read(); > + get_fs_root(current->fs, &root); > + mnt = mnt_lookup_by_id(mnt_ns, &root, mnt_id); > + path_put(&root); > + if (!mnt) { > + namespace_unlock_read(); > + return val_err(vi, -ENOENT); > + } > + if (vi->name.len) > + err = val_mnt_show(vi, mnt); > + else > + err = val_get_group(vi, val_mnt_group); > + > + namespace_unlock_read(); > + mntput(mnt); > + > + return err; > +} > + > + > + > +static struct val_desc val_toplevel_group[] = { > + { VD_NAME("mnt:"), .get = val_mnt_get, }, > + { VD_NAME("mntns:"), .get = val_mntns_get, }, > + { }, > +}; > + > +static int getvalues(struct val_iter *vi, const struct path *path) > +{ > + struct val_desc *vd; > + int err; > + > + if (!vi->name.len) > + return val_get_group(vi, val_toplevel_group); > + > + vd = val_lookup(vi, val_toplevel_group); > + if (!vd->name.len) > + err = val_err(vi, -ENOENT); > + else > + err = vd->get(vi, path); > + > + return err ?: vi->error; > +} > + > +ssize_t val_getxattr(struct path *path, const char *name, size_t namelen, > + void __user *value, size_t size) > +{ > + int err; > + char val[1024]; > + struct val_iter vi = { > + .name = { .str = name, .len = namelen }, > + .seq = { .buf = val, .size = min(sizeof(val), size) }, > + }; > + > + if (!size) > + return sizeof(val); > + > + val_string_skip(&vi.name, 1); > + > + err = getvalues(&vi, path); > + if (err < 0) > + return err; > + > + WARN_ON(vi.seq.count > size); > + if (copy_to_user(value, vi.seq.buf, vi.seq.count)) > + return -EFAULT; > + > + return vi.seq.count; > +} > + > --- a/fs/xattr.c > +++ b/fs/xattr.c > @@ -22,6 +22,7 @@ > #include <linux/audit.h> > #include <linux/vmalloc.h> > #include <linux/posix_acl_xattr.h> > +#include <linux/values.h> > > #include <linux/uaccess.h> > > @@ -643,12 +644,13 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, cons > * Extended attribute GET operations > */ > static ssize_t > -getxattr(struct user_namespace *mnt_userns, struct dentry *d, > - const char __user *name, void __user *value, size_t size) > +getxattr(struct path *path, const char __user *name, > + void __user *value, size_t size) > { > ssize_t error; > void *kvalue = NULL; > char kname[XATTR_NAME_MAX + 1]; > + struct user_namespace *mnt_userns = mnt_user_ns(path->mnt); > > error = strncpy_from_user(kname, name, sizeof(kname)); > if (error == 0 || error == sizeof(kname)) > @@ -656,6 +658,9 @@ getxattr(struct user_namespace *mnt_user > if (error < 0) > return error; > > + if (kname[0] == VAL_SEP) > + return val_getxattr(path, kname, error, value, size); > + > if (size) { > if (size > XATTR_SIZE_MAX) > size = XATTR_SIZE_MAX; > @@ -664,7 +669,7 @@ getxattr(struct user_namespace *mnt_user > return -ENOMEM; > } > > - error = vfs_getxattr(mnt_userns, d, kname, kvalue, size); > + error = vfs_getxattr(mnt_userns, path->dentry, kname, kvalue, size); > if (error > 0) { > if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || > (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) > @@ -693,7 +698,7 @@ static ssize_t path_getxattr(const char > error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); > if (error) > return error; > - error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size); > + error = getxattr(&path, name, value, size); > path_put(&path); > if (retry_estale(error, lookup_flags)) { > lookup_flags |= LOOKUP_REVAL; > @@ -723,8 +728,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, cons > if (!f.file) > return error; > audit_file(f.file); > - error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry, > - name, value, size); > + error = getxattr(&f.file->f_path, name, value, size); > fdput(f); > return error; > } > --- /dev/null > +++ b/include/linux/values.h > @@ -0,0 +1,11 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > + > +#include <linux/types.h> > + > +#define VAL_SEP ':' > + > +struct path; > + > +ssize_t val_getxattr(struct path *path, const char *name, size_t namelen, > + void __user *value, size_t size); > +
On Tue, May 03, 2022 at 05:39:46PM +0300, Amir Goldstein wrote: > On Tue, May 3, 2022 at 3:23 PM Miklos Szeredi <miklos@szeredi.hu> wrote: > > > > This is a simplification of the getvalues(2) prototype and moving it to the > > getxattr(2) interface, as suggested by Dave. > > > > The patch itself just adds the possibility to retrieve a single line of > > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo > > patchset grew out of). > > > > But this should be able to serve Amir's per-sb iostats, as well as a host of > > other cases where some statistic needs to be retrieved from some object. Note: > > a filesystem object often represents other kinds of objects (such as processes > > in /proc) so this is not limited to fs attributes. > > > > This also opens up the interface to setting attributes via setxattr(2). > > > > After some pondering I made the namespace so: > > > > : - root > > bar - an attribute > > foo: - a folder (can contain attributes and/or folders) > > > > The contents of a folder is represented by a null separated list of names. > > > > Examples: > > > > $ getfattr -etext -n ":" . > > # file: . > > :="mnt:\000mntns:" > > > > $ getfattr -etext -n ":mnt:" . > > # file: . > > :mnt:="info" > > > > $ getfattr -etext -n ":mnt:info" . > > # file: . > > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" > > > > $ getfattr -etext -n ":mntns:" . > > # file: . > > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" > > > > $ getfattr -etext -n ":mntns:28:" . > > # file: . > > :mntns:28:="info" > > > > Comments? > > > > I like that :) > > It should be noted that while this API mandates text keys, > it does not mandate text values, so for example, sb iostats could be > exported as text or as binary struct, or as individual text/binary records or > all of the above. Ugh, no, that would be a total mess. Don't go exporting random binary structs depending on the file, that's going to be completely unmaintainable. As it is, this is going to be hard enough with random text fields. As for this format, it needs to be required to be documented in Documentation/ABI/ for each entry and key type so that we have a chance of knowing what is going on and tracking how things are working and validating stuff. thanks, greg k-h
On Tue, 3 May 2022 at 16:53, Greg KH <gregkh@linuxfoundation.org> wrote: > > On Tue, May 03, 2022 at 05:39:46PM +0300, Amir Goldstein wrote: > > It should be noted that while this API mandates text keys, > > it does not mandate text values, so for example, sb iostats could be > > exported as text or as binary struct, or as individual text/binary records or > > all of the above. > > Ugh, no, that would be a total mess. Don't go exporting random binary > structs depending on the file, that's going to be completely > unmaintainable. As it is, this is going to be hard enough with random > text fields. > > As for this format, it needs to be required to be documented in > Documentation/ABI/ for each entry and key type so that we have a chance > of knowing what is going on and tracking how things are working and > validating stuff. My preference would be a single text value for each key. Contents of ":mnt:info" contradicts that, but mountinfo has a long established, well documented format, and nothing prevents exporting individual attributes with separate names as well (the getvalues(2) patch did exactly that). Thanks, Miklos
On Tue, May 3, 2022 at 6:04 PM Miklos Szeredi <miklos@szeredi.hu> wrote: > > On Tue, 3 May 2022 at 16:53, Greg KH <gregkh@linuxfoundation.org> wrote: > > > > On Tue, May 03, 2022 at 05:39:46PM +0300, Amir Goldstein wrote: > > > > It should be noted that while this API mandates text keys, > > > it does not mandate text values, so for example, sb iostats could be > > > exported as text or as binary struct, or as individual text/binary records or > > > all of the above. > > > > Ugh, no, that would be a total mess. Don't go exporting random binary > > structs depending on the file, that's going to be completely > > unmaintainable. As it is, this is going to be hard enough with random > > text fields. > > > > As for this format, it needs to be required to be documented in > > Documentation/ABI/ for each entry and key type so that we have a chance > > of knowing what is going on and tracking how things are working and > > validating stuff. > > My preference would be a single text value for each key. > > Contents of ":mnt:info" contradicts that, but mountinfo has a long > established, well documented format, and nothing prevents exporting > individual attributes with separate names as well (the getvalues(2) > patch did exactly that). > Right, the fun is that ":mnt:info" and ":mnt:info:" can co-exist. Thanks, Amir.
On Tue, May 03, 2022 at 05:04:06PM +0200, Miklos Szeredi wrote: > On Tue, 3 May 2022 at 16:53, Greg KH <gregkh@linuxfoundation.org> wrote: > > > > On Tue, May 03, 2022 at 05:39:46PM +0300, Amir Goldstein wrote: > > > > It should be noted that while this API mandates text keys, > > > it does not mandate text values, so for example, sb iostats could be > > > exported as text or as binary struct, or as individual text/binary records or > > > all of the above. > > > > Ugh, no, that would be a total mess. Don't go exporting random binary > > structs depending on the file, that's going to be completely > > unmaintainable. As it is, this is going to be hard enough with random > > text fields. > > > > As for this format, it needs to be required to be documented in > > Documentation/ABI/ for each entry and key type so that we have a chance > > of knowing what is going on and tracking how things are working and > > validating stuff. > > My preference would be a single text value for each key. Yes! That's the only sane way to maintain apis, and is why we do that for sysfs. If the key isn't present, there's no value, so userspace "knows" this automatically and parsing this is trivial. > Contents of ":mnt:info" contradicts that, but mountinfo has a long > established, well documented format, and nothing prevents exporting > individual attributes with separate names as well (the getvalues(2) > patch did exactly that). I understand, for "legacy" things like this, that's fine, but don't add new fields or change them over time please, that way just gets us back to the nightmare of preserving /proc/ file apis. thanks, greg k-h
On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > This is a simplification of the getvalues(2) prototype and moving it to the > getxattr(2) interface, as suggested by Dave. > > The patch itself just adds the possibility to retrieve a single line of > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo > patchset grew out of). > > But this should be able to serve Amir's per-sb iostats, as well as a host of > other cases where some statistic needs to be retrieved from some object. Note: > a filesystem object often represents other kinds of objects (such as processes > in /proc) so this is not limited to fs attributes. > > This also opens up the interface to setting attributes via setxattr(2). > > After some pondering I made the namespace so: > > : - root > bar - an attribute > foo: - a folder (can contain attributes and/or folders) > > The contents of a folder is represented by a null separated list of names. > > Examples: > > $ getfattr -etext -n ":" . > # file: . > :="mnt:\000mntns:" > > $ getfattr -etext -n ":mnt:" . > # file: . > :mnt:="info" > > $ getfattr -etext -n ":mnt:info" . > # file: . > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" > > $ getfattr -etext -n ":mntns:" . > # file: . > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" > > $ getfattr -etext -n ":mntns:28:" . > # file: . > :mntns:28:="info" > > Comments? I like. :) > Thanks, > Miklos > > --- > fs/Makefile | 2 > fs/mount.h | 8 + > fs/namespace.c | 15 ++- > fs/pnode.h | 2 > fs/proc_namespace.c | 15 ++- > fs/values.c | 242 +++++++++++++++++++++++++++++++++++++++++++++++++ > fs/xattr.c | 16 ++- > include/linux/values.h | 11 ++ "values" is a very generic name - probably should end up being something more descriptive of the functionality is provides, especially if the header file is going to be dumped in include/linux/. I don't really have a good suggestion at the moment, though. .... > + > +enum { > + VAL_MNT_INFO, > +}; > + > +static struct val_desc val_mnt_group[] = { > + { VD_NAME("info"), .idx = VAL_MNT_INFO }, > + { } > +}; .... > + > + > +static struct val_desc val_toplevel_group[] = { > + { VD_NAME("mnt:"), .get = val_mnt_get, }, > + { VD_NAME("mntns:"), .get = val_mntns_get, }, > + { }, > +}; I know this is an early POC, my main question is how do you envisiage this table driven structure being extended down from just the mount into the filesystem so we can expose filesystem specific information that isn't covered by generic interfaces like statx? Cheers, Dave.
On Wed, 4 May 2022 at 00:43, Dave Chinner <david@fromorbit.com> wrote: > "values" is a very generic name - probably should end up being > something more descriptive of the functionality is provides, > especially if the header file is going to be dumped in > include/linux/. I don't really have a good suggestion at the moment, > though. The obvious ones are stat and attr, which are taken already. Info is probably too generic as well. Ideas are welcome. > > .... > > > + > > +enum { > > + VAL_MNT_INFO, > > +}; > > + > > +static struct val_desc val_mnt_group[] = { > > + { VD_NAME("info"), .idx = VAL_MNT_INFO }, > > + { } > > +}; > .... > > + > > + > > +static struct val_desc val_toplevel_group[] = { > > + { VD_NAME("mnt:"), .get = val_mnt_get, }, > > + { VD_NAME("mntns:"), .get = val_mntns_get, }, > > + { }, > > +}; > > I know this is an early POC, my main question is how do you > envisiage this table driven structure being extended down from just > the mount into the filesystem so we can expose filesystem specific > information that isn't covered by generic interfaces like statx? I was thinking of adding a i_op callback. The details are a bit fuzzy, since the vfs and the fs would have to work together when listing the attributes and possibly also when retrieving the attribute itself (think mount options). Thanks, Miklos
On Wed, May 4, 2022 at 10:18 AM Miklos Szeredi <miklos@szeredi.hu> wrote: > > On Wed, 4 May 2022 at 00:43, Dave Chinner <david@fromorbit.com> wrote: > > > "values" is a very generic name - probably should end up being > > something more descriptive of the functionality is provides, > > especially if the header file is going to be dumped in > > include/linux/. I don't really have a good suggestion at the moment, > > though. > > The obvious ones are stat and attr, which are taken already. Info is > probably too generic as well. > > Ideas are welcome. I was thinking of "properties". > > > > > .... > > > > > + > > > +enum { > > > + VAL_MNT_INFO, > > > +}; > > > + > > > +static struct val_desc val_mnt_group[] = { > > > + { VD_NAME("info"), .idx = VAL_MNT_INFO }, > > > + { } > > > +}; > > .... > > > + > > > + > > > +static struct val_desc val_toplevel_group[] = { > > > + { VD_NAME("mnt:"), .get = val_mnt_get, }, > > > + { VD_NAME("mntns:"), .get = val_mntns_get, }, > > > + { }, > > > +}; > > > > I know this is an early POC, my main question is how do you > > envisiage this table driven structure being extended down from just > > the mount into the filesystem so we can expose filesystem specific > > information that isn't covered by generic interfaces like statx? > > I was thinking of adding a i_op callback. The details are a bit > fuzzy, since the vfs and the fs would have to work together when > listing the attributes and possibly also when retrieving the attribute > itself (think mount options). > No, please do not think mount options :) Please think of an interface that does not mix vfs and fs properties. Sure, with mount(2) you can mix fs and vfs options, but with the new interface they should be clearly separated for set and get if possible. ":mnt:info" key to get the effective mount options (as in /proc/$$/mountinfo) does not contradict that, because it can be read only. Thanks, Amir.
On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > Examples: > > $ getfattr -etext -n ":" . > # file: . > :="mnt:\000mntns:" > > $ getfattr -etext -n ":mnt:" . > # file: . > :mnt:="info" > > $ getfattr -etext -n ":mnt:info" . > # file: . > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" Is there a way how to get mountinfo-like entry by mount ID for some sub-tree? Something like: getfattr -etext -n ":mnt:info:21" / The interface has to be consistent with some notification system (I see your question about fsnotify/fanotify at linux-fsdevel) and mount ID seems better than paths due to over-mounts, etc. > $ getfattr -etext -n ":mntns:" . > # file: . > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" > > $ getfattr -etext -n ":mntns:28:" . > # file: . > :mntns:28:="info" Karel
On Thu, 5 May 2022 at 14:30, Karel Zak <kzak@redhat.com> wrote: > Is there a way how to get mountinfo-like entry by mount ID for some > sub-tree? Something like: > > getfattr -etext -n ":mnt:info:21" / Yes: getfattr -etext -n ":mntns:21:info" / > > The interface has to be consistent with some notification system (I > see your question about fsnotify/fanotify at linux-fsdevel) and mount > ID seems better than paths due to over-mounts, etc. Right. If the mount notification doesn't fit into fsnotify well, the original patch from David could be used. I think that part was uncontroversial. Thanks, Miklos
On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > > : - root > bar - an attribute > foo: - a folder (can contain attributes and/or folders) > > The contents of a folder is represented by a null separated list of names. > > Examples: > > $ getfattr -etext -n ":" . > # file: . > :="mnt:\000mntns:" In your example, does it matter what "." is? It looks like in some cases, it makes no difference at all, and in other cases, like this, '.' *does* matter: > $ getfattr -etext -n ":mnt:info" . > # file: . > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" Is that right? > $ getfattr -etext -n ":mntns:" . > # file: . > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" What is this returning? All possible mount name spaces? Or all of the mount spaces where '.' happens to exist? Also, using the null character means that we can't really use shell scripts calling getfattr. I understand that the problem is that in some cases, you might want to return a pathname, and NULL is the only character which is guaranteed not to show up in a pathname. However, it makes parsing the returned value in a shell script exciting. - Ted
On Fri, May 6, 2022 at 2:38 AM tytso <tytso@mit.edu> wrote: > > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > > > > : - root > > bar - an attribute > > foo: - a folder (can contain attributes and/or folders) > > > > The contents of a folder is represented by a null separated list of names. > > > > Examples: > > > > $ getfattr -etext -n ":" . > > # file: . > > :="mnt:\000mntns:" > > In your example, does it matter what "." is? It looks like in some > cases, it makes no difference at all, and in other cases, like this, > '.' *does* matter: It does. If "." was a directory in /proc/ or in ext4 it might have had more entries. > > > $ getfattr -etext -n ":mnt:info" . > > # file: . > > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" > > Is that right? > > > $ getfattr -etext -n ":mntns:" . > > # file: . > > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" > > What is this returning? All possible mount name spaces? Or all of > the mount spaces where '.' happens to exist? This confused me too. It is not returning the mount namespaces, it is returning all the mount ids in the mount namespace of ".". ":mntns:mounts:" might have been a better choice of key. Thanks, Amir. > > Also, using the null character means that we can't really use shell > scripts calling getfattr. I understand that the problem is that in > some cases, you might want to return a pathname, and NULL is the only > character which is guaranteed not to show up in a pathname. However, > it makes parsing the returned value in a shell script exciting. > > - Ted
On Thu, May 05, 2022 at 04:38:12PM -0700, tytso wrote: > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > > > > : - root > > bar - an attribute > > foo: - a folder (can contain attributes and/or folders) > > > > The contents of a folder is represented by a null separated list of names. > > > > Examples: > > > > $ getfattr -etext -n ":" . > > # file: . > > :="mnt:\000mntns:" > > In your example, does it matter what "." is? It looks like in some > cases, it makes no difference at all, and in other cases, like this, > '.' *does* matter: > > > $ getfattr -etext -n ":mnt:info" . > > # file: . > > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" > > Is that right? > > > $ getfattr -etext -n ":mntns:" . > > # file: . > > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" > > What is this returning? All possible mount name spaces? Or all of > the mount spaces where '.' happens to exist? > > Also, using the null character means that we can't really use shell > scripts calling getfattr. Yeah, it should be returning an attr per namespace, not an attr whose value contains all the valid namespaces. i.e. if the next level of the heirachy is 21, 22, 24, .... we should be seeing a listing of multiple attributes with naming like: :mntns:21: :mntns:22: :mntns:24: .... rather than an attribute whose value contains the names of the attrbiutes in the next layer of the heirarchy. Then we can just pull the namespace we want and feed it directly to: $ getfattr -n ":mntns:21:" and we get a list of all the attributes available for that namespace... > I understand that the problem is that in > some cases, you might want to return a pathname, and NULL is the only > character which is guaranteed not to show up in a pathname. However, > it makes parsing the returned value in a shell script exciting. We shouldn't be returning the names of children in an attribute value. We have a syscall API for doing this that - listxattr() will iterate attribute names just like a directory does with readdir() via listxattr(). IOWs, we should not need to encode the next layer of child attribute names into the value of the parent attribute - if we do a listxattr on a parent that has children, return the list of child names as individual attribute names.... (Yes, I know listxattr null separates the attribute names - it's a godawful kernel API - but that's not the programmatic interface we should expose at the shell script level.) Cheers, Dave.
On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > This is a simplification of the getvalues(2) prototype and moving it to the > getxattr(2) interface, as suggested by Dave. > > The patch itself just adds the possibility to retrieve a single line of > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo > patchset grew out of). > > But this should be able to serve Amir's per-sb iostats, as well as a host of > other cases where some statistic needs to be retrieved from some object. Note: > a filesystem object often represents other kinds of objects (such as processes > in /proc) so this is not limited to fs attributes. > > This also opens up the interface to setting attributes via setxattr(2). > > After some pondering I made the namespace so: > > : - root > bar - an attribute > foo: - a folder (can contain attributes and/or folders) > > The contents of a folder is represented by a null separated list of names. > > Examples: > > $ getfattr -etext -n ":" . > # file: . > :="mnt:\000mntns:" > > $ getfattr -etext -n ":mnt:" . > # file: . > :mnt:="info" > > $ getfattr -etext -n ":mnt:info" . > # file: . > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" Hey Miklos, One comment about this. We really need to have this interface support giving us mount options like "relatime" back in numeric form (I assume this will be possible.). It is royally annoying having to maintain a mapping table in userspace just to do: relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME ro -> MS_RDONLY/MOUNT_ATTR_RDONLY A library shouldn't be required to use this interface. Conservative low-level software that keeps its shared library dependencies minimal will need to be able to use that interface without having to go to an external library that transforms text-based output to binary form (Which I'm very sure will need to happen if we go with a text-based interface.). > > $ getfattr -etext -n ":mntns:" . > # file: . > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" > > $ getfattr -etext -n ":mntns:28:" . > # file: . > :mntns:28:="info" > > Comments? I'm not a fan of text-based APIs and I'm particularly not a fan of the xattr APIs. But at this point I'm ready to compromise on a lot as long as it gets us values out of the kernel in some way. :) I had to use xattrs extensively in various low-level userspace projects and they continue to be a source of races and memory bugs. A few initial questions: * The xattr APIs often require the caller to do sm like (copying some go code quickly as I have that lying around): for _, x := range split { xattr := string(x) // Call Getxattr() twice: First, to determine the size of the // buffer we need to allocate to store the extended attributes, // second, to actually store the extended attributes in the // buffer. Also, check if the size of the extended attribute // hasn't increased between the two calls. pre, err = unix.Getxattr(path, xattr, nil) if err != nil || pre < 0 { return nil, err } dest = make([]byte, pre) post := 0 if pre > 0 { post, err = unix.Getxattr(path, xattr, dest) if err != nil || post < 0 { return nil, err } } if post > pre { return nil, fmt.Errorf("Extended attribute '%s' size increased from %d to %d during retrieval", xattr, pre, post) } xattrs[xattr] = string(dest) } This pattern of requesting the size first by passing empty arguments, then allocating the buffer and then passing down that buffer to retrieve that value is really annoying to use and error prone (I do of course understand why it exists.). For real xattrs it's not that bad because we can assume that these values don't change often and so the race window between getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. But fwiw, the post > pre check doesn't exist for no reason; we do indeed hit that race. In addition, it is costly having to call getxattr() twice. Again, for retrieving xattrs it often doesn't matter because it's not a super common operation but for mount and other info it might matter. Will we have to use the same pattern for mnt and other info as well? If so, I worry that the race is way more likely than it is for real xattrs. * Would it be possible to support binary output with this interface? I really think users would love to have an interfact where they can get a struct with binary info back. I'm not advocating to make the whole interface binary but I wouldn't mind having the option to support it. Especially for some information at least. I'd really love to have a way go get a struct mount_info or whatever back that gives me all the details about a mount encompassed in a single struct. Callers like systemd will have to parse text and will end up converting everything from text into binary anyway; especially for mount information. So giving them an option for this out of the box would be quite good. Interfaces like statx aim to be as fast as possible because we exptect them to be called quite often. Retrieving mount info is quite costly and is done quite often as well. Maybe not for all software but for a lot of low-level software. Especially when starting services in systemd a lot of mount parsing happens similar when starting containers in runtimes. * If we decide to go forward with this interface - and I think I mentioned this in the lsfmm session - could we please at least add a new system call? It really feels wrong to retrieve mount and other information through the xattr interfaces. They aren't really xattrs. Imho, xattrs are a bit like a wonky version of streams already (One of the reasons I find them quite unpleasant.). Making mount and other information retrievable directly through the getxattr() interface will turn them into a full-on streams implementation imho. I'd prefer not to do that (Which is another reason I'd prefer at least a separate system call.).
On Mon, May 9, 2022 at 3:48 PM Christian Brauner <brauner@kernel.org> wrote: > > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > > This is a simplification of the getvalues(2) prototype and moving it to the > > getxattr(2) interface, as suggested by Dave. > > > > The patch itself just adds the possibility to retrieve a single line of > > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo > > patchset grew out of). > > > > But this should be able to serve Amir's per-sb iostats, as well as a host of > > other cases where some statistic needs to be retrieved from some object. Note: > > a filesystem object often represents other kinds of objects (such as processes > > in /proc) so this is not limited to fs attributes. > > > > This also opens up the interface to setting attributes via setxattr(2). > > > > After some pondering I made the namespace so: > > > > : - root > > bar - an attribute > > foo: - a folder (can contain attributes and/or folders) > > > > The contents of a folder is represented by a null separated list of names. > > > > Examples: > > > > $ getfattr -etext -n ":" . > > # file: . > > :="mnt:\000mntns:" > > > > $ getfattr -etext -n ":mnt:" . > > # file: . > > :mnt:="info" > > > > $ getfattr -etext -n ":mnt:info" . > > # file: . > > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" > > Hey Miklos, > > One comment about this. We really need to have this interface support > giving us mount options like "relatime" back in numeric form (I assume > this will be possible.). It is royally annoying having to maintain a > mapping table in userspace just to do: > > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME > ro -> MS_RDONLY/MOUNT_ATTR_RDONLY > > A library shouldn't be required to use this interface. Conservative > low-level software that keeps its shared library dependencies minimal > will need to be able to use that interface without having to go to an > external library that transforms text-based output to binary form (Which > I'm very sure will need to happen if we go with a text-based > interface.). > No need for a library. We can export: :mnt:attr:flags (in hex format) > > > > $ getfattr -etext -n ":mntns:" . > > # file: . > > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" > > > > $ getfattr -etext -n ":mntns:28:" . > > # file: . > > :mntns:28:="info" > > > > Comments? > > I'm not a fan of text-based APIs and I'm particularly not a fan of the > xattr APIs. But at this point I'm ready to compromise on a lot as long > as it gets us values out of the kernel in some way. :) > > I had to use xattrs extensively in various low-level userspace projects > and they continue to be a source of races and memory bugs. > > A few initial questions: > > * The xattr APIs often require the caller to do sm like (copying some go > code quickly as I have that lying around): > > for _, x := range split { > xattr := string(x) > // Call Getxattr() twice: First, to determine the size of the > // buffer we need to allocate to store the extended attributes, > // second, to actually store the extended attributes in the > // buffer. Also, check if the size of the extended attribute > // hasn't increased between the two calls. > pre, err = unix.Getxattr(path, xattr, nil) > if err != nil || pre < 0 { > return nil, err > } > > dest = make([]byte, pre) > post := 0 > if pre > 0 { > post, err = unix.Getxattr(path, xattr, dest) > if err != nil || post < 0 { > return nil, err > } > } > > if post > pre { > return nil, fmt.Errorf("Extended attribute '%s' size increased from %d to %d during retrieval", xattr, pre, post) > } > > xattrs[xattr] = string(dest) > } > > This pattern of requesting the size first by passing empty arguments, > then allocating the buffer and then passing down that buffer to > retrieve that value is really annoying to use and error prone (I do > of course understand why it exists.). > > For real xattrs it's not that bad because we can assume that these > values don't change often and so the race window between > getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. But > fwiw, the post > pre check doesn't exist for no reason; we do indeed > hit that race. It is not really a race, you can do {} while (errno != ERANGE) and there will be no race. > > In addition, it is costly having to call getxattr() twice. Again, for > retrieving xattrs it often doesn't matter because it's not a super > common operation but for mount and other info it might matter. > samba and many other projects that care about efficiency solved this a long time ago with an opportunistic buffer - never start with NULL buffer most values will fit in a 1K buffer. > Will we have to use the same pattern for mnt and other info as well? > If so, I worry that the race is way more likely than it is for real > xattrs. > > * Would it be possible to support binary output with this interface? > I really think users would love to have an interfact where they can > get a struct with binary info back. I'm not advocating to make the > whole interface binary but I wouldn't mind having the option to > support it. > Especially for some information at least. I'd really love to have a > way go get a struct mount_info or whatever back that gives me all the > details about a mount encompassed in a single struct. > I suggested that up thread and Greg has explicitly and loudly NACKed it - so you will have to take it up with him > Callers like systemd will have to parse text and will end up > converting everything from text into binary anyway; especially for > mount information. So giving them an option for this out of the box > would be quite good. > > Interfaces like statx aim to be as fast as possible because we exptect > them to be called quite often. Retrieving mount info is quite costly > and is done quite often as well. Maybe not for all software but for a > lot of low-level software. Especially when starting services in > systemd a lot of mount parsing happens similar when starting > containers in runtimes. > This API is not for *everything*. Obviously it does not replace statx and some info (like the cifs OFFLINE flag) should be added to statx. Whether or not mount info needs to get a special treatment like statx is not proven. Miklos claims this is a notification issue- With David Howells' mount notification API, systemd can be pointed at the new mount that was added/removed/changed and then systemd will rarely need to parse thousands of mounts info. > * If we decide to go forward with this interface - and I think I > mentioned this in the lsfmm session - could we please at least add a > new system call? It really feels wrong to retrieve mount and other > information through the xattr interfaces. They aren't really xattrs. > > Imho, xattrs are a bit like a wonky version of streams already (One of > the reasons I find them quite unpleasant.). Making mount and other > information retrievable directly through the getxattr() interface will > turn them into a full-on streams implementation imho. I'd prefer not > to do that (Which is another reason I'd prefer at least a separate > system call.). If you are thinking about a read() like interface for xattr or any alternative data stream then Linus has NACKed it times before. However, we could add getxattr_multi() as Dave Chinner suggested for enumerating multiple keys+ (optional) values. In contrast to listxattr(), getxattr_multi() could allow a "short read" at least w.r.t the size of the vector. Thanks, Amir.
On Mon, May 09, 2022 at 05:20:50PM +0300, Amir Goldstein wrote: > On Mon, May 9, 2022 at 3:48 PM Christian Brauner <brauner@kernel.org> wrote: > > > > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > > > This is a simplification of the getvalues(2) prototype and moving it to the > > > getxattr(2) interface, as suggested by Dave. > > > > > > The patch itself just adds the possibility to retrieve a single line of > > > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo > > > patchset grew out of). > > > > > > But this should be able to serve Amir's per-sb iostats, as well as a host of > > > other cases where some statistic needs to be retrieved from some object. Note: > > > a filesystem object often represents other kinds of objects (such as processes > > > in /proc) so this is not limited to fs attributes. > > > > > > This also opens up the interface to setting attributes via setxattr(2). > > > > > > After some pondering I made the namespace so: > > > > > > : - root > > > bar - an attribute > > > foo: - a folder (can contain attributes and/or folders) > > > > > > The contents of a folder is represented by a null separated list of names. > > > > > > Examples: > > > > > > $ getfattr -etext -n ":" . > > > # file: . > > > :="mnt:\000mntns:" > > > > > > $ getfattr -etext -n ":mnt:" . > > > # file: . > > > :mnt:="info" > > > > > > $ getfattr -etext -n ":mnt:info" . > > > # file: . > > > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" > > > > Hey Miklos, > > > > One comment about this. We really need to have this interface support > > giving us mount options like "relatime" back in numeric form (I assume > > this will be possible.). It is royally annoying having to maintain a > > mapping table in userspace just to do: > > > > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME > > ro -> MS_RDONLY/MOUNT_ATTR_RDONLY > > > > A library shouldn't be required to use this interface. Conservative > > low-level software that keeps its shared library dependencies minimal > > will need to be able to use that interface without having to go to an > > external library that transforms text-based output to binary form (Which > > I'm very sure will need to happen if we go with a text-based > > interface.). > > > > No need for a library. > We can export: > > :mnt:attr:flags (in hex format) So a binary attribute or a hex value as a string which we have to parse in userspace into proper hex? > > > > > > > $ getfattr -etext -n ":mntns:" . > > > # file: . > > > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" > > > > > > $ getfattr -etext -n ":mntns:28:" . > > > # file: . > > > :mntns:28:="info" > > > > > > Comments? > > > > I'm not a fan of text-based APIs and I'm particularly not a fan of the > > xattr APIs. But at this point I'm ready to compromise on a lot as long > > as it gets us values out of the kernel in some way. :) > > > > I had to use xattrs extensively in various low-level userspace projects > > and they continue to be a source of races and memory bugs. > > > > A few initial questions: > > > > * The xattr APIs often require the caller to do sm like (copying some go > > code quickly as I have that lying around): > > > > for _, x := range split { > > xattr := string(x) > > // Call Getxattr() twice: First, to determine the size of the > > // buffer we need to allocate to store the extended attributes, > > // second, to actually store the extended attributes in the > > // buffer. Also, check if the size of the extended attribute > > // hasn't increased between the two calls. > > pre, err = unix.Getxattr(path, xattr, nil) > > if err != nil || pre < 0 { > > return nil, err > > } > > > > dest = make([]byte, pre) > > post := 0 > > if pre > 0 { > > post, err = unix.Getxattr(path, xattr, dest) > > if err != nil || post < 0 { > > return nil, err > > } > > } > > > > if post > pre { > > return nil, fmt.Errorf("Extended attribute '%s' size increased from %d to %d during retrieval", xattr, pre, post) > > } > > > > xattrs[xattr] = string(dest) > > } > > > > This pattern of requesting the size first by passing empty arguments, > > then allocating the buffer and then passing down that buffer to > > retrieve that value is really annoying to use and error prone (I do > > of course understand why it exists.). > > > > For real xattrs it's not that bad because we can assume that these > > values don't change often and so the race window between > > getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. But > > fwiw, the post > pre check doesn't exist for no reason; we do indeed > > hit that race. > > It is not really a race, you can do {} while (errno != ERANGE) and there > will be no race. I don't know what your definition of your race is but if a value can change between two calls and I have to call it in a loop until I get a consistent value then that's a race. At least I know of no better way of calling it. > > > > > In addition, it is costly having to call getxattr() twice. Again, for > > retrieving xattrs it often doesn't matter because it's not a super > > common operation but for mount and other info it might matter. > > > > samba and many other projects that care about efficiency solved this > a long time ago with an opportunistic buffer - never start with NULL buffer > most values will fit in a 1K buffer. I'm glad that it's been solved a long time ago. It's still not a good property for an interface. > > > Will we have to use the same pattern for mnt and other info as well? > > If so, I worry that the race is way more likely than it is for real > > xattrs. > > > > * Would it be possible to support binary output with this interface? > > I really think users would love to have an interfact where they can > > get a struct with binary info back. I'm not advocating to make the > > whole interface binary but I wouldn't mind having the option to > > support it. > > Especially for some information at least. I'd really love to have a > > way go get a struct mount_info or whatever back that gives me all the > > details about a mount encompassed in a single struct. > > > > I suggested that up thread and Greg has explicitly and loudly > NACKed it - so you will have to take it up with him This is a vfs API and ultimately I would think that if we agree as a subsystem that it would be desirable to have a way of providing binary output in the form of well-defined structs in some form then we are free to do so. > > > Callers like systemd will have to parse text and will end up > > converting everything from text into binary anyway; especially for > > mount information. So giving them an option for this out of the box > > would be quite good. > > > > Interfaces like statx aim to be as fast as possible because we exptect > > them to be called quite often. Retrieving mount info is quite costly > > and is done quite often as well. Maybe not for all software but for a > > lot of low-level software. Especially when starting services in > > systemd a lot of mount parsing happens similar when starting > > containers in runtimes. > > > > This API is not for *everything*. Obviously it does not replace statx and some > info (like the cifs OFFLINE flag) should be added to statx. I'm not sure why you bring up this API as a replacement for statx(). That was never part of the discussion. And I didn't think I gave the impression I was saying this. This is about mount information and you can't get a lot of meaningful mount information from statx(). Though tbh, I really think a mountx() or similar system call wouldn't hurt... > Whether or not mount info needs to get a special treatment like statx > is not proven. Hm, if we take that argument I could also claim that whether or not mount info needs to be given in textual form is not proven. Iow, I'm not sure what this is an argument for or against. In fact, most well-used vfs information providing APIs apart from a few such as the xattr APIs are based on well-defined structs. And I personally at least consider that to be a good thing. But I'm willing to compromise and support the textual thing. But I'd still like to have the possibility to have some information provided in binary form. I don't think that needs to be off the table completely. > Miklos claims this is a notification issue- > With David Howells' mount notification API, systemd can be pointed at the new > mount that was added/removed/changed and then systemd will rarely need to > parse thousands of mounts info. You seem to be replying to things I didn't say. :) The notification issue is orthogonal to that and yes, we need that. I'm just saying that I want the ability in principal for some properties to be given in binary form in addition to textual form. Performance may be an aspect of this but that's orthogonal to performance issues when being notified about mount changes and reacting to it in e.g. a service. > > > * If we decide to go forward with this interface - and I think I > > mentioned this in the lsfmm session - could we please at least add a > > new system call? It really feels wrong to retrieve mount and other > > information through the xattr interfaces. They aren't really xattrs. > > > > Imho, xattrs are a bit like a wonky version of streams already (One of > > the reasons I find them quite unpleasant.). Making mount and other > > information retrievable directly through the getxattr() interface will > > turn them into a full-on streams implementation imho. I'd prefer not > > to do that (Which is another reason I'd prefer at least a separate > > system call.). > > If you are thinking about a read() like interface for xattr or any alternative > data stream then Linus has NACKed it times before. Please don't just make up an argument for me and then counter it. :D > > However, we could add getxattr_multi() as Dave Chinner suggested for > enumerating multiple keys+ (optional) values. > In contrast to listxattr(), getxattr_multi() could allow a "short read" > at least w.r.t the size of the vector. Having "xattr" in the system call name is just confusing. These are fundamentally not "real" xattrs and we shouldn't mix semantics. There should be a clear distinction between traditional xattrs and this vfs and potentially fs information providing interface. Just thinking about what the manpage would look like. We would need to add a paragraph to xattr(7) explaining that in addition to the system.*, security.*, user.* and other namespaces we now also have a set of namespaces that function as ways to get information about mounts or other things instead of information attached to specific inodes. That's super random imho. If I were to be presented with this manpage I'd wonder if someone was too lazy to add a proper new system call with it's own semantics for this and just stuffed it into an existing API because it provided matching system call arguments. We can add a new system call. It's not that we're running out of them. Fwiw, and I'm genuinly _not_ trolling you could call it: fsinfo(int dfd, const char *path, const char *key, char *value);
On Mon, May 9, 2022 at 6:09 PM Christian Brauner <brauner@kernel.org> wrote: > > On Mon, May 09, 2022 at 05:20:50PM +0300, Amir Goldstein wrote: > > On Mon, May 9, 2022 at 3:48 PM Christian Brauner <brauner@kernel.org> wrote: > > > > > > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > > > > This is a simplification of the getvalues(2) prototype and moving it to the > > > > getxattr(2) interface, as suggested by Dave. > > > > > > > > The patch itself just adds the possibility to retrieve a single line of > > > > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo > > > > patchset grew out of). > > > > > > > > But this should be able to serve Amir's per-sb iostats, as well as a host of > > > > other cases where some statistic needs to be retrieved from some object. Note: > > > > a filesystem object often represents other kinds of objects (such as processes > > > > in /proc) so this is not limited to fs attributes. > > > > > > > > This also opens up the interface to setting attributes via setxattr(2). > > > > > > > > After some pondering I made the namespace so: > > > > > > > > : - root > > > > bar - an attribute > > > > foo: - a folder (can contain attributes and/or folders) > > > > > > > > The contents of a folder is represented by a null separated list of names. > > > > > > > > Examples: > > > > > > > > $ getfattr -etext -n ":" . > > > > # file: . > > > > :="mnt:\000mntns:" > > > > > > > > $ getfattr -etext -n ":mnt:" . > > > > # file: . > > > > :mnt:="info" > > > > > > > > $ getfattr -etext -n ":mnt:info" . > > > > # file: . > > > > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" > > > > > > Hey Miklos, > > > > > > One comment about this. We really need to have this interface support > > > giving us mount options like "relatime" back in numeric form (I assume > > > this will be possible.). It is royally annoying having to maintain a > > > mapping table in userspace just to do: > > > > > > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME > > > ro -> MS_RDONLY/MOUNT_ATTR_RDONLY > > > > > > A library shouldn't be required to use this interface. Conservative > > > low-level software that keeps its shared library dependencies minimal > > > will need to be able to use that interface without having to go to an > > > external library that transforms text-based output to binary form (Which > > > I'm very sure will need to happen if we go with a text-based > > > interface.). > > > > > > > No need for a library. > > We can export: > > > > :mnt:attr:flags (in hex format) > > So a binary attribute or a hex value as a string which we have to parse > in userspace into proper hex? > I do see the ugliness in that. I personally have no objection to binary values in the leaves of the tree. I don't the main concern that Greg raised was against binary structs, which end up being an ABI compatibility nightmare. > > > > > > > > > > $ getfattr -etext -n ":mntns:" . > > > > # file: . > > > > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" > > > > > > > > $ getfattr -etext -n ":mntns:28:" . > > > > # file: . > > > > :mntns:28:="info" > > > > > > > > Comments? > > > > > > I'm not a fan of text-based APIs and I'm particularly not a fan of the > > > xattr APIs. But at this point I'm ready to compromise on a lot as long > > > as it gets us values out of the kernel in some way. :) > > > > > > I had to use xattrs extensively in various low-level userspace projects > > > and they continue to be a source of races and memory bugs. > > > > > > A few initial questions: > > > > > > * The xattr APIs often require the caller to do sm like (copying some go > > > code quickly as I have that lying around): > > > > > > for _, x := range split { > > > xattr := string(x) > > > // Call Getxattr() twice: First, to determine the size of the > > > // buffer we need to allocate to store the extended attributes, > > > // second, to actually store the extended attributes in the > > > // buffer. Also, check if the size of the extended attribute > > > // hasn't increased between the two calls. > > > pre, err = unix.Getxattr(path, xattr, nil) > > > if err != nil || pre < 0 { > > > return nil, err > > > } > > > > > > dest = make([]byte, pre) > > > post := 0 > > > if pre > 0 { > > > post, err = unix.Getxattr(path, xattr, dest) > > > if err != nil || post < 0 { > > > return nil, err > > > } > > > } > > > > > > if post > pre { > > > return nil, fmt.Errorf("Extended attribute '%s' size increased from %d to %d during retrieval", xattr, pre, post) > > > } > > > > > > xattrs[xattr] = string(dest) > > > } > > > > > > This pattern of requesting the size first by passing empty arguments, > > > then allocating the buffer and then passing down that buffer to > > > retrieve that value is really annoying to use and error prone (I do > > > of course understand why it exists.). > > > > > > For real xattrs it's not that bad because we can assume that these > > > values don't change often and so the race window between > > > getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. But > > > fwiw, the post > pre check doesn't exist for no reason; we do indeed > > > hit that race. > > > > It is not really a race, you can do {} while (errno != ERANGE) and there > > will be no race. > > I don't know what your definition of your race is but if a value can > change between two calls and I have to call it in a loop until I get a > consistent value then that's a race. At least I know of no better way of > calling it. > For enumerating multiple keys we need a "short read", probably offset too. For leaf values that is less likely to be needed - I hope. > > > > > > > > In addition, it is costly having to call getxattr() twice. Again, for > > > retrieving xattrs it often doesn't matter because it's not a super > > > common operation but for mount and other info it might matter. > > > > > > > samba and many other projects that care about efficiency solved this > > a long time ago with an opportunistic buffer - never start with NULL buffer > > most values will fit in a 1K buffer. > > I'm glad that it's been solved a long time ago. It's still not a good > property for an interface. > All right. All I am saying is that the argument that calling getxattr() twice is costly is not relevant most of the time. > > > > > Will we have to use the same pattern for mnt and other info as well? > > > If so, I worry that the race is way more likely than it is for real > > > xattrs. > > > > > > * Would it be possible to support binary output with this interface? > > > I really think users would love to have an interfact where they can > > > get a struct with binary info back. I'm not advocating to make the > > > whole interface binary but I wouldn't mind having the option to > > > support it. > > > Especially for some information at least. I'd really love to have a > > > way go get a struct mount_info or whatever back that gives me all the > > > details about a mount encompassed in a single struct. > > > > > > > I suggested that up thread and Greg has explicitly and loudly > > NACKed it - so you will have to take it up with him > > This is a vfs API and ultimately I would think that if we agree as a > subsystem that it would be desirable to have a way of providing binary > output in the form of well-defined structs in some form then we are free > to do so. > Perhaps the way to reconcile between the ABI issue and avoid userspace parsing is if we export both. As long as an exported binary blob is also self described as individual keys, then userspace is both able to understand how to parse the blob and does not need to parse text in every read of values. > > > > > Callers like systemd will have to parse text and will end up > > > converting everything from text into binary anyway; especially for > > > mount information. So giving them an option for this out of the box > > > would be quite good. > > > > > > Interfaces like statx aim to be as fast as possible because we exptect > > > them to be called quite often. Retrieving mount info is quite costly > > > and is done quite often as well. Maybe not for all software but for a > > > lot of low-level software. Especially when starting services in > > > systemd a lot of mount parsing happens similar when starting > > > containers in runtimes. > > > > > > > This API is not for *everything*. Obviously it does not replace statx and some > > info (like the cifs OFFLINE flag) should be added to statx. > > I'm not sure why you bring up this API as a replacement for statx(). > That was never part of the discussion. And I didn't think I gave the > impression I was saying this. > > This is about mount information and you can't get a lot of meaningful > mount information from statx(). Though tbh, I really think a mountx() or > similar system call wouldn't hurt... That was my point. This API solves problems - it may not solve all the problems. If mountx() is needed for some performant application then I think this should be discussed separately. > > > Whether or not mount info needs to get a special treatment like statx > > is not proven. > > Hm, if we take that argument I could also claim that whether or not > mount info needs to be given in textual form is not proven. Iow, I'm not > sure what this is an argument for or against. > > In fact, most well-used vfs information providing APIs apart from a few > such as the xattr APIs are based on well-defined structs. And I > personally at least consider that to be a good thing. > It is good for well known information structures. We want to have something that is flexible and easy to extend including with fs specific info. > But I'm willing to compromise and support the textual thing. But I'd > still like to have the possibility to have some information provided in > binary form. I don't think that needs to be off the table completely. > I am not opposed. > > Miklos claims this is a notification issue- > > With David Howells' mount notification API, systemd can be pointed at the new > > mount that was added/removed/changed and then systemd will rarely need to > > parse thousands of mounts info. > > You seem to be replying to things I didn't say. :) > Sorry. I guess I did not understand why there needs to be a lot of parsing of mounts when starting systemd services, but I am not a systemd export. > The notification issue is orthogonal to that and yes, we need that. > I'm just saying that I want the ability in principal for some properties > to be given in binary form in addition to textual form. Performance may > be an aspect of this but that's orthogonal to performance issues when > being notified about mount changes and reacting to it in e.g. a service. > > > > > > * If we decide to go forward with this interface - and I think I > > > mentioned this in the lsfmm session - could we please at least add a > > > new system call? It really feels wrong to retrieve mount and other > > > information through the xattr interfaces. They aren't really xattrs. > > > > > > Imho, xattrs are a bit like a wonky version of streams already (One of > > > the reasons I find them quite unpleasant.). Making mount and other > > > information retrievable directly through the getxattr() interface will > > > turn them into a full-on streams implementation imho. I'd prefer not > > > to do that (Which is another reason I'd prefer at least a separate > > > system call.). > > > > If you are thinking about a read() like interface for xattr or any alternative > > data stream then Linus has NACKed it times before. > > Please don't just make up an argument for me and then counter it. :D > Sorry. should have asked how you envision the new syscall instead of assuming. > > > > However, we could add getxattr_multi() as Dave Chinner suggested for > > enumerating multiple keys+ (optional) values. > > In contrast to listxattr(), getxattr_multi() could allow a "short read" > > at least w.r.t the size of the vector. > > Having "xattr" in the system call name is just confusing. These are > fundamentally not "real" xattrs and we shouldn't mix semantics. There > should be a clear distinction between traditional xattrs and this vfs > and potentially fs information providing interface. > I know I am not the only one to disagree with that claim. There is nothing "real" about traditional xattrs in my eyes. It's just a key/value API. The name xattr is just as arbitrary as statx and mountx in my eyes. But I won't shed a tear if people decide to call this something else. > Just thinking about what the manpage would look like. We would need to > add a paragraph to xattr(7) explaining that in addition to the system.*, > security.*, user.* and other namespaces we now also have a set of > namespaces that function as ways to get information about mounts or > other things instead of information attached to specific inodes. > > That's super random imho. If I were to be presented with this manpage > I'd wonder if someone was too lazy to add a proper new system call with > it's own semantics for this and just stuffed it into an existing API > because it provided matching system call arguments. We can add a new > system call. It's not that we're running out of them. > > Fwiw, and I'm genuinly _not_ trolling you could call it: > fsinfo(int dfd, const char *path, const char *key, char *value); And I genuinely don't care how to call this, but considering that we will need the get/set/list variants my preference would be (as I said in LSFMM - not sure if people heard me): getxattrat(dirfd, path, key, value, AT_METAVERSE) ;-) setxattrat()/listattrat() This way we can avert the collision with fs that agree to store ":mnt:info" Thanks, Amir.
On Mon, May 09, 2022 at 05:08:56PM +0200, Christian Brauner wrote: [..] > Having "xattr" in the system call name is just confusing. These are > fundamentally not "real" xattrs and we shouldn't mix semantics. There > should be a clear distinction between traditional xattrs and this vfs > and potentially fs information providing interface. > > Just thinking about what the manpage would look like. We would need to > add a paragraph to xattr(7) explaining that in addition to the system.*, > security.*, user.* and other namespaces we now also have a set of > namespaces that function as ways to get information about mounts or > other things instead of information attached to specific inodes. > > That's super random imho. If I were to be presented with this manpage > I'd wonder if someone was too lazy to add a proper new system call with > it's own semantics for this and just stuffed it into an existing API > because it provided matching system call arguments. We can add a new > system call. It's not that we're running out of them. FWIW, I also felt that using xattr API to get some sort of mount info felt very non-intutive. Thanks Vivek
On Mon, May 09, 2022 at 02:48:15PM +0200, Christian Brauner wrote: > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > > This is a simplification of the getvalues(2) prototype and moving it to the > > getxattr(2) interface, as suggested by Dave. > > > > The patch itself just adds the possibility to retrieve a single line of > > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo > > patchset grew out of). > > > > But this should be able to serve Amir's per-sb iostats, as well as a host of > > other cases where some statistic needs to be retrieved from some object. Note: > > a filesystem object often represents other kinds of objects (such as processes > > in /proc) so this is not limited to fs attributes. > > > > This also opens up the interface to setting attributes via setxattr(2). > > > > After some pondering I made the namespace so: > > > > : - root > > bar - an attribute > > foo: - a folder (can contain attributes and/or folders) > > > > The contents of a folder is represented by a null separated list of names. > > > > Examples: > > > > $ getfattr -etext -n ":" . > > # file: . > > :="mnt:\000mntns:" > > > > $ getfattr -etext -n ":mnt:" . > > # file: . > > :mnt:="info" > > > > $ getfattr -etext -n ":mnt:info" . > > # file: . > > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" > > Hey Miklos, > > One comment about this. We really need to have this interface support > giving us mount options like "relatime" back in numeric form (I assume > this will be possible.). It is royally annoying having to maintain a > mapping table in userspace just to do: > > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME > ro -> MS_RDONLY/MOUNT_ATTR_RDONLY You're asking for a complete change of output information there. This has nothing to do with the mechanism for extracting key/value information from the kernel. i.e. we need to separate demands for "data I want" from "mechanism to extract data". > > $ getfattr -etext -n ":mntns:" . > > # file: . > > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" > > > > $ getfattr -etext -n ":mntns:28:" . > > # file: . > > :mntns:28:="info" > > > > Comments? > > I'm not a fan of text-based APIs and I'm particularly not a fan of the > xattr APIs. But at this point I'm ready to compromise on a lot as long > as it gets us values out of the kernel in some way. :) > > I had to use xattrs extensively in various low-level userspace projects > and they continue to be a source of races and memory bugs. > > A few initial questions: > > * The xattr APIs often require the caller to do sm like (copying some go > code quickly as I have that lying around): > > for _, x := range split { > xattr := string(x) > // Call Getxattr() twice: First, to determine the size of the > // buffer we need to allocate to store the extended attributes, > // second, to actually store the extended attributes in the > // buffer. Also, check if the size of the extended attribute > // hasn't increased between the two calls. > pre, err = unix.Getxattr(path, xattr, nil) > if err != nil || pre < 0 { > return nil, err > } > > dest = make([]byte, pre) > post := 0 > if pre > 0 { > post, err = unix.Getxattr(path, xattr, dest) > if err != nil || post < 0 { > return nil, err > } > } > > if post > pre { > return nil, fmt.Errorf("Extended attribute '%s' size increased from %d to %d during retrieval", xattr, pre, post) > } > > xattrs[xattr] = string(dest) > } > > This pattern of requesting the size first by passing empty arguments, > then allocating the buffer and then passing down that buffer to > retrieve that value is really annoying to use and error prone (I do > of course understand why it exists.). You're doing it wrong. When you have an attr extraction loop like this, you allocate a single 64kB buffer for the value, adn then call getattr() with the buffer and a length of 64kB. Then the call returns both the value length and the value in one syscall, and then the application can allocate+copy into an exact sized buffer if it needs to. Then you use the same 64kB buffer for the next getxattr() call. I first saw this pattern in code written in the mid 1990s for Irix, and for basic listxattr/getxattr operations to find and retrieve key/value pairs this is much more efficient than the above. The real problem is that the linux listxattr() syscall only returns names. It's a shit API design at best, especially the part where it cannot iterate over a list larger than a single buffer. If you've got millions of xattrs, listxattr is fucking useless. The man page is actively harmful - that's where everyone learns the double getxattr anit-pattern you've described above. With that in mind, go look at XFS_IOC_ATTRLIST_BY_HANDLE that I've mentioned in previous discussions this topic. Each attr entry returned is one of these: struct xfs_attrlist_ent { /* data from attr_list() */ __u32 a_valuelen; /* number bytes in value of attr */ char a_name[1]; /* attr name (NULL terminated) */ }; It really needs a namelen to make parsing of the output buffer simpler, but this is the model we should be following - listing xattrs is no different from listing directory entries. Indeed, a directory entries is basically just a name/value pair - the name of the dirent and the inode number it points to is the value. Further to that point, the control structure for XFS_IOC_ATTRLIST_BY_HANDLE has a cursor value that works the same way as readdir cookies do. Hence when iterating over xattrs that require multiple syscalls to retrieve, the cursor allows the next list syscall to start off listing exactly where the previous syscall finished. IOWs, what Linux really needs is a listxattr2() syscall that works the same way that getdents/XFS_IOC_ATTRLIST_BY_HANDLE work. With the list function returning value sizes and being able to iterate effectively, every problem that listxattr() causes goes away. And while we are at it, we need to consider a xattr2() syscall to replace getxattr/setxattr. The model for that is XFS_IOC_ATTRMULTI_BY_HANDLE, which allows operations to be performed on mulitple xattrs in a single syscall. e.g. we can do a bulk get, set and remove operation across multiple xattrs - we can even mix and match get/set/remove operations on different xattrs in a single call. > * Would it be possible to support binary output with this interface? The xattr API already supports binary names and values. The only exception is you can't put NULLs in names because APIs use that as a name terminator. if listxattr2() returns a namelen in it's structure, then we could allow fully binary names, too (XFS already fully supports this internally!). In the current API, values are always determined by length, not null termiantions, so they are also already fully binary capable. > I really think users would love to have an interfact where they can > get a struct with binary info back. No. Not for kernel informational interfaces. We have ioctls and syscalls for defining structured binary interfaces that pass non-trivial objects. xattrs are no the place for this - they are key/value object stores like sysfs is supposed to be, so I really don't think we should support encoded binary data in xattrs under this special mount namespace... Solving the "retreive multiple values per syscall" problem is what the bulk get interface (XFS_IOC_ATTRMULTI_BY_HANDLE) is for. > Callers like systemd will have to parse text and will end up > converting everything from text into binary anyway; especially for > mount information. So giving them an option for this out of the box > would be quite good. That boat sailed years ago. You're making the same arguments about binary extraction interfaces being way more efficient than ascii based value-per-file interfaces like proc/sysfs that were made back in the early 2000s. That boat has long sailed - while the current method is somewhat inefficient, it certainly hasn't had all the problems that maintaining binary interfaces over decades has had.... Much as it pains me to say it (because I came to Linux from an OS that exported huge amounts of custom binary structures from the kernel), having all the export interfaces dump human readable information has proved far more flexible and usable than interfaces that required binary parsers to dump information before it could be used on the command line or in scripting languages.... > * If we decide to go forward with this interface - and I think I > mentioned this in the lsfmm session - could we please at least add a > new system call? It really feels wrong to retrieve mount and other > information through the xattr interfaces. They aren't really xattrs. We are trying to expose structured key-value information. That's exactly what the xattr API was orginally created to cater for... > Imho, xattrs are a bit like a wonky version of streams already (One of > the reasons I find them quite unpleasant.). Making mount and other > information retrievable directly through the getxattr() interface will > turn them into a full-on streams implementation imho. I'd prefer not > to do that (Which is another reason I'd prefer at least a separate > system call.). And that's a total misunderstanding of what xattrs are. Alternate data streams are just {file,offset} based data streams accessed via ithe same read/write() mechanisms as the primary data stream. Xattrs provide an *atomic key-value object store API*, not an offset based data stream API. They are completely different beasts, intended for completely different purposes. ADS are redundant when you have directories and files, whilst an atomic key-value store is something completely different. You do realise we have an independent, scalable, ACID compliant key-value object store in every inode in an XFS filesystem, right? Cheers, Dave.
On Mon, 2022-05-09 at 17:42 -0400, Vivek Goyal wrote: > On Mon, May 09, 2022 at 05:08:56PM +0200, Christian Brauner wrote: > > [..] > > Having "xattr" in the system call name is just confusing. These are > > fundamentally not "real" xattrs and we shouldn't mix semantics. > > There > > should be a clear distinction between traditional xattrs and this > > vfs > > and potentially fs information providing interface. > > > > Just thinking about what the manpage would look like. We would need > > to > > add a paragraph to xattr(7) explaining that in addition to the > > system.*, > > security.*, user.* and other namespaces we now also have a set of > > namespaces that function as ways to get information about mounts or > > other things instead of information attached to specific inodes. > > > > That's super random imho. If I were to be presented with this > > manpage > > I'd wonder if someone was too lazy to add a proper new system call > > with > > it's own semantics for this and just stuffed it into an existing > > API > > because it provided matching system call arguments. We can add a > > new > > system call. It's not that we're running out of them. > > FWIW, I also felt that using xattr API to get some sort of mount info > felt > very non-intutive. Yeah, people looking for this function simply wouldn't know to look here ... Ian
On Mon, 9 May 2022 at 14:48, Christian Brauner <brauner@kernel.org> wrote: > One comment about this. We really need to have this interface support > giving us mount options like "relatime" back in numeric form (I assume > this will be possible.). It is royally annoying having to maintain a > mapping table in userspace just to do: > > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME > ro -> MS_RDONLY/MOUNT_ATTR_RDONLY > > A library shouldn't be required to use this interface. Conservative > low-level software that keeps its shared library dependencies minimal > will need to be able to use that interface without having to go to an > external library that transforms text-based output to binary form (Which > I'm very sure will need to happen if we go with a text-based > interface.). Agreed. > This pattern of requesting the size first by passing empty arguments, > then allocating the buffer and then passing down that buffer to > retrieve that value is really annoying to use and error prone (I do > of course understand why it exists.). > > For real xattrs it's not that bad because we can assume that these > values don't change often and so the race window between > getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. But > fwiw, the post > pre check doesn't exist for no reason; we do indeed > hit that race. That code is wrong. Changing xattr size is explicitly documented in the man page as a non-error condition: If size is specified as zero, these calls return the current size of the named extended attribute (and leave value unchanged). This can be used to determine the size of the buffer that should be supplied in a subsequent call. (But, bear in mind that there is a possibility that the attribute value may change between the two calls, so that it is still necessary to check the return status from the second call.) > > In addition, it is costly having to call getxattr() twice. Again, for > retrieving xattrs it often doesn't matter because it's not a super > common operation but for mount and other info it might matter. You don't *have* to retrieve the size, it's perfectly valid to e.g. start with a fixed buffer size and double the size until the result fits. > * Would it be possible to support binary output with this interface? > I really think users would love to have an interfact where they can > get a struct with binary info back. I think that's bad taste. fsinfo(2) had the same issue. As well as mount(2) which still interprets the last argument as a binary blob in certain cases (nfs is one I know of). > Especially for some information at least. I'd really love to have a > way go get a struct mount_info or whatever back that gives me all the > details about a mount encompassed in a single struct. If we want that, then can do a new syscall with that specific struct as an argument. > Callers like systemd will have to parse text and will end up > converting everything from text into binary anyway; especially for > mount information. So giving them an option for this out of the box > would be quite good. What exactly are the attributes that systemd requires? > Interfaces like statx aim to be as fast as possible because we exptect > them to be called quite often. Retrieving mount info is quite costly > and is done quite often as well. Maybe not for all software but for a > lot of low-level software. Especially when starting services in > systemd a lot of mount parsing happens similar when starting > containers in runtimes. Was there ever a test patch for systemd using fsinfo(2)? I think not. Until systemd people start to reengineer the mount handing to allow for retrieving a single mount instead of the complete mount table we will never know where the performance bottleneck lies. > > * If we decide to go forward with this interface - and I think I > mentioned this in the lsfmm session - could we please at least add a > new system call? It really feels wrong to retrieve mount and other > information through the xattr interfaces. They aren't really xattrs. I'd argue with that statement. These are most definitely attributes. As for being extended, we'd just extended the xattr interface... Naming aside... imagine that read(2) has always been used to retrieve disk data, would you say that reading data from proc feels wrong? And in hindsight, would a new syscall for the purpose make any sense? Thanks, Miklos
On Tue, 2022-05-10 at 05:49 +0200, Miklos Szeredi wrote: > On Mon, 9 May 2022 at 14:48, Christian Brauner <brauner@kernel.org> > wrote: > > > One comment about this. We really need to have this interface > > support > > giving us mount options like "relatime" back in numeric form (I > > assume > > this will be possible.). It is royally annoying having to maintain > > a > > mapping table in userspace just to do: > > > > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME > > ro      -> MS_RDONLY/MOUNT_ATTR_RDONLY > > > > A library shouldn't be required to use this interface. Conservative > > low-level software that keeps its shared library dependencies > > minimal > > will need to be able to use that interface without having to go to > > an > > external library that transforms text-based output to binary form > > (Which > > I'm very sure will need to happen if we go with a text-based > > interface.). > > Agreed. > > >  This pattern of requesting the size first by passing empty > > arguments, > >  then allocating the buffer and then passing down that buffer to > >  retrieve that value is really annoying to use and error prone (I > > do > >  of course understand why it exists.). > > > >  For real xattrs it's not that bad because we can assume that > > these > >  values don't change often and so the race window between > >  getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. > > But > >  fwiw, the post > pre check doesn't exist for no reason; we do > > indeed > >  hit that race. > > That code is wrong. Changing xattr size is explicitly documented in > the man page as a non-error condition: > >       If size is specified as zero, these calls return the current > size of >       the named extended attribute (and leave value unchanged). > This can be >       used to determine the size of the buffer that should be > supplied in a >       subsequent call.  (But, bear in mind that there is a > possibility that >       the attribute value may change between the two calls, so > that it is >       still necessary to check the return status from the second > call.) > > > > >  In addition, it is costly having to call getxattr() twice. Again, > > for > >  retrieving xattrs it often doesn't matter because it's not a > > super > >  common operation but for mount and other info it might matter. > > You don't *have* to retrieve the size, it's perfectly valid to e.g. > start with a fixed buffer size and double the size until the result > fits. > > > * Would it be possible to support binary output with this > > interface? > >  I really think users would love to have an interfact where they > > can > >  get a struct with binary info back. > > I think that's bad taste.  fsinfo(2) had the same issue. As well as > mount(2) which still interprets the last argument as a binary blob in > certain cases (nfs is one I know of). > > >  Especially for some information at least. I'd really love to have > > a > >  way go get a struct mount_info or whatever back that gives me all > > the > >  details about a mount encompassed in a single struct. > > If we want that, then can do a new syscall with that specific struct > as an argument. > > >  Callers like systemd will have to parse text and will end up > >  converting everything from text into binary anyway; especially > > for > >  mount information. So giving them an option for this out of the > > box > >  would be quite good. > > What exactly are the attributes that systemd requires? It's been a while since I worked on this so my response might not be too accurrate now. Monitoring the mount table is used primarily to identify a mount started and mount completion. Mount table entry identification requires several fields. But, in reality, once a direct interface is available it should be possible to work out what is actually needed and that will be a rather subset of a mountinfo table entry. > > >  Interfaces like statx aim to be as fast as possible because we > > exptect > >  them to be called quite often. Retrieving mount info is quite > > costly > >  and is done quite often as well. Maybe not for all software but > > for a > >  lot of low-level software. Especially when starting services in > >  systemd a lot of mount parsing happens similar when starting > >  containers in runtimes. > > Was there ever a test patch for systemd using fsinfo(2)? I think > not. Mmm ... I'm hurt you didn't pay any attention to what I did on this during the original fsinfo() discussions. > > Until systemd people start to reengineer the mount handing to allow > for retrieving a single mount instead of the complete mount table we > will never know where the performance bottleneck lies. We didn't need the systemd people to do this only review and contribute to the pr for the change and eventually merge it. What I did on this showed that using fsinfo() allone about halved the CPU overhead (from around 4 processes using about 80%) and once the mount notifications was added too it went down to well under 10% per process. The problem here was systemd is quite good at servicing events and reducing event processing overhead meant more events would then be processed. Utilizing the mount notifications queueing was the key to improving this and that was what I was about to work on at the end. But everything stopped before the work was complete. As I said above it's been a long time since I looked at the systemd work and it definitely was a WIP so "what you see is what you get" at https://github.com/raven-au/systemd/commits/. It looks like the place to look to get some idea of what was being done is branch notifications-devel or notifications-rfc-pr. Also note that this uses the libmount fsinfo() infrastrucure that was done by Karal Zak (and a tiny bit by me) at the time. > > > > > * If we decide to go forward with this interface - and I think I > >  mentioned this in the lsfmm session - could we please at least > > add a > >  new system call? It really feels wrong to retrieve mount and > > other > >  information through the xattr interfaces. They aren't really > > xattrs. > > I'd argue with that statement. These are most definitely attributes. > As for being extended, we'd just extended the xattr interface... > > Naming aside... imagine that read(2) has always been used to retrieve > disk data, would you say that reading data from proc feels wrong? > And in hindsight, would a new syscall for the purpose make any sense? > > Thanks, > Miklos
On Tue, 10 May 2022 at 06:27, Ian Kent <raven@themaw.net> wrote: > > Was there ever a test patch for systemd using fsinfo(2)? I think > > not. > > Mmm ... I'm hurt you didn't pay any attention to what I did on this > during the original fsinfo() discussions. I can't find anything related to this in my mailbox. Maybe you mentioned it at some point, but I have not been involved with the actual systemd changes. So not meant to belittle your work at all. > > Until systemd people start to reengineer the mount handing to allow > > for retrieving a single mount instead of the complete mount table we > > will never know where the performance bottleneck lies. > > We didn't need the systemd people to do this only review and contribute > to the pr for the change and eventually merge it. > > What I did on this showed that using fsinfo() allone about halved the > CPU overhead (from around 4 processes using about 80%) and once the > mount notifications was added too it went down to well under 10% per > process. The problem here was systemd is quite good at servicing events > and reducing event processing overhead meant more events would then be > processed. Utilizing the mount notifications queueing was the key to > improving this and that was what I was about to work on at the end. > > But everything stopped before the work was complete. > > As I said above it's been a long time since I looked at the systemd > work and it definitely was a WIP so "what you see is what you get" > at https://github.com/raven-au/systemd/commits/. It looks like the > place to look to get some idea of what was being done is branch > notifications-devel or notifications-rfc-pr. Also note that this > uses the libmount fsinfo() infrastrucure that was done by Karal Zak > (and a tiny bit by me) at the time. Looks great as a first step. What do you mean by "Utilizing the mount notifications queueing"? Do you mean batching of notifications? I think that's a very important issue: processing each individual notifcation may not make sense when there are lots of them. For example, doing ceate mount+remote mount in a loop a million times will result in two million notification messages (with high likelyhood of queue overflow), but in the end the mount table will end up being the same... Thanks, Miklos
On Tue, 10 May 2022 at 10:06, Miklos Szeredi <miklos@szeredi.hu> wrote: > > On Tue, 10 May 2022 at 06:27, Ian Kent <raven@themaw.net> wrote: > > > > Was there ever a test patch for systemd using fsinfo(2)? I think > > > not. > > > > Mmm ... I'm hurt you didn't pay any attention to what I did on this > > during the original fsinfo() discussions. > > I can't find anything related to this in my mailbox. Maybe you > mentioned it at some point, but I have not been involved with the > actual systemd changes. So not meant to belittle your work at all. > > > > Until systemd people start to reengineer the mount handing to allow > > > for retrieving a single mount instead of the complete mount table we > > > will never know where the performance bottleneck lies. > > > > We didn't need the systemd people to do this only review and contribute > > to the pr for the change and eventually merge it. > > > > What I did on this showed that using fsinfo() allone about halved the > > CPU overhead (from around 4 processes using about 80%) and once the > > mount notifications was added too it went down to well under 10% per > > process. The problem here was systemd is quite good at servicing events > > and reducing event processing overhead meant more events would then be > > processed. Utilizing the mount notifications queueing was the key to > > improving this and that was what I was about to work on at the end. > > > > But everything stopped before the work was complete. > > > > As I said above it's been a long time since I looked at the systemd > > work and it definitely was a WIP so "what you see is what you get" > > at https://github.com/raven-au/systemd/commits/. It looks like the > > place to look to get some idea of what was being done is branch > > notifications-devel or notifications-rfc-pr. Also note that this > > uses the libmount fsinfo() infrastrucure that was done by Karal Zak > > (and a tiny bit by me) at the time. > > Looks great as a first step. > > What do you mean by "Utilizing the mount notifications queueing"? > > Do you mean batching of notifications? I think that's a very > important issue: processing each individual notifcation may not make > sense when there are lots of them. For example, doing ceate > mount+remote mount in a loop a million times will result in two s/remote/remove/ > million notification messages (with high likelyhood of queue > overflow), but in the end the mount table will end up being the > same... > > Thanks, > Miklos
On Tue, May 10, 2022 at 05:49:04AM +0200, Miklos Szeredi wrote: > On Mon, 9 May 2022 at 14:48, Christian Brauner <brauner@kernel.org> wrote: > > > One comment about this. We really need to have this interface support > > giving us mount options like "relatime" back in numeric form (I assume > > this will be possible.). It is royally annoying having to maintain a > > mapping table in userspace just to do: > > > > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME > > ro -> MS_RDONLY/MOUNT_ATTR_RDONLY > > > > A library shouldn't be required to use this interface. Conservative > > low-level software that keeps its shared library dependencies minimal > > will need to be able to use that interface without having to go to an > > external library that transforms text-based output to binary form (Which > > I'm very sure will need to happen if we go with a text-based > > interface.). > > Agreed. > > > This pattern of requesting the size first by passing empty arguments, > > then allocating the buffer and then passing down that buffer to > > retrieve that value is really annoying to use and error prone (I do > > of course understand why it exists.). > > > > For real xattrs it's not that bad because we can assume that these > > values don't change often and so the race window between > > getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. But > > fwiw, the post > pre check doesn't exist for no reason; we do indeed > > hit that race. > > That code is wrong. Changing xattr size is explicitly documented in When recursively changing the ownership of a filesystem tree for a container some xattrs will need to be updated as well. For example, if I have files with POSIX ACLs set which store {g,u}ids then the ACL needs to be updated to store the {g,u}id mapped to the container so the container can interpret them when started. That is a rather sensitive operation with loads of potentials for bugs. So if a POSIX ACL changes beneath the chowning daemon they must be conservative because it means that there's concurrent modfication or possibly an attack going on. In general, I feel it's a bit easy to judge the code is wrong without looking at the concrete scenario. I'm also unsure how the manpage implies it's not an error condition. Afaict, it only implies that the caller needs to handle the case where the xattr changes. Whether or not that's an error is up to the caller to decide. If the caller expects to be the sole user of a specific filesystems then a changing xattr in between should probably be an error condition. But I think we're starting to go on a detour. > the man page as a non-error condition: > > If size is specified as zero, these calls return the current size of > the named extended attribute (and leave value unchanged). This can be > used to determine the size of the buffer that should be supplied in a > subsequent call. (But, bear in mind that there is a possibility that > the attribute value may change between the two calls, so that it is > still necessary to check the return status from the second call.) > > > > > In addition, it is costly having to call getxattr() twice. Again, for > > retrieving xattrs it often doesn't matter because it's not a super > > common operation but for mount and other info it might matter. > > You don't *have* to retrieve the size, it's perfectly valid to e.g. > start with a fixed buffer size and double the size until the result > fits. Yes, I understand and accept that. I'm just not fond of such APIs. > > > * Would it be possible to support binary output with this interface? > > I really think users would love to have an interfact where they can > > get a struct with binary info back. > > I think that's bad taste. fsinfo(2) had the same issue. As well as > mount(2) which still interprets the last argument as a binary blob in > certain cases (nfs is one I know of). In the same vein I could argue it's bad taste that everything gets returned as a string. But I do agree that binary blobs through void pointers aren't elegant. I just worry that if we have an interface and there's a legitimate subset of users that would be well served by a simple struct for e.g., mount properties any attempt to get something like this in the form of a separate system call will be shut down with the argument that we already have an interface for this. So I'd compromise if we have your/any other interface return binary blobs. But of course I'd be equally happy if we'd at least expose basic mount information in the form of a separate system call. > > > Especially for some information at least. I'd really love to have a > > way go get a struct mount_info or whatever back that gives me all the > > details about a mount encompassed in a single struct. > > If we want that, then can do a new syscall with that specific struct > as an argument. Ok, that sounds good to me. > > > Callers like systemd will have to parse text and will end up > > converting everything from text into binary anyway; especially for > > mount information. So giving them an option for this out of the box > > would be quite good. > > What exactly are the attributes that systemd requires? We keep a repo with ideas for (kernel) extensions - we should probably publish that somewhere - but the list we used for a prototype roughly contains: * mount flags MOUNT_ATTR_RDONLY etc. * time flags MOUNT_ATTR_RELATIME etc. (could probably be combined with mount flags. We missed the opportunity to make them proper enums separate from other mount flags imho.) * propagation "flags" (MS_SHARED) * peer group * mnt_id of the mount * mnt_id of the mount's parent * owning userns There's a bit more advanced stuff systemd would really want but which I think is misplaced in a mountinfo system call including: * list of primary and auxiliary block device major/minor * diskseq value of those device nodes (This is a new block device feature we added that allows preventing device recycling issues when e.g. removing usb devices very quickly and is needed for udev.) * uuid/fsid * feature flags (O_TMPFILE, RENAME_EXCHANGE supported etc.) > > > Interfaces like statx aim to be as fast as possible because we exptect > > them to be called quite often. Retrieving mount info is quite costly > > and is done quite often as well. Maybe not for all software but for a > > lot of low-level software. Especially when starting services in > > systemd a lot of mount parsing happens similar when starting > > containers in runtimes. > > Was there ever a test patch for systemd using fsinfo(2)? I think not. > > Until systemd people start to reengineer the mount handing to allow > for retrieving a single mount instead of the complete mount table we > will never know where the performance bottleneck lies. I defer to Ian and Karel to answer that. Both did work to prove that point triggered by one of your objections to fsinfo() iirc. Karel's commits at least are here: https://github.com/util-linux/util-linux/tree/topic/fsinfo > > > > > * If we decide to go forward with this interface - and I think I > > mentioned this in the lsfmm session - could we please at least add a > > new system call? It really feels wrong to retrieve mount and other > > information through the xattr interfaces. They aren't really xattrs. > > I'd argue with that statement. These are most definitely attributes. > As for being extended, we'd just extended the xattr interface... I just have a really hard time understanding how this belongs into the (f)getxattr() system call family and why it would be a big deal to just make this a separate system call. I saw that Dave has a long mail on the history of all this so maybe that'll help me. I hope I get around to reading it in detail today. > > Naming aside... imagine that read(2) has always been used to retrieve > disk data, would you say that reading data from proc feels wrong? > And in hindsight, would a new syscall for the purpose make any sense? I think past interface decisions don't need to always inform future interface decisions. And fwiw, yes. Imho, there's stuff in proc that should indeed have been covered by a dedicated system call instead of a read-like interface.
On Mon, May 09, 2022 at 02:48:15PM +0200, Christian Brauner wrote: > One comment about this. We really need to have this interface support > giving us mount options like "relatime" back in numeric form (I assume > this will be possible.). It is royally annoying having to maintain a > mapping table in userspace just to do: > > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME > ro -> MS_RDONLY/MOUNT_ATTR_RDONLY > > A library shouldn't be required to use this interface. Conservative > low-level software that keeps its shared library dependencies minimal > will need to be able to use that interface without having to go to an > external library that transforms text-based output to binary form (Which > I'm very sure will need to happen if we go with a text-based > interface.). Sounds like David's fsinfo() :-) We need an interface where the kernel returns a consistent mount table entry (more syscalls to get more key=value could be a way how to get inconsistent data). IMHO all the attempts to make a trivial interface will be unsuccessful because the mount table is complex (tree) and mixes strings, paths, and flags. We will always end with a complex interface or complex strings (like the last xatts attempt). There is no 3rd path to go ... The best would be simplified fsinfo() where userspace defines a request (wanted "keys"), and the kernel fills a buffer with data separated by some header metadata struct. In this case, the kernel can return strings and structs with binary data. I'd love something like: ssize_t sz; fsinfo_query query[] = { { .request = FSINFO_MOUNT_PATH }, { .request = FSINFO_PROPAGATION }, { .request = FSINFO_CHILDREN_IDS }, }; sz = fsinfo(dfd, "", AT_EMPTY_PATH, &query, ARRAY_SIZE(query), buf, sizeof(buf)); for (p = buf; p < buf + sz; ) { { fsinfo_entry *e = (struct fsinfo_entry) p; char *data = p + sizeof(struct fsinfo_entry); switch(e->request) { case FSINFO_MOUNT_PATH: printf("mountpoint %s\n", data); break; case FSINFO_PROPAGATION: printf("propagation %x\n", (uintptr_t) data); break; case FSINFO_CHILDREN_IDS: fsinfo_child *x = (fsinfo_child *) data; for (i = 0; i < e->count; i++) { printf("child: %d\n", x[i].mnt_id); } break; ... } p += sizeof(struct fsinfo_entry) + e->len; } ... my two cents :-) Karel
On Tue, May 10, 2022 at 10:55:33AM +1000, Dave Chinner wrote: > On Mon, May 09, 2022 at 02:48:15PM +0200, Christian Brauner wrote: > > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > > > This is a simplification of the getvalues(2) prototype and moving it to the > > > getxattr(2) interface, as suggested by Dave. > > > > > > The patch itself just adds the possibility to retrieve a single line of > > > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo > > > patchset grew out of). > > > > > > But this should be able to serve Amir's per-sb iostats, as well as a host of > > > other cases where some statistic needs to be retrieved from some object. Note: > > > a filesystem object often represents other kinds of objects (such as processes > > > in /proc) so this is not limited to fs attributes. > > > > > > This also opens up the interface to setting attributes via setxattr(2). > > > > > > After some pondering I made the namespace so: > > > > > > : - root > > > bar - an attribute > > > foo: - a folder (can contain attributes and/or folders) > > > > > > The contents of a folder is represented by a null separated list of names. > > > > > > Examples: > > > > > > $ getfattr -etext -n ":" . > > > # file: . > > > :="mnt:\000mntns:" > > > > > > $ getfattr -etext -n ":mnt:" . > > > # file: . > > > :mnt:="info" > > > > > > $ getfattr -etext -n ":mnt:info" . > > > # file: . > > > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012" > > > > Hey Miklos, > > > > One comment about this. We really need to have this interface support > > giving us mount options like "relatime" back in numeric form (I assume > > this will be possible.). It is royally annoying having to maintain a > > mapping table in userspace just to do: > > > > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME > > ro -> MS_RDONLY/MOUNT_ATTR_RDONLY > > You're asking for a complete change of output information there. > This has nothing to do with the mechanism for extracting key/value > information from the kernel. > > i.e. we need to separate demands for "data I want" from "mechanism > to extract data". > > > > $ getfattr -etext -n ":mntns:" . > > > # file: . > > > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:" > > > > > > $ getfattr -etext -n ":mntns:28:" . > > > # file: . > > > :mntns:28:="info" > > > > > > Comments? > > > > I'm not a fan of text-based APIs and I'm particularly not a fan of the > > xattr APIs. But at this point I'm ready to compromise on a lot as long > > as it gets us values out of the kernel in some way. :) > > > > I had to use xattrs extensively in various low-level userspace projects > > and they continue to be a source of races and memory bugs. > > > > A few initial questions: > > > > * The xattr APIs often require the caller to do sm like (copying some go > > code quickly as I have that lying around): > > > > for _, x := range split { > > xattr := string(x) > > // Call Getxattr() twice: First, to determine the size of the > > // buffer we need to allocate to store the extended attributes, > > // second, to actually store the extended attributes in the > > // buffer. Also, check if the size of the extended attribute > > // hasn't increased between the two calls. > > pre, err = unix.Getxattr(path, xattr, nil) > > if err != nil || pre < 0 { > > return nil, err > > } > > > > dest = make([]byte, pre) > > post := 0 > > if pre > 0 { > > post, err = unix.Getxattr(path, xattr, dest) > > if err != nil || post < 0 { > > return nil, err > > } > > } > > > > if post > pre { > > return nil, fmt.Errorf("Extended attribute '%s' size increased from %d to %d during retrieval", xattr, pre, post) > > } > > > > xattrs[xattr] = string(dest) > > } > > > > This pattern of requesting the size first by passing empty arguments, > > then allocating the buffer and then passing down that buffer to > > retrieve that value is really annoying to use and error prone (I do > > of course understand why it exists.). > > You're doing it wrong. > > When you have an attr extraction loop like this, you allocate a > single 64kB buffer for the value, adn then call getattr() with the > buffer and a length of 64kB. Then the call returns both the value > length and the value in one syscall, and then the application can > allocate+copy into an exact sized buffer if it needs to. > > Then you use the same 64kB buffer for the next getxattr() call. > > I first saw this pattern in code written in the mid 1990s for > Irix, and for basic listxattr/getxattr operations to find and > retrieve key/value pairs this is much more efficient than the above. Right, but this example is in a garbage collected language where this level of fine-grained memory management is rarely used or encouraged. But the point is taken. > > The real problem is that the linux listxattr() syscall only returns > names. It's a shit API design at best, especially the part where it > cannot iterate over a list larger than a single buffer. If you've > got millions of xattrs, listxattr is fucking useless. The man page > is actively harmful - that's where everyone learns the double > getxattr anit-pattern you've described above. > > > With that in mind, go look at XFS_IOC_ATTRLIST_BY_HANDLE that I've > mentioned in previous discussions this topic. Each attr entry > returned is one of these: > > struct xfs_attrlist_ent { /* data from attr_list() */ > __u32 a_valuelen; /* number bytes in value of attr */ > char a_name[1]; /* attr name (NULL terminated) */ > }; > > It really needs a namelen to make parsing of the output buffer > simpler, but this is the model we should be following - listing > xattrs is no different from listing directory entries. Indeed, > a directory entries is basically just a name/value pair - the name > of the dirent and the inode number it points to is the value. > > Further to that point, the control structure for > XFS_IOC_ATTRLIST_BY_HANDLE has a cursor value that works the same > way as readdir cookies do. Hence when iterating over xattrs that > require multiple syscalls to retrieve, the cursor allows the next > list syscall to start off listing exactly where the previous syscall > finished. > > IOWs, what Linux really needs is a listxattr2() syscall that works > the same way that getdents/XFS_IOC_ATTRLIST_BY_HANDLE work. With the > list function returning value sizes and being able to iterate > effectively, every problem that listxattr() causes goes away. That's an interesting proposal... > > And while we are at it, we need to consider a xattr2() syscall to > replace getxattr/setxattr. The model for that is > XFS_IOC_ATTRMULTI_BY_HANDLE, which allows operations to be performed > on mulitple xattrs in a single syscall. e.g. we can do a bulk get, > set and remove operation across multiple xattrs - we can even mix > and match get/set/remove operations on different xattrs in a single > call. Yeah, that would've come in handy in various codepaths I've seen. > > > * Would it be possible to support binary output with this interface? > > The xattr API already supports binary names and values. The only > exception is you can't put NULLs in names because APIs use that as a > name terminator. if listxattr2() returns a namelen in it's > structure, then we could allow fully binary names, too (XFS already > fully supports this internally!). In the current API, values are > always determined by length, not null termiantions, so they are > also already fully binary capable. Yeah, I'm aware of that. I want to know whether we might be amenable to support this for retrieving structs from mountinfo. > > > I really think users would love to have an interfact where they can > > get a struct with binary info back. > > No. Not for kernel informational interfaces. We have ioctls and That feels like semantics. statx is in all sensible readings of the words a kernel informational interface. > syscalls for defining structured binary interfaces that pass > non-trivial objects. xattrs are no the place for this - they are > key/value object stores like sysfs is supposed to be, so I really > don't think we should support encoded binary data in xattrs under > this special mount namespace... I do agree on that point but as I mentioned in my reply to Miklos (hard to keep track of all ends in this discussion...) if we are fine with adding some way - in the form of a system call - to retrieve basic mount information with some of the properties I listed in the other reply, I'm content. I'm really looking at this from the perspective of someone who uses these interfaces regularly in userspace and a text-based interface for very basic information such as detailed information about a mount is cumbersome. I know people like to "counter" with "parsing strings is easy" but it remains a giant pain for userspace; at least for basic info. If it's about really specialist deatiled information then sure; go text based or any other way to retrieve that info. > > Solving the "retreive multiple values per syscall" problem is what > the bulk get interface (XFS_IOC_ATTRMULTI_BY_HANDLE) is for. > > > Callers like systemd will have to parse text and will end up > > converting everything from text into binary anyway; especially for > > mount information. So giving them an option for this out of the box > > would be quite good. > > That boat sailed years ago. You're making the same arguments about > binary extraction interfaces being way more efficient than ascii > based value-per-file interfaces like proc/sysfs that were made back > in the early 2000s. That boat has long sailed - while the current > method is somewhat inefficient, it certainly hasn't had all the > problems that maintaining binary interfaces over decades has had.... > > Much as it pains me to say it (because I came to Linux from an OS > that exported huge amounts of custom binary structures from the > kernel), having all the export interfaces dump human readable > information has proved far more flexible and usable than interfaces > that required binary parsers to dump information before it could be > used on the command line or in scripting languages.... I'm really not trying to make far-reaching historical and future claims about OS interface design. I'm simply asking for a way to retrieve basic information about e.g. mounts in the form of a struct; statx for mounts. I feel this is quickly turning into a discussion about how we should in general design OS interfaces while that is really not at stake here. Linux is a mix of so many different interfaces - for better or worse - that any decision we take has no claim on codifying how we should do new interfaces. That is indeed a ship that has sailed long ago. > > > * If we decide to go forward with this interface - and I think I > > mentioned this in the lsfmm session - could we please at least add a > > new system call? It really feels wrong to retrieve mount and other > > information through the xattr interfaces. They aren't really xattrs. > > We are trying to expose structured key-value information. That's > exactly what the xattr API was orginally created to cater for... > > > Imho, xattrs are a bit like a wonky version of streams already (One of > > the reasons I find them quite unpleasant.). Making mount and other > > information retrievable directly through the getxattr() interface will > > turn them into a full-on streams implementation imho. I'd prefer not > > to do that (Which is another reason I'd prefer at least a separate > > system call.). > > And that's a total misunderstanding of what xattrs are. > > Alternate data streams are just {file,offset} based data streams > accessed via ithe same read/write() mechanisms as the primary data > stream. That's why I said "wonky". But I'm not going to argue this point. I think you by necessity have wider historical context on these things that I lack. But I don't find it unreasonable to also see them as an additional information channel. Sure, they are a generic key=value store for anything _in principle_. In practice however xattrs are very much perceived and used as information storage on files, a metadata side-channel if you will. All I'm claiming here is that it will confuse the living hell out of users if the getxattr() api suddenly is used not to just set and get information associated with inodes but to also provides filesystem or mount information. That's a totally a totally differnet type of information. Sure, it may fit well in the key=value scheme because the xattr key=value _interface_ is generic but that's a very technical argument. I'm looking at this from the experience of a user of the API for a moment and in code they'd do in one place: getxattr('/super/special/binary', "security.capability", ...); and then in another place they do: getxattr('/path/to/some/mount', "mntns:info", ...); that is just flatout confusing. > > Xattrs provide an *atomic key-value object store API*, not an offset > based data stream API. They are completely different beasts, > intended for completely different purposes. ADS are redundant when you > have directories and files, whilst an atomic key-value store is > something completely different. > > You do realise we have an independent, scalable, ACID compliant > key-value object store in every inode in an XFS filesystem, right? So far this was a really mail with good background information but I'm struggling to make sense of what that last sentence is trying to tell me. :)
* Dave Chinner: > IOWs, what Linux really needs is a listxattr2() syscall that works > the same way that getdents/XFS_IOC_ATTRLIST_BY_HANDLE work. With the > list function returning value sizes and being able to iterate > effectively, every problem that listxattr() causes goes away. getdents has issues of its own because it's unspecified what happens if the list of entries is modified during iteration. Few file systems add another tree just to guarantee stable iteration. Maybe that's different for xattrs because they are supposed to be small and can just be snapshotted with a full copy? Thanks, Florian
On Tue, 10 May 2022 at 13:53, Christian Brauner <brauner@kernel.org> wrote: > > What exactly are the attributes that systemd requires? > > We keep a repo with ideas for (kernel) extensions - we should probably > publish that somewhere - but the list we used for a prototype roughly > contains: > > * mount flags MOUNT_ATTR_RDONLY etc. > * time flags MOUNT_ATTR_RELATIME etc. (could probably be combined with > mount flags. We missed the opportunity to make them proper enums > separate from other mount flags imho.) > * propagation "flags" (MS_SHARED) > * peer group > * mnt_id of the mount > * mnt_id of the mount's parent > * owning userns Sounds good thus far. And hey, we don't even need a new syscall: statx(2) could handle these fine. > There's a bit more advanced stuff systemd would really want but which I > think is misplaced in a mountinfo system call including: > * list of primary and auxiliary block device major/minor It's when you need to return variable size arrays or list of strings that the statx kind of interface falls down. For that a hierarchical namespace is a much better choice, as it can represent arbitrary levels of arrays, while doing that with a specialized syscall is going to be cumbersome. > I just have a really hard time understanding how this belongs into the > (f)getxattr() system call family and why it would be a big deal to just > make this a separate system call. Fragmenting syntactically equivalent interfaces is bad, unifying them is good. Dave's example of adding a new syscall for retrieving multiple xattrs is a prime example. Thanks, Miklos
On Tue, 10 May 2022 at 15:15, Miklos Szeredi <miklos@szeredi.hu> wrote: > > On Tue, 10 May 2022 at 13:53, Christian Brauner <brauner@kernel.org> wrote: > > > > What exactly are the attributes that systemd requires? > > > > We keep a repo with ideas for (kernel) extensions - we should probably > > publish that somewhere - but the list we used for a prototype roughly > > contains: > > > > * mount flags MOUNT_ATTR_RDONLY etc. > > * time flags MOUNT_ATTR_RELATIME etc. (could probably be combined with > > mount flags. We missed the opportunity to make them proper enums > > separate from other mount flags imho.) > > * propagation "flags" (MS_SHARED) > > * peer group > > * mnt_id of the mount > > * mnt_id of the mount's parent > > * owning userns > > Sounds good thus far. And hey, we don't even need a new syscall: > statx(2) could handle these fine. Oh, we need this indexed with a mount id, which statx can't do. So indeed, a new syscall may be the best choice. Thanks, Miklos
On Tue, May 10, 2022 at 03:15:05PM +0200, Miklos Szeredi wrote: > On Tue, 10 May 2022 at 13:53, Christian Brauner <brauner@kernel.org> wrote: > > > > What exactly are the attributes that systemd requires? > > > > We keep a repo with ideas for (kernel) extensions - we should probably > > publish that somewhere - but the list we used for a prototype roughly > > contains: > > > > * mount flags MOUNT_ATTR_RDONLY etc. > > * time flags MOUNT_ATTR_RELATIME etc. (could probably be combined with > > mount flags. We missed the opportunity to make them proper enums > > separate from other mount flags imho.) > > * propagation "flags" (MS_SHARED) > > * peer group > > * mnt_id of the mount > > * mnt_id of the mount's parent > > * owning userns > > Sounds good thus far. And hey, we don't even need a new syscall: > statx(2) could handle these fine. > > > There's a bit more advanced stuff systemd would really want but which I > > think is misplaced in a mountinfo system call including: > > * list of primary and auxiliary block device major/minor > > It's when you need to return variable size arrays or list of strings > that the statx kind of interface falls down. > > For that a hierarchical namespace is a much better choice, as it can > represent arbitrary levels of arrays, while doing that with a > specialized syscall is going to be cumbersome. > > > I just have a really hard time understanding how this belongs into the > > (f)getxattr() system call family and why it would be a big deal to just > > make this a separate system call. > > Fragmenting syntactically equivalent interfaces is bad, unifying them Fwiw, turning this around: unifying semantically distinct interfaces because of syntactical similarities is bad. Moving them into a syntactically equivalent system call that expresses the difference in semantics in its name is good.
On Tue, 10 May 2022 at 16:19, Christian Brauner <brauner@kernel.org> wrote: > Fwiw, turning this around: unifying semantically distinct interfaces > because of syntactical similarities is bad. Moving them into a > syntactically equivalent system call that expresses the difference in > semantics in its name is good. You are ignoring the arguments against fragmentation. You are also ignoring the fact that semantically the current xattr interface is already fragmented. Grep for "strncmp(name, XATTR_" in fs/xattr.c. We don't have getsecurityxattr(), getuserxattr(), gettrustedxattr() and getsystemxattr(). It would be crazy. Adding getfsxattr() would be equally crazy. getxattr() pretty much describes the semantics of all of these things. Thanks, Miklos
On Tue, May 10, 2022 at 04:41:35PM +0200, Miklos Szeredi wrote: > On Tue, 10 May 2022 at 16:19, Christian Brauner <brauner@kernel.org> wrote: > > > Fwiw, turning this around: unifying semantically distinct interfaces > > because of syntactical similarities is bad. Moving them into a > > syntactically equivalent system call that expresses the difference in > > semantics in its name is good. > > You are ignoring the arguments against fragmentation. No, I'm not ignoring it and really wasn't trying to. What I tried to say by this, is that the inverse of the argument against fragmentation is simply equally worth supporting. Meaning the argument against fragmentation isn't stronger than the argument against aggressive unification. (Fwiw, I think that basing the argument on syntactical similarities is problematic. Stretching this argument for a second, all multiplexers are almost by necessity syntactically similar (e.g. ptrace() and seccomp()) but we don't use that as an argument to make them a single system call.) > > You are also ignoring the fact that semantically the current xattr > interface is already fragmented. Grep for "strncmp(name, XATTR_" in > fs/xattr.c. > > We don't have getsecurityxattr(), getuserxattr(), gettrustedxattr() > and getsystemxattr(). It would be crazy. Adding getfsxattr() would > be equally crazy. getxattr() pretty much describes the semantics of > all of these things. getxattr() describes the syntax of all of these things and barely that. It describes the method of retrieval. And the method of retrieval is super generic to the point where strings _or binary data_ can be returned (e.g. POSIX ACLs or fscaps) depending on the xattr namespace. But wight now, everything we currently get from getxattr() is attributes associated with inodes. So getsecurityxattr(), getuserxattr(), gettrustedxattr() etc. would arguably be fragmentation because all of these things are associated with inodes. But now we're in the process of extending the *xattr() calls to operate on mounts and filesystems so an additional getfsattr() (or another name) is not fragmentation imho. And I definitely don't think this would qualify as "crazy".
On Tue, 10 May 2022 at 17:30, Christian Brauner <brauner@kernel.org> wrote: > But now we're in the process of extending the *xattr() calls to operate > on mounts and filesystems so an additional getfsattr() (or another name) > is not fragmentation imho. And I definitely don't think this would > qualify as "crazy". In that spirit st_dev does not belong in struct stat, because that is the property of the block device, not the inode. But I feel we are going round in circles, lets please not get hung up on this issue. Linus will have the final word on which variant (if either) is going to go in. Thanks, Miklos
On Tue, May 10, 2022 at 05:47:13PM +0200, Miklos Szeredi wrote: > On Tue, 10 May 2022 at 17:30, Christian Brauner <brauner@kernel.org> wrote: > > > But now we're in the process of extending the *xattr() calls to operate > > on mounts and filesystems so an additional getfsattr() (or another name) > > is not fragmentation imho. And I definitely don't think this would > > qualify as "crazy". > > In that spirit st_dev does not belong in struct stat, because that is > the property of the block device, not the inode. > > But I feel we are going round in circles, lets please not get hung up > on this issue. Linus will have the final word on which variant (if > either) is going to go in. Well yes, I'm obviously not going to be d*ck about it and go around NAKing it just because I didn't get my favorite name but I at least want to register my strong opposition to the current "unification" approach loud and clear. :)
On Tue, May 10, 2022 at 02:45:39PM +0200, Florian Weimer wrote: > * Dave Chinner: > > > IOWs, what Linux really needs is a listxattr2() syscall that works > > the same way that getdents/XFS_IOC_ATTRLIST_BY_HANDLE work. With the > > list function returning value sizes and being able to iterate > > effectively, every problem that listxattr() causes goes away. > > getdents has issues of its own because it's unspecified what happens if > the list of entries is modified during iteration. Few file systems add > another tree just to guarantee stable iteration. The filesystem I care about (XFS) guarantees stable iteration and stable seekdir/telldir cookies. It's not that hard to do, but it requires the filesystem designer to understand that this is a necessary feature before they start designing the on-disk directory format and lookup algorithms.... > Maybe that's different for xattrs because they are supposed to be small > and can just be snapshotted with a full copy? It's different for xattrs because we directly control the API specification for XFS_IOC_ATTRLIST_BY_HANDLE, not POSIX. We can define the behaviour however we want. Stable iteration is what listing keys needs. The cursor is defined as 16 bytes of opaque data, enabling us to encoded exactly where in the hashed name btree index we have traversed to: /* * Kernel-internal version of the attrlist cursor. */ struct xfs_attrlist_cursor_kern { __u32 hashval; /* hash value of next entry to add */ __u32 blkno; /* block containing entry (suggestion) */ __u32 offset; /* offset in list of equal-hashvals */ __u16 pad1; /* padding to match user-level */ __u8 pad2; /* padding to match user-level */ __u8 initted; /* T/F: cursor has been initialized */ }; Hence we have all the information in the cursor we need to reset the btree traversal index to the exact entry we finished at (even in the presence of hash collisions in the index). Hence removal of the entry the cursor points to isn't a problem for us, we just move to the next highest sequential hash index in the btree and start again from there. Of course, if this is how we define listxattr2() behaviour (or maybe we should call it "list_keys()" to make it clear we are treating this as a key/value store instead of xattrs) then each filesystem can put what it needs in that cursor to guarantee it can restart key iteration correctly if the entry the cursor points to has been removed. We can also make the cursor larger if necessary for other filesystems to store the information they need. Cheers, Dave.
On Tue, May 10, 2022 at 02:35:12PM +0200, Karel Zak wrote: > On Mon, May 09, 2022 at 02:48:15PM +0200, Christian Brauner wrote: > > One comment about this. We really need to have this interface support > > giving us mount options like "relatime" back in numeric form (I assume > > this will be possible.). It is royally annoying having to maintain a > > mapping table in userspace just to do: > > > > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME > > ro -> MS_RDONLY/MOUNT_ATTR_RDONLY > > > > A library shouldn't be required to use this interface. Conservative > > low-level software that keeps its shared library dependencies minimal > > will need to be able to use that interface without having to go to an > > external library that transforms text-based output to binary form (Which > > I'm very sure will need to happen if we go with a text-based > > interface.). > > Sounds like David's fsinfo() :-) > > We need an interface where the kernel returns a consistent mount table > entry (more syscalls to get more key=value could be a way how to get > inconsistent data). > > IMHO all the attempts to make a trivial interface will be unsuccessful > because the mount table is complex (tree) and mixes strings, paths, > and flags. We will always end with a complex interface or complex > strings (like the last xatts attempt). There is no 3rd path to go ... > > The best would be simplified fsinfo() where userspace defines > a request (wanted "keys"), and the kernel fills a buffer with data > separated by some header metadata struct. In this case, the kernel can > return strings and structs with binary data. > > > I'd love something like: > > ssize_t sz; > fsinfo_query query[] = { > { .request = FSINFO_MOUNT_PATH }, > { .request = FSINFO_PROPAGATION }, > { .request = FSINFO_CHILDREN_IDS }, > }; > > sz = fsinfo(dfd, "", AT_EMPTY_PATH, > &query, ARRAY_SIZE(query), > buf, sizeof(buf)); > > for (p = buf; p < buf + sz; ) { > { > fsinfo_entry *e = (struct fsinfo_entry) p; > char *data = p + sizeof(struct fsinfo_entry); > > switch(e->request) { > case FSINFO_MOUNT_PATH: > printf("mountpoint %s\n", data); > break; > case FSINFO_PROPAGATION: > printf("propagation %x\n", (uintptr_t) data); > break; > case FSINFO_CHILDREN_IDS: > fsinfo_child *x = (fsinfo_child *) data; > for (i = 0; i < e->count; i++) { > printf("child: %d\n", x[i].mnt_id); > } > break; > ... > } > > p += sizeof(struct fsinfo_entry) + e->len; > } That's pretty much what a multi-xattr get operation looks like. It's a bit more more intricate in the setup of the request/return buffer, but otherwise the structure of the code is the same. I just don't see why we need special purpose interfaces like this for key/value information when small tweaks to the existing generic key/value interfaces can provide exactly the same functionality.... Cheers, Dave.
On Tue, May 10, 2022 at 02:40:33PM +0200, Christian Brauner wrote: > On Tue, May 10, 2022 at 10:55:33AM +1000, Dave Chinner wrote: > > On Mon, May 09, 2022 at 02:48:15PM +0200, Christian Brauner wrote: > > > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > > > I really think users would love to have an interfact where they can > > > get a struct with binary info back. > > > > No. Not for kernel informational interfaces. We have ioctls and > > That feels like semantics. statx is in all sensible readings of the > words a kernel informational interface. statx is an special purpose binary syscall interface for returning inode specific information, it's not an abstract, generic informational interface. > I'm really looking at this from the perspective of someone who uses > these interfaces regularly in userspace and a text-based interface for > very basic information such as detailed information about a mount is > cumbersome. I know people like to "counter" with "parsing strings is > easy" but it remains a giant pain for userspace; at least for basic > info. As I said last reply, you are making all the same arguements against text based information interfaces that were made against proc and sysfs a long long time again. they weren't convincing a couple of decades ago, and there aren't really convincing now. Text-based key/value data is hard to screw up in the long run, binary interfaces have a habit of biting hard whenever the contents of the binary structure needs to change... > > > Imho, xattrs are a bit like a wonky version of streams already (One of > > > the reasons I find them quite unpleasant.). Making mount and other > > > information retrievable directly through the getxattr() interface will > > > turn them into a full-on streams implementation imho. I'd prefer not > > > to do that (Which is another reason I'd prefer at least a separate > > > system call.). > > > > And that's a total misunderstanding of what xattrs are. > > > > Alternate data streams are just {file,offset} based data streams > > accessed via ithe same read/write() mechanisms as the primary data > > stream. > > That's why I said "wonky". But I'm not going to argue this point. I > think you by necessity have wider historical context on these things > that I lack. But I don't find it unreasonable to also see them as an > additional information channel. > > Sure, they are a generic key=value store for anything _in principle_. In > practice however xattrs are very much perceived and used as information > storage on files, a metadata side-channel if you will. That's how *you* perceive them, not how everyone perceives them. > All I'm claiming here is that it will confuse the living hell out of > users if the getxattr() api suddenly is used not to just set and get > information associated with inodes but to also provides filesystem or > mount information. Why would it confuse people? The xattr namespace is already well known to be heirarchical and context dependent based on the intial name prefix (user, security, btrfs, trusted, etc). i.e. if you don't know that the context the xattr acts on is determined by the initial name prefix, then you need to read the xattr(7) man page again: Extended attribute namespaces Attribute names are null-terminated strings. The attribute name is always specified in the fully qualified namespace.attribute form, for example, user.mime_type, trusted.md5sum, system.posix_acl_access, or security.selinux. The namespace mechanism is used to define different classes of extended attributes. These different classes exist for several reasons; for example, the permissions and capabilities required for manipulating extended attributes of one namespace may differ to another. Currently, the security, system, trusted, and user extended attribute classes are defined as described below. Additional classes may be added in the future. > That's a totally a totally differnet type of information. Sure, it may > fit well in the key=value scheme because the xattr key=value _interface_ > is generic but that's a very technical argument. Yet adding a new xattr namespace for a new class of information that is associated the mount that the path/inode/fd is associated with is exactly what the xattr namespaces are intended to allow. And it is clearly documented that new classes "may be added in the future". I just don't see where the confusion would come from... > > I'm looking at this from the experience of a user of the API for a > moment and in code they'd do in one place: > > getxattr('/super/special/binary', "security.capability", ...); > > and then in another place they do: > > getxattr('/path/to/some/mount', "mntns:info", ...); > > that is just flatout confusing. Why? Both are getting different classes of key/value information that is specific to the given path. Just because on is on-disk and the other is ephemeral doesn't make it in any way confusing. This is exactly what xattr namesapces are intended to support... > > Xattrs provide an *atomic key-value object store API*, not an offset > > based data stream API. They are completely different beasts, > > intended for completely different purposes. ADS are redundant when you > > have directories and files, whilst an atomic key-value store is > > something completely different. > > > > You do realise we have an independent, scalable, ACID compliant > > key-value object store in every inode in an XFS filesystem, right? > > So far this was a really mail with good background information but I'm > struggling to make sense of what that last sentence is trying to tell > me. :) That people in the past have built large scale data storage applications that use XFS inodes as key based object stores, not as a offset based data stream. Who needs atomic write() functionality when you have ACID set and replace operations for named objects? The reality is that modern filesystems are really just btree based object stores with high performance transaction engines overlaid with a POSIX wrapper. And in the case of xattrs, we effectively expose that btree based key-value database functionality directly to userspace.... Stop thinking like xattrs are some useless metadata side channel, and start thinking of them as an atomic object store that stores and retreives millions of small (< 1/2 the filesystem block size) named objects far space effciently than a directory structure full of small files indexed by object hash. Cheers, Dave.
On Wed, May 11, 2022 at 09:25:52AM +1000, Dave Chinner wrote: > > I'd love something like: > > > > ssize_t sz; > > fsinfo_query query[] = { > > { .request = FSINFO_MOUNT_PATH }, > > { .request = FSINFO_PROPAGATION }, > > { .request = FSINFO_CHILDREN_IDS }, > > }; > > > > sz = fsinfo(dfd, "", AT_EMPTY_PATH, > > &query, ARRAY_SIZE(query), > > buf, sizeof(buf)); > > > > for (p = buf; p < buf + sz; ) { > > { > > fsinfo_entry *e = (struct fsinfo_entry) p; > > char *data = p + sizeof(struct fsinfo_entry); > > > > switch(e->request) { > > case FSINFO_MOUNT_PATH: > > printf("mountpoint %s\n", data); > > break; > > case FSINFO_PROPAGATION: > > printf("propagation %x\n", (uintptr_t) data); > > break; > > case FSINFO_CHILDREN_IDS: > > fsinfo_child *x = (fsinfo_child *) data; > > for (i = 0; i < e->count; i++) { > > printf("child: %d\n", x[i].mnt_id); > > } > > break; > > ... > > } > > > > p += sizeof(struct fsinfo_entry) + e->len; > > } > > That's pretty much what a multi-xattr get operation looks like. > It's a bit more more intricate in the setup of the request/return > buffer, but otherwise the structure of the code is the same. > > I just don't see why we need special purpose interfaces like this > for key/value information when small tweaks to the existing > generic key/value interfaces can provide exactly the same > functionality.... I don't say we need a new interface ;-) I'd be happy with whatever as long as: * minimal strings parsing (wish than a requirement) * one syscall returns multiple key/value * can address mount table entries by ID * can ask for list of children (submounts) * extensible if this will be possible with xattr (listxattr2(), or so) when great. Karel
On Wed, May 11, 2022 at 10:42:00AM +1000, Dave Chinner wrote: > On Tue, May 10, 2022 at 02:40:33PM +0200, Christian Brauner wrote: > > On Tue, May 10, 2022 at 10:55:33AM +1000, Dave Chinner wrote: > > > On Mon, May 09, 2022 at 02:48:15PM +0200, Christian Brauner wrote: > > > > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote: > > > > I really think users would love to have an interfact where they can > > > > get a struct with binary info back. > > > > > > No. Not for kernel informational interfaces. We have ioctls and > > > > That feels like semantics. statx is in all sensible readings of the > > words a kernel informational interface. > > statx is an special purpose binary syscall interface for returning > inode specific information, it's not an abstract, generic > informational interface. I want the ability to also have a statx like interface for some basic mount information is what I'm saying; so a special-purpose interface. I nowhere argued that statx is a _generic_ information interface. > > > I'm really looking at this from the perspective of someone who uses > > these interfaces regularly in userspace and a text-based interface for > > very basic information such as detailed information about a mount is > > cumbersome. I know people like to "counter" with "parsing strings is > > easy" but it remains a giant pain for userspace; at least for basic > > info. > > As I said last reply, you are making all the same arguements against > text based information interfaces that were made against proc and > sysfs a long long time again. they weren't convincing a couple of > decades ago, and there aren't really convincing now. Text-based I'm not in general opposed to an interface that allows to retrieve text-based values. I mentioned this earlier in the thread more than once. Also, we're not talking about pseudo-filesystems here. I'm saying I want the ability to get some basic information in binary form. Whether it's as part of this interface or another. Also, *xattr() interfaces aren't really just text-based as you've mentioned yourself. They return binary data such as struct vfs_ns_cap_data or struct posix_acl_xattr_entry. > key/value data is hard to screw up in the long run, binary > interfaces have a habit of biting hard whenever the contents of > the binary structure needs to change... I partially agree. If we're talking about a netlink like protocol with complex structs, sure. If we're talking about statx or openat2 then I don't think so. These are not structs that constantly change and newer system calls are designed with extensions in mind. I also think there's a misunderstanding here: I really want to stress that I'm not arguing for the necessity for a generic binary information retrieval interface. I also mentioned this earlier. I really just want the ability to still have some well-defined, special-purpose binary interface similar to e.g. statx that isn't rejected with the argument that you can get it text-based from *xattr. And I think others have agreed in other parts of the thread that is still ok. If that's is indeed true I'm content. > > > > > Imho, xattrs are a bit like a wonky version of streams already (One of > > > > the reasons I find them quite unpleasant.). Making mount and other > > > > information retrievable directly through the getxattr() interface will > > > > turn them into a full-on streams implementation imho. I'd prefer not > > > > to do that (Which is another reason I'd prefer at least a separate > > > > system call.). > > > > > > And that's a total misunderstanding of what xattrs are. > > > > > > Alternate data streams are just {file,offset} based data streams > > > accessed via ithe same read/write() mechanisms as the primary data > > > stream. > > > > That's why I said "wonky". But I'm not going to argue this point. I > > think you by necessity have wider historical context on these things > > that I lack. But I don't find it unreasonable to also see them as an > > additional information channel. > > > > Sure, they are a generic key=value store for anything _in principle_. In > > practice however xattrs are very much perceived and used as information > > storage on files, a metadata side-channel if you will. > > That's how *you* perceive them, not how everyone perceives them. I mean, this is as an argument we can't bring to a conclusion or that will bring us forward in understanding each other. All I could reply to that is "No, that's how *you* perceive them." in return. I don't know how this would help. Let's put it this way, if you go into userspace and ask users of xattrs what they think xattrs are then they will not come up with a technical definition for a completely generic key=value interface based on filesystem btree consideration. They will see them as specific attributes on files speaking from experience. > > > All I'm claiming here is that it will confuse the living hell out of > > users if the getxattr() api suddenly is used not to just set and get > > information associated with inodes but to also provides filesystem or > > mount information. > > Why would it confuse people? The xattr namespace is already well > known to be heirarchical and context dependent based on the intial > name prefix (user, security, btrfs, trusted, etc). i.e. if you don't > know that the context the xattr acts on is determined by the initial > name prefix, then you need to read the xattr(7) man page again: I take it this is a rhetorical device here and you know that I've not just patched these codepaths multiple times in the kernel but also extensively programmed with xattrs in userspace for various container projects. The reasons why I think it confuses people I already explained in various parts earlier in the thread. And there are at least two replies to this thread that find it confusing as well. If you can't relate to any of these reasons then me reiterating them won't bring this forward. > > Extended attribute namespaces > > Attribute names are null-terminated strings. The > attribute name is always specified in the fully qualified > namespace.attribute form, for example, user.mime_type, > trusted.md5sum, system.posix_acl_access, or > security.selinux. > > The namespace mechanism is used to define different classes > of extended attributes. These different classes exist for > several reasons; for example, the permissions and > capabilities required for manipulating extended attributes > of one namespace may differ to another. > > Currently, the security, system, trusted, and user > extended attribute classes are defined as described below. > Additional classes may be added in the future. > > > That's a totally a totally differnet type of information. Sure, it may > > fit well in the key=value scheme because the xattr key=value _interface_ > > is generic but that's a very technical argument. > > Yet adding a new xattr namespace for a new class of information that > is associated the mount that the path/inode/fd is associated with is > exactly what the xattr namespaces are intended to allow. And it is > clearly documented that new classes "may be added in the future". Fwiw, I don't think the "Additional classes may be added in the future." means that this interface in principle can be used to retrieve any key=value based information. If that was really the case then xattr(7) should really put your definition below on to their. > > I just don't see where the confusion would come from... Again, there's people who can relate to the arguments even on this thread but if you can't relate to any of them at all and can't see where we're coming from than me reiterating all of it once more won't help. > > > > > I'm looking at this from the experience of a user of the API for a > > moment and in code they'd do in one place: > > > > getxattr('/super/special/binary', "security.capability", ...); > > > > and then in another place they do: > > > > getxattr('/path/to/some/mount', "mntns:info", ...); > > > > that is just flatout confusing. > > Why? Both are getting different classes of key/value information > that is specific to the given path. Just because on is on-disk and > the other is ephemeral doesn't make it in any way confusing. This is > exactly what xattr namesapces are intended to support... > > > > Xattrs provide an *atomic key-value object store API*, not an offset > > > based data stream API. They are completely different beasts, > > > intended for completely different purposes. ADS are redundant when you > > > have directories and files, whilst an atomic key-value store is > > > something completely different. > > > > > > You do realise we have an independent, scalable, ACID compliant > > > key-value object store in every inode in an XFS filesystem, right? > > > > So far this was a really mail with good background information but I'm > > struggling to make sense of what that last sentence is trying to tell > > me. :) > > That people in the past have built large scale data storage > applications that use XFS inodes as key based object stores, not as > a offset based data stream. Who needs atomic write() functionality > when you have ACID set and replace operations for named objects? > > The reality is that modern filesystems are really just btree based > object stores with high performance transaction engines overlaid > with a POSIX wrapper. And in the case of xattrs, we effectively > expose that btree based key-value database functionality directly to > userspace.... > > Stop thinking like xattrs are some useless metadata side channel, I did say they are a "metadata side channel"; I very much never called them "useless". (I don't like it a lot that a subset of them - posix acls - store additional {g,u}id information on disk but that's a totally different topic.) > and start thinking of them as an atomic object store that stores and > retreives millions of small (< 1/2 the filesystem block size) named > objects far space effciently than a directory structure full of > small files indexed by object hash. What you outline is the perspective of a probably +20 years kernel and very low-level system software developer. I appreciate that perspective and I can very much appreciate that this is how we can conceptualize xattrs in the kernel. But on the other side of the equation is userspace and it can't be the expectation that this is how they conceptualize the *xattr() kernel interfaces based on the manpage or system call. This is so generic that it becomes meaningless. By this definition we can just start retrieving any type of information via this interface all across the kernel. And you'd probably argue that in principle we could. But then we can also just go back to calling it getvalues(). I think that's another part of the disconnect between our viewpoints: you perceive the *xattr() interfaces as a super generic retrieval method for an abstract key=value storage. And so adding anything in there is fine as long as it has key=value semantics. I find that to be a shift in how the *xattr() interfaces are currently used and even how they are defined. I think we've pretty much tried to explain our viewpoints in detail and we don't seem to be getting to common ground for some of it and as I said somewhere else in the thread I have no intention of blocking this if people want this interface including making this a direct part of getxattr() and Linus accepts it.
Hi Miklos and anyone interested in this proposal, is there any update on this? Sorry that I didn't find any.. Thanks & Best regards, Abel On 5/3/22 8:23 PM, Miklos Szeredi wrote: > This is a simplification of the getvalues(2) prototype and moving it to the > getxattr(2) interface, as suggested by Dave. > > The patch itself just adds the possibility to retrieve a single line of > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo > patchset grew out of). > > But this should be able to serve Amir's per-sb iostats, as well as a host of > other cases where some statistic needs to be retrieved from some object. Note: > a filesystem object often represents other kinds of objects (such as processes > in /proc) so this is not limited to fs attributes. > > This also opens up the interface to setting attributes via setxattr(2). >
On Mon, 14 Nov 2022 at 10:00, Abel Wu <wuyun.abel@bytedance.com> wrote: > > Hi Miklos and anyone interested in this proposal, is there any update on > this? Sorry that I didn't find any.. No update. Which part are you interested in? Getting mount attributes? Or a generic key-value retrieval and storage interface? For the first one there are multiple proposals, one of them is adding a new system call using binary structs. The fsinfo(2) syscall was deemed overdesigned and rejected. Something simpler would probably be fairly uncontroversial. As for the other proposal it seems like some people would prefer a set of new syscalls, while some others would like to reuse the xattr syscalls. No agreement seems to have been reached. Also I think a notification system for mount related events is also a much needed component. I've tried to explore using the fsnotify framework for this, but the code is pretty convoluted and I couldn't get prototype working. Thanks, Miklos
On 11/14/22 8:35 PM, Miklos Szeredi wrote: > On Mon, 14 Nov 2022 at 10:00, Abel Wu <wuyun.abel@bytedance.com> wrote: >> >> Hi Miklos and anyone interested in this proposal, is there any update on >> this? Sorry that I didn't find any.. > > No update. > > Which part are you interested in? We noticed that atop(1) can introduce a burst cpu usage once number of processes becoming large. It is mostly due to the overhead of massive syscalls. There are similar cases like monitor agents recording system status and consuming resources in modern data centers. So it would be nice to get a bunch of info in one syscall. > > Getting mount attributes? Or a generic key-value retrieval and > storage interface? The latter. > > For the first one there are multiple proposals, one of them is adding > a new system call using binary structs. The fsinfo(2) syscall was > deemed overdesigned and rejected. Something simpler would probably be > fairly uncontroversial. > > As for the other proposal it seems like some people would prefer a set > of new syscalls, while some others would like to reuse the xattr > syscalls. No agreement seems to have been reached. So the divergence comes from 'how' rather than 'why', right? Thanks & Best, Abel > > Also I think a notification system for mount related events is also a > much needed component. I've tried to explore using the fsnotify > framework for this, but the code is pretty convoluted and I couldn't > get prototype working. > > Thanks, > Miklos
--- a/fs/Makefile +++ b/fs/Makefile @@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table. pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ - kernel_read_file.o remap_range.o + kernel_read_file.o remap_range.o values.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o direct-io.o mpage.o --- a/fs/mount.h +++ b/fs/mount.h @@ -148,3 +148,11 @@ static inline bool is_anon_ns(struct mnt } extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); + +struct mount *mnt_list_next(struct mnt_namespace *ns, struct list_head *p); +extern void namespace_lock_read(void); +extern void namespace_unlock_read(void); +extern int show_mountinfo_root(struct seq_file *m, struct vfsmount *mnt, + struct path *root); +extern bool is_path_reachable(struct mount *, struct dentry *, + const struct path *root); --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1332,9 +1332,7 @@ struct vfsmount *mnt_clone_internal(cons return &p->mnt; } -#ifdef CONFIG_PROC_FS -static struct mount *mnt_list_next(struct mnt_namespace *ns, - struct list_head *p) +struct mount *mnt_list_next(struct mnt_namespace *ns, struct list_head *p) { struct mount *mnt, *ret = NULL; @@ -1351,6 +1349,7 @@ static struct mount *mnt_list_next(struc return ret; } +#ifdef CONFIG_PROC_FS /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { @@ -1507,6 +1506,16 @@ static inline void namespace_lock(void) down_write(&namespace_sem); } +void namespace_lock_read(void) +{ + down_read(&namespace_sem); +} + +void namespace_unlock_read(void) +{ + up_read(&namespace_sem); +} + enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, --- a/fs/pnode.h +++ b/fs/pnode.h @@ -50,7 +50,5 @@ void mnt_set_mountpoint(struct mount *, void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt); struct mount *copy_tree(struct mount *, struct dentry *, int); -bool is_path_reachable(struct mount *, struct dentry *, - const struct path *root); int count_mounts(struct mnt_namespace *ns, struct mount *mnt); #endif /* _LINUX_PNODE_H */ --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -132,9 +132,9 @@ static int show_vfsmnt(struct seq_file * return err; } -static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) +int show_mountinfo_root(struct seq_file *m, struct vfsmount *mnt, + struct path *root) { - struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -152,7 +152,7 @@ static int show_mountinfo(struct seq_fil seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ - err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); + err = seq_path_root(m, &mnt_path, root, " \t\n\\"); if (err) goto out; @@ -164,7 +164,7 @@ static int show_mountinfo(struct seq_fil seq_printf(m, " shared:%i", r->mnt_group_id); if (IS_MNT_SLAVE(r)) { int master = r->mnt_master->mnt_group_id; - int dom = get_dominating_id(r, &p->root); + int dom = get_dominating_id(r, root); seq_printf(m, " master:%i", master); if (dom && dom != master) seq_printf(m, " propagate_from:%i", dom); @@ -194,6 +194,13 @@ static int show_mountinfo(struct seq_fil return err; } +static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) +{ + struct proc_mounts *p = m->private; + + return show_mountinfo_root(m, mnt, &p->root); +} + static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) { struct proc_mounts *p = m->private; --- /dev/null +++ b/fs/values.c @@ -0,0 +1,242 @@ +#include <linux/values.h> +#include <linux/fs_struct.h> +#include <linux/seq_file.h> +#include <linux/nsproxy.h> +#include "../lib/kstrtox.h" +#include "mount.h" + +struct val_string { + const char *str; + size_t len; +}; + +struct val_iter { + struct val_string name; + struct seq_file seq; + int error; +}; + +struct val_desc { + struct val_string name; + union { + u64 idx; + int (*get)(struct val_iter *vi, const struct path *path); + }; +}; + +#define VAL_STRING(x) { .str = x, .len = sizeof(x) - 1 } +#define VD_NAME(x) .name = VAL_STRING(x) + +static int val_err(struct val_iter *vi, int err) +{ + vi->error = err; + return 0; +} + +static int val_end_seq(struct val_iter *vi) +{ + if (vi->seq.count == vi->seq.size) + return -EOVERFLOW; + + return 0; +} + +static inline void val_string_skip(struct val_string *s, size_t count) +{ + WARN_ON(s->len < count); + s->str += count; + s->len -= count; +} + +static bool val_string_prefix(const struct val_string *p, + const struct val_string *s) +{ + return s->len >= p->len && !memcmp(s->str, p->str, p->len); +} + +static struct val_desc *val_lookup(struct val_iter *vi, struct val_desc *vd) +{ + for (; vd->name.len; vd++) { + if (val_string_prefix(&vd->name, &vi->name)) { + val_string_skip(&vi->name, vd->name.len); + break; + } + } + return vd; +} + +static int val_get_group(struct val_iter *vi, struct val_desc *vd) +{ + for (; vd->name.len; vd++) + seq_write(&vi->seq, vd->name.str, vd->name.len + 1); + + return val_end_seq(vi); +} + +enum { + VAL_MNT_INFO, +}; + +static struct val_desc val_mnt_group[] = { + { VD_NAME("info"), .idx = VAL_MNT_INFO }, + { } +}; + +static int val_mnt_show(struct val_iter *vi, struct vfsmount *mnt) +{ + struct val_desc *vd = val_lookup(vi, val_mnt_group); + struct path root; + + if (!vd->name.str) + return val_err(vi, -ENOENT); + + switch(vd->idx) { + case VAL_MNT_INFO: + get_fs_root(current->fs, &root); + show_mountinfo_root(&vi->seq, mnt, &root); + path_put(&root); + break; + } + + return 0; +} + +static int val_mnt_get(struct val_iter *vi, const struct path *path) +{ + int err; + + if (!vi->name.len) + return val_get_group(vi, val_mnt_group); + + namespace_lock_read(); + err = val_mnt_show(vi, path->mnt); + namespace_unlock_read(); + + return err; +} + +/* called with namespace_sem held for read */ +static struct vfsmount *mnt_lookup_by_id(struct mnt_namespace *ns, + struct path *root, int id) +{ + struct mount *m; + + for (m = mnt_list_next(ns, &ns->list); m; m = mnt_list_next(ns, &m->mnt_list)) { + if (m->mnt_id == id) { + if (is_path_reachable(m, m->mnt.mnt_root, root)) + return mntget(&m->mnt); + else + return NULL; + } + } + return NULL; +} + +static void seq_mnt_list(struct seq_file *seq, struct mnt_namespace *ns, + struct path *root) +{ + struct mount *m; + + namespace_lock_read(); + for (m = mnt_list_next(ns, &ns->list); m; m = mnt_list_next(ns, &m->mnt_list)) { + if (is_path_reachable(m, m->mnt.mnt_root, root)) { + seq_printf(seq, "%i:", m->mnt_id); + seq_putc(seq, '\0'); + } + } + namespace_unlock_read(); +} + +static int val_mntns_get(struct val_iter *vi, const struct path *path) +{ + struct mnt_namespace *mnt_ns = current->nsproxy->mnt_ns; + struct vfsmount *mnt; + struct path root; + unsigned long long mnt_id; + unsigned int end; + int err; + + if (!vi->name.len) { + get_fs_root(current->fs, &root); + seq_mnt_list(&vi->seq, mnt_ns, &root); + path_put(&root); + return val_end_seq(vi); + } + + end = _parse_integer(vi->name.str, 10, &mnt_id); + if (end & KSTRTOX_OVERFLOW) + return val_err(vi, -ENOENT); + if (vi->name.str[end] != VAL_SEP) + return val_err(vi, -ENOENT); + val_string_skip(&vi->name, end + 1); + + namespace_lock_read(); + get_fs_root(current->fs, &root); + mnt = mnt_lookup_by_id(mnt_ns, &root, mnt_id); + path_put(&root); + if (!mnt) { + namespace_unlock_read(); + return val_err(vi, -ENOENT); + } + if (vi->name.len) + err = val_mnt_show(vi, mnt); + else + err = val_get_group(vi, val_mnt_group); + + namespace_unlock_read(); + mntput(mnt); + + return err; +} + + + +static struct val_desc val_toplevel_group[] = { + { VD_NAME("mnt:"), .get = val_mnt_get, }, + { VD_NAME("mntns:"), .get = val_mntns_get, }, + { }, +}; + +static int getvalues(struct val_iter *vi, const struct path *path) +{ + struct val_desc *vd; + int err; + + if (!vi->name.len) + return val_get_group(vi, val_toplevel_group); + + vd = val_lookup(vi, val_toplevel_group); + if (!vd->name.len) + err = val_err(vi, -ENOENT); + else + err = vd->get(vi, path); + + return err ?: vi->error; +} + +ssize_t val_getxattr(struct path *path, const char *name, size_t namelen, + void __user *value, size_t size) +{ + int err; + char val[1024]; + struct val_iter vi = { + .name = { .str = name, .len = namelen }, + .seq = { .buf = val, .size = min(sizeof(val), size) }, + }; + + if (!size) + return sizeof(val); + + val_string_skip(&vi.name, 1); + + err = getvalues(&vi, path); + if (err < 0) + return err; + + WARN_ON(vi.seq.count > size); + if (copy_to_user(value, vi.seq.buf, vi.seq.count)) + return -EFAULT; + + return vi.seq.count; +} + --- a/fs/xattr.c +++ b/fs/xattr.c @@ -22,6 +22,7 @@ #include <linux/audit.h> #include <linux/vmalloc.h> #include <linux/posix_acl_xattr.h> +#include <linux/values.h> #include <linux/uaccess.h> @@ -643,12 +644,13 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, cons * Extended attribute GET operations */ static ssize_t -getxattr(struct user_namespace *mnt_userns, struct dentry *d, - const char __user *name, void __user *value, size_t size) +getxattr(struct path *path, const char __user *name, + void __user *value, size_t size) { ssize_t error; void *kvalue = NULL; char kname[XATTR_NAME_MAX + 1]; + struct user_namespace *mnt_userns = mnt_user_ns(path->mnt); error = strncpy_from_user(kname, name, sizeof(kname)); if (error == 0 || error == sizeof(kname)) @@ -656,6 +658,9 @@ getxattr(struct user_namespace *mnt_user if (error < 0) return error; + if (kname[0] == VAL_SEP) + return val_getxattr(path, kname, error, value, size); + if (size) { if (size > XATTR_SIZE_MAX) size = XATTR_SIZE_MAX; @@ -664,7 +669,7 @@ getxattr(struct user_namespace *mnt_user return -ENOMEM; } - error = vfs_getxattr(mnt_userns, d, kname, kvalue, size); + error = vfs_getxattr(mnt_userns, path->dentry, kname, kvalue, size); if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) @@ -693,7 +698,7 @@ static ssize_t path_getxattr(const char error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; - error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size); + error = getxattr(&path, name, value, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -723,8 +728,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, cons if (!f.file) return error; audit_file(f.file); - error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry, - name, value, size); + error = getxattr(&f.file->f_path, name, value, size); fdput(f); return error; } --- /dev/null +++ b/include/linux/values.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/types.h> + +#define VAL_SEP ':' + +struct path; + +ssize_t val_getxattr(struct path *path, const char *name, size_t namelen, + void __user *value, size_t size); +