Message ID | 20230928130147.564503-5-mszeredi@redhat.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Paul Moore |
Headers | show |
Series | querying mount attributes | expand |
On Thu, Sep 28, 2023 at 9:04 AM Miklos Szeredi <mszeredi@redhat.com> wrote: > > Add way to query the children of a particular mount. This is a more > flexible way to iterate the mount tree than having to parse the complete > /proc/self/mountinfo. > > Lookup the mount by the new 64bit mount ID. If a mount needs to be queried > based on path, then statx(2) can be used to first query the mount ID > belonging to the path. > > Return an array of new (64bit) mount ID's. Without privileges only mounts > are listed which are reachable from the task's root. > > Signed-off-by: Miklos Szeredi <mszeredi@redhat.com> > --- > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > fs/namespace.c | 69 ++++++++++++++++++++++++++ > include/linux/syscalls.h | 3 ++ > include/uapi/asm-generic/unistd.h | 5 +- > include/uapi/linux/mount.h | 3 ++ > 6 files changed, 81 insertions(+), 1 deletion(-) ... > diff --git a/fs/namespace.c b/fs/namespace.c > index 3326ba2b2810..050e2d2af110 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -4970,6 +4970,75 @@ SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req, > return ret; > } > > +static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize, > + const struct path *root, unsigned int flags) > +{ > + struct mount *r, *m = real_mount(mnt); > + struct path rootmnt = { > + .mnt = root->mnt, > + .dentry = root->mnt->mnt_root > + }; > + long ctr = 0; > + bool reachable_only = true; > + int err; > + > + err = security_sb_statfs(mnt->mnt_root); > + if (err) > + return err; > + > + if (flags & LISTMOUNT_UNREACHABLE) { > + if (!capable(CAP_SYS_ADMIN)) > + return -EPERM; > + reachable_only = false; > + } > + > + if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt)) > + return capable(CAP_SYS_ADMIN) ? 
0 : -EPERM; > + > + list_for_each_entry(r, &m->mnt_mounts, mnt_child) { > + if (reachable_only && > + !is_path_reachable(r, r->mnt.mnt_root, root)) > + continue; I believe we would want to move the security_sb_statfs() call from above to down here; something like this I think ... err = security_sb_statfs(r->mnt.mnt_root); if (err) /* if we can't access the mount, pretend it doesn't exist */ continue; > + if (ctr >= bufsize) > + return -EOVERFLOW; > + if (put_user(r->mnt_id_unique, buf + ctr)) > + return -EFAULT; > + ctr++; > + if (ctr < 0) > + return -ERANGE; > + } > + return ctr; > +}
On Wed, 4 Oct 2023 at 21:38, Paul Moore <paul@paul-moore.com> wrote: > > On Thu, Sep 28, 2023 at 9:04 AM Miklos Szeredi <mszeredi@redhat.com> wrote: > > > > Add way to query the children of a particular mount. This is a more > > flexible way to iterate the mount tree than having to parse the complete > > /proc/self/mountinfo. > > > > Lookup the mount by the new 64bit mount ID. If a mount needs to be queried > > based on path, then statx(2) can be used to first query the mount ID > > belonging to the path. > > > > Return an array of new (64bit) mount ID's. Without privileges only mounts > > are listed which are reachable from the task's root. > > > > Signed-off-by: Miklos Szeredi <mszeredi@redhat.com> > > --- > > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > > fs/namespace.c | 69 ++++++++++++++++++++++++++ > > include/linux/syscalls.h | 3 ++ > > include/uapi/asm-generic/unistd.h | 5 +- > > include/uapi/linux/mount.h | 3 ++ > > 6 files changed, 81 insertions(+), 1 deletion(-) > > ... 
> > > diff --git a/fs/namespace.c b/fs/namespace.c > > index 3326ba2b2810..050e2d2af110 100644 > > --- a/fs/namespace.c > > +++ b/fs/namespace.c > > @@ -4970,6 +4970,75 @@ SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req, > > return ret; > > } > > > > +static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize, > > + const struct path *root, unsigned int flags) > > +{ > > + struct mount *r, *m = real_mount(mnt); > > + struct path rootmnt = { > > + .mnt = root->mnt, > > + .dentry = root->mnt->mnt_root > > + }; > > + long ctr = 0; > > + bool reachable_only = true; > > + int err; > > + > > + err = security_sb_statfs(mnt->mnt_root); > > + if (err) > > + return err; > > + > > + if (flags & LISTMOUNT_UNREACHABLE) { > > + if (!capable(CAP_SYS_ADMIN)) > > + return -EPERM; > > + reachable_only = false; > > + } > > + > > + if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt)) > > + return capable(CAP_SYS_ADMIN) ? 0 : -EPERM; > > + > > + list_for_each_entry(r, &m->mnt_mounts, mnt_child) { > > + if (reachable_only && > > + !is_path_reachable(r, r->mnt.mnt_root, root)) > > + continue; > > I believe we would want to move the security_sb_statfs() call from > above to down here; something like this I think ... > > err = security_sb_statfs(r->mnt.mnt_root); > if (err) > /* if we can't access the mount, pretend it doesn't exist */ > continue; Hmm. Why is this specific to listing mounts (i.e. why doesn't readdir have a similar filter)? Also why hasn't this come up with regards to the proc interfaces that list mounts? I just want to understand the big picture here. Thanks, Miklos
On 5/10/23 12:01, Miklos Szeredi wrote: > On Wed, 4 Oct 2023 at 21:38, Paul Moore <paul@paul-moore.com> wrote: >> On Thu, Sep 28, 2023 at 9:04 AM Miklos Szeredi <mszeredi@redhat.com> wrote: >>> Add way to query the children of a particular mount. This is a more >>> flexible way to iterate the mount tree than having to parse the complete >>> /proc/self/mountinfo. >>> >>> Lookup the mount by the new 64bit mount ID. If a mount needs to be queried >>> based on path, then statx(2) can be used to first query the mount ID >>> belonging to the path. >>> >>> Return an array of new (64bit) mount ID's. Without privileges only mounts >>> are listed which are reachable from the task's root. >>> >>> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com> >>> --- >>> arch/x86/entry/syscalls/syscall_32.tbl | 1 + >>> arch/x86/entry/syscalls/syscall_64.tbl | 1 + >>> fs/namespace.c | 69 ++++++++++++++++++++++++++ >>> include/linux/syscalls.h | 3 ++ >>> include/uapi/asm-generic/unistd.h | 5 +- >>> include/uapi/linux/mount.h | 3 ++ >>> 6 files changed, 81 insertions(+), 1 deletion(-) >> ... 
>> >>> diff --git a/fs/namespace.c b/fs/namespace.c >>> index 3326ba2b2810..050e2d2af110 100644 >>> --- a/fs/namespace.c >>> +++ b/fs/namespace.c >>> @@ -4970,6 +4970,75 @@ SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req, >>> return ret; >>> } >>> >>> +static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize, >>> + const struct path *root, unsigned int flags) >>> +{ >>> + struct mount *r, *m = real_mount(mnt); >>> + struct path rootmnt = { >>> + .mnt = root->mnt, >>> + .dentry = root->mnt->mnt_root >>> + }; >>> + long ctr = 0; >>> + bool reachable_only = true; >>> + int err; >>> + >>> + err = security_sb_statfs(mnt->mnt_root); >>> + if (err) >>> + return err; >>> + >>> + if (flags & LISTMOUNT_UNREACHABLE) { >>> + if (!capable(CAP_SYS_ADMIN)) >>> + return -EPERM; >>> + reachable_only = false; >>> + } >>> + >>> + if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt)) >>> + return capable(CAP_SYS_ADMIN) ? 0 : -EPERM; >>> + >>> + list_for_each_entry(r, &m->mnt_mounts, mnt_child) { >>> + if (reachable_only && >>> + !is_path_reachable(r, r->mnt.mnt_root, root)) >>> + continue; >> I believe we would want to move the security_sb_statfs() call from >> above to down here; something like this I think ... >> >> err = security_sb_statfs(r->mnt.mnt_root); >> if (err) >> /* if we can't access the mount, pretend it doesn't exist */ >> continue; > Hmm. Why is this specific to listing mounts (i.e. why doesn't readdir > have a similar filter)? > > Also why hasn't this come up with regards to the proc interfaces that > list mounts? The proc interfaces essentially use <mount namespace>->list to provide the mounts that can be seen so it's filtered by mount namespace of the task that's doing the open(). See fs/namespace.c:mnt_list_next() and just below the m_start(), m_next(), etc. Ian > > I just want to understand the big picture here. > > Thanks, > Miklos
On Thu, 5 Oct 2023 at 06:23, Ian Kent <raven@themaw.net> wrote: > The proc interfaces essentially use <mount namespace>->list to provide > > the mounts that can be seen so it's filtered by mount namespace of the > > task that's doing the open(). > > > See fs/namespace.c:mnt_list_next() and just below the m_start(), m_next(), /proc/$PID/mountinfo will list the mount namespace of $PID. Whether current task has permission to do so is decided at open time. listmount() will list the children of the given mount ID. The mount ID is looked up in the task's mount namespace, so this cannot be used to list mounts of other namespaces. It's a more limited interface. I sort of understand the reasoning behind calling into a security hook on entry to statmount() and listmount(). And BTW I also think that if statmount() and listmount() is limited in this way, then the same limitation should be applied to the proc interfaces. But that needs to be done real carefully because it might cause regressions. OTOH if it's only done on the new interfaces, then what is the point, since the old interfaces will be available indefinitely? Also I cannot see the point in hiding some mount ID's from the list. It seems to me that the list is just an array of numbers that in itself doesn't carry any information. Thanks, Miklos
On 5/10/23 23:47, Miklos Szeredi wrote: > On Thu, 5 Oct 2023 at 06:23, Ian Kent <raven@themaw.net> wrote: > >> The proc interfaces essentially use <mount namespace>->list to provide >> >> the mounts that can be seen so it's filtered by mount namespace of the >> >> task that's doing the open(). >> >> >> See fs/namespace.c:mnt_list_next() and just below the m_start(), m_next(), > /proc/$PID/mountinfo will list the mount namespace of $PID. Whether > current task has permission to do so is decided at open time. > > listmount() will list the children of the given mount ID. The mount > ID is looked up in the task's mount namespace, so this cannot be used > to list mounts of other namespaces. It's a more limited interface. Yep. But isn't the ability to see these based on task privilege? Is the proc style restriction actually what we need here (or some variation of that implementation)? A privileged task typically has the init namespace as its mount namespace and mounts should propagate from there so it should be able to see all mounts. If the file handle has been opened in a task that is using some other mount namespace then presumably that's what the program author wants the task to see. So I'm not sure I see a problem obeying the namespace of a given task. Ian > > I sort of understand the reasoning behind calling into a security hook > on entry to statmount() and listmount(). And BTW I also think that if > statmount() and listmount() is limited in this way, then the same > limitation should be applied to the proc interfaces. But that needs > to be done real carefully because it might cause regressions. OTOH if > it's only done on the new interfaces, then what is the point, since > the old interfaces will be available indefinitely? > > Also I cannot see the point in hiding some mount ID's from the list. > It seems to me that the list is just an array of numbers that in > itself doesn't carry any information. > > Thanks, > Miklos
On Thu, Oct 5, 2023 at 11:47 AM Miklos Szeredi <miklos@szeredi.hu> wrote: > On Thu, 5 Oct 2023 at 06:23, Ian Kent <raven@themaw.net> wrote: > > The proc interfaces essentially use <mount namespace>->list to provide > > > > the mounts that can be seen so it's filtered by mount namespace of the > > > > task that's doing the open(). > > > > > > See fs/namespace.c:mnt_list_next() and just below the m_start(), m_next(), > > /proc/$PID/mountinfo will list the mount namespace of $PID. Whether > current task has permission to do so is decided at open time. > > listmount() will list the children of the given mount ID. The mount > ID is looked up in the task's mount namespace, so this cannot be used > to list mounts of other namespaces. It's a more limited interface. > > I sort of understand the reasoning behind calling into a security hook > on entry to statmount() and listmount(). And BTW I also think that if > statmount() and listmount() is limited in this way, then the same > limitation should be applied to the proc interfaces. But that needs > to be done real carefully because it might cause regressions. OTOH if > it's only done on the new interfaces, then what is the point, since > the old interfaces will be available indefinitely? LSMs that are designed to enforce access controls on procfs interfaces typically leverage the fact that the procfs interfaces are file based and the normal file I/O access controls can be used. In some cases, e.g. /proc/self/attr, there may also be additional access controls implemented via a dedicated set of LSM hooks. > Also I cannot see the point in hiding some mount ID's from the list. > It seems to me that the list is just an array of numbers that in > itself doesn't carry any information. I think it really comes down to the significance of the mount ID, and I can't say I know enough of the details here to be entirely comfortable taking a hard stance on this. Can you help me understand the mount ID concept a bit better? 
While I'm reasonably confident that we want a security_sb_statfs() control point in statmount(), it may turn out that we don't want/need a call in the listmount() case. Perhaps your original patch was correct in the sense that we only want a single security_sb_statfs() call for the root (implying that the child mount IDs are attributes of the root/parent mount)? Maybe it's something else entirely?
On Fri, 6 Oct 2023 at 04:56, Paul Moore <paul@paul-moore.com> wrote: > > Also I cannot see the point in hiding some mount ID's from the list. > > It seems to me that the list is just an array of numbers that in > > itself doesn't carry any information. > > I think it really comes down to the significance of the mount ID, and > I can't say I know enough of the details here to be entirely > comfortable taking a hard stance on this. Can you help me understand > the mount ID concept a bit better? Mount ID is a descriptor that allows referring to a specific struct mount from userspace. The old 32 bit mount id is allocated with IDA from a global pool. Because it's non-referencing it doesn't allow uniquely identifying a mount. That was a design mistake that I made back in 2008, thinking that the same sort of dense descriptor space as used for file descriptors would work. Originally it was used to identify the mount and the parent mount in /proc/PID/mountinfo. Later it was also added to the following interfaces: - name_to_handle_at(2) returns 32 bit value - /proc/PID/FD/fdinfo - statx(2) returns 64 bit value It was never used on the kernel interfaces as an input argument. statmount(2) and listmount(2) require the mount to be identified by userspace, so having a unique ID is important. So the "[1/4] add unique mount ID" adds a new 64 bit ID (still global) that is allocated sequentially and only reused after reboot. It is used as an input to these syscalls. It is returned by statx(2) if requested by STATX_MNT_ID_UNIQUE and as an array of ID's by listmount(2). I can see mild security problems with the global allocation, since a task can observe mounts being done in other namespaces. This doesn't sound too serious, and the old ID has similar issues. But I think making the new ID be local to the mount namespace is also feasible. 
> While I'm reasonably confident that we want a security_sb_statfs() > control point in statmount(), it may turn out that we don't want/need > a call in the listmount() case. Perhaps your original patch was > correct in the sense that we only want a single security_sb_statfs() > call for the root (implying that the child mount IDs are attributes of > the root/parent mount)? Maybe it's something else entirely? Mounts are arranged in a tree (I think it's obvious how) and listmount(2) just lists the IDs of the immediate children of a mount. I don't see the ID as being an attribute of a mount; it's a descriptor. Thanks, Miklos
On Fri, Oct 6, 2023 at 4:53 AM Miklos Szeredi <miklos@szeredi.hu> wrote: > On Fri, 6 Oct 2023 at 04:56, Paul Moore <paul@paul-moore.com> wrote: > > > > Also I cannot see the point in hiding some mount ID's from the list. > > > It seems to me that the list is just an array of numbers that in > > > itself doesn't carry any information. > > > > I think it really comes down to the significance of the mount ID, and > > I can't say I know enough of the details here to be entirely > > comfortable taking a hard stance on this. Can you help me understand > > the mount ID concept a bit better? > > Mount ID is a descriptor that allows referring to a specific struct > mount from userspace. > > The old 32 bit mount id is allocated with IDA from a global pool. > Because it's non-referencing it doesn't allow uniquely identifying a > mount. That was a design mistake that I made back in 2008, thinking > that the same sort of dense descriptor space as used for file > descriptors would work. Originally it was used to identify the mount > and the parent mount in /proc/PID/mountinfo. Later it was also added > to the following interfaces: > > - name_to_handle_at(2) returns 32 bit value > - /proc/PID/FD/fdinfo > - statx(2) returns 64 bit value > > It was never used on the kernel interfaces as an input argument. Thanks for the background. > statmount(2) and listmount(2) require the mount to be identified by > userspace, so having a unique ID is important. So the "[1/4] add > unique mount ID" adds a new 64 bit ID (still global) that is allocated > sequentially and only reused after reboot. It is used as an input to > these syscalls. It is returned by statx(2) if requested by > STATX_MNT_ID_UNIQUE and as an array of ID's by listmount(2). > > I can see mild security problems with the global allocation, since a > task can observe mounts being done in other namespaces. This doesn't > sound too serious, and the old ID has similar issues. 
But I think > making the new ID be local to the mount namespace is also feasible. The LSM hook API is designed to operate independently from any of the kernel namespaces; while some LSMs may choose to be aware of namespaces and adjust their controls accordingly, there is no requirement that they do so. For that reason, I'm not too bothered either way if the mount ID is global or tied to a namespace. > > While I'm reasonably confident that we want a security_sb_statfs() > > control point in statmount(), it may turn out that we don't want/need > > a call in the listmount() case. Perhaps your original patch was > > correct in the sense that we only want a single security_sb_statfs() > > call for the root (implying that the child mount IDs are attributes of > > the root/parent mount)? Maybe it's something else entirely? > > Mounts are arranged in a tree (I think it obvious how) and > listmount(2) just lists the IDs of the immediate children of a mount. > > I don't see ID being an attribute of a mount, it's a descriptor. In this case I think the approach you took originally in this thread is likely what we want: call security_sb_statfs() against the root mount in listmount(). Just please move it after the capability checks. If you look at the two LSMs which implement the security_sb_statfs() hook, Smack and SELinux, you see that Smack treats this as a read operation between the current process and the specified mount and SELinux treats this as a filesystem:getattr operation between the current process and the specified mount. In both cases I can see that being the right approach for reading a list of child mounts off of a root mount. Does that sound good? I'm guessing that's okay since that was how you wrote it in your original patch, but there has been a lot of discussion since then :)
On 6/10/23 08:27, Ian Kent wrote: > On 5/10/23 23:47, Miklos Szeredi wrote: >> On Thu, 5 Oct 2023 at 06:23, Ian Kent <raven@themaw.net> wrote: >> >>> The proc interfaces essentially use <mount namespace>->list to provide >>> >>> the mounts that can be seen so it's filtered by mount namespace of the >>> >>> task that's doing the open(). >>> >>> >>> See fs/namespace.c:mnt_list_next() and just below the m_start(), >>> m_next(), >> /proc/$PID/mountinfo will list the mount namespace of $PID. Whether >> current task has permission to do so is decided at open time. >> >> listmount() will list the children of the given mount ID. The mount >> ID is looked up in the task's mount namespace, so this cannot be used >> to list mounts of other namespaces. It's a more limited interface. > > Yep. But isn't the ability to see these based on task privilege? > > > Is the proc style restriction actually what we need here (or some > variation > > of that implementation)? > > > An privileged task typically has the init namespace as its mount > namespace > > and mounts should propagate from there so it should be able to see all > mounts. > > > If the file handle has been opened in a task that is using some other > mount > > namespace then presumably that's what the program author wants the > task to see. > > So I'm not sure I see a problem obeying the namespace of a given task. I've had a look through the code we had in the old fsinfo() proposal because I think we need to consider the use cases that are needed. IIRC initially we had a flag FSINFO_ATTR_MOUNT_CHILDREN that essentially enumerated the children of the given mount in much the same way as is done now in this system call. But because we needed to enumerate mounts in the same way as the proc file system mount tables a flag FSINFO_ATTR_MOUNT_ALL was added that essentially used the mount namespace mounts list in a similar way to the proc file system so that a list of mounts for a mount namespace could be retrieved. 
This latter use case is what is used by processes that monitor mounts and is what's needed more so than enumerating the children as we do now. I'm still looking at the mount id lookup. Ian > > > Ian > >> >> I sort of understand the reasoning behind calling into a security hook >> on entry to statmount() and listmount(). And BTW I also think that if >> statmount() and listmount() is limited in this way, then the same >> limitation should be applied to the proc interfaces. But that needs >> to be done real carefully because it might cause regressions. OTOH if >> it's only done on the new interfaces, then what is the point, since >> the old interfaces will be available indefinitely? >> >> Also I cannot see the point in hiding some mount ID's from the list. >> It seems to me that the list is just an array of numbers that in >> itself doesn't carry any information. >> >> Thanks, >> Miklos
On Fri, 13 Oct 2023 at 04:40, Ian Kent <raven@themaw.net> wrote: > But because we needed to enumerate mounts in the same way as the proc file > > system mount tables a flag FSINFO_ATTR_MOUNT_ALL was added that essentially > > used the mount namespace mounts list in a similar way to the proc file > > system so that a list of mounts for a mount namespace could be retrieved. Makes sense. Added a LISTMOUNT_RECURSIVE flag. Thanks, Miklos
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 317b1320ad18..65e0185b47a9 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -458,3 +458,4 @@ 451 i386 cachestat sys_cachestat 452 i386 fchmodat2 sys_fchmodat2 454 i386 statmount sys_statmount +455 i386 listmount sys_listmount diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 7312c440978f..a1b3ce7d22cc 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -376,6 +376,7 @@ 452 common fchmodat2 sys_fchmodat2 453 64 map_shadow_stack sys_map_shadow_stack 454 common statmount sys_statmount +455 common listmount sys_listmount # # Due to a historical design error, certain syscalls are numbered differently diff --git a/fs/namespace.c b/fs/namespace.c index 3326ba2b2810..050e2d2af110 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4970,6 +4970,75 @@ SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req, return ret; } +static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize, + const struct path *root, unsigned int flags) +{ + struct mount *r, *m = real_mount(mnt); + struct path rootmnt = { + .mnt = root->mnt, + .dentry = root->mnt->mnt_root + }; + long ctr = 0; + bool reachable_only = true; + int err; + + err = security_sb_statfs(mnt->mnt_root); + if (err) + return err; + + if (flags & LISTMOUNT_UNREACHABLE) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + reachable_only = false; + } + + if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt)) + return capable(CAP_SYS_ADMIN) ? 
0 : -EPERM; + + list_for_each_entry(r, &m->mnt_mounts, mnt_child) { + if (reachable_only && + !is_path_reachable(r, r->mnt.mnt_root, root)) + continue; + + if (ctr >= bufsize) + return -EOVERFLOW; + if (put_user(r->mnt_id_unique, buf + ctr)) + return -EFAULT; + ctr++; + if (ctr < 0) + return -ERANGE; + } + return ctr; +} + +SYSCALL_DEFINE4(listmount, const struct __mount_arg __user *, req, + u64 __user *, buf, size_t, bufsize, unsigned int, flags) +{ + struct __mount_arg kreq; + struct vfsmount *mnt; + struct path root; + long err; + + if (flags & ~LISTMOUNT_UNREACHABLE) + return -EINVAL; + + if (copy_from_user(&kreq, req, sizeof(kreq))) + return -EFAULT; + + down_read(&namespace_sem); + mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns); + err = -ENOENT; + if (mnt) { + get_fs_root(current->fs, &root); + err = do_listmount(mnt, buf, bufsize, &root, flags); + path_put(&root); + } + up_read(&namespace_sem); + + return err; +} + + static void __init init_mount_tree(void) { struct vfsmount *mnt; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ba371024d902..38f3da7e04d1 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -413,6 +413,9 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, asmlinkage long sys_statmount(const struct __mount_arg __user *req, struct statmnt __user *buf, size_t bufsize, unsigned int flags); +asmlinkage long sys_listmount(const struct __mount_arg __user *req, + u64 __user *buf, size_t bufsize, + unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length); #if BITS_PER_LONG == 32 diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 8f034e934a2e..8df6a747e21a 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -826,8 +826,11 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2) #define __NR_statmount 454 
__SYSCALL(__NR_statmount, sys_statmount) +#define __NR_listmount 455 +__SYSCALL(__NR_listmount, sys_listmount) + #undef __NR_syscalls -#define __NR_syscalls 455 +#define __NR_syscalls 456 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index d2c988ab526b..7aa9916659d2 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -194,4 +194,7 @@ struct __mount_arg { #define STMT_MNT_POINT 0x00000010U /* Want/got mnt_point */ #define STMT_FS_TYPE 0x00000020U /* Want/got fs_type */ +/* listmount(2) flags */ +#define LISTMOUNT_UNREACHABLE 0x01 /* List unreachable mounts too */ + #endif /* _UAPI_LINUX_MOUNT_H */
Add a way to query the children of a particular mount. This is a more flexible way to iterate the mount tree than having to parse the complete /proc/self/mountinfo. Look up the mount by the new 64-bit mount ID. If a mount needs to be queried based on path, then statx(2) can be used to first query the mount ID belonging to the path. Return an array of new (64-bit) mount IDs. Without privileges, only mounts which are reachable from the task's root are listed. Signed-off-by: Miklos Szeredi <mszeredi@redhat.com> --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/namespace.c | 69 ++++++++++++++++++++++++++ include/linux/syscalls.h | 3 ++ include/uapi/asm-generic/unistd.h | 5 +- include/uapi/linux/mount.h | 3 ++ 6 files changed, 81 insertions(+), 1 deletion(-)