Message ID | 20201013140609.2269319-2-gscrivan@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | fs, close_range: add flag CLOSE_RANGE_CLOEXEC | expand |
On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote: Hey Guiseppe, Thanks for the patch! > When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't > immediately close the files but it sets the close-on-exec bit. Hm, please expand on the use-cases a little here so people know where and how this is useful. Keeping the rationale for a change in the commit log is really important. > > Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com> > --- > fs/file.c | 56 ++++++++++++++++++++++---------- > include/uapi/linux/close_range.h | 3 ++ > 2 files changed, 42 insertions(+), 17 deletions(-) > > diff --git a/fs/file.c b/fs/file.c > index 21c0893f2f1d..ad4ebee41e09 100644 > --- a/fs/file.c > +++ b/fs/file.c > @@ -672,6 +672,17 @@ int __close_fd(struct files_struct *files, unsigned fd) > } > EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ > > +static unsigned int __get_max_fds(struct files_struct *cur_fds) > +{ > + unsigned int max_fds; > + > + rcu_read_lock(); > + /* cap to last valid index into fdtable */ > + max_fds = files_fdtable(cur_fds)->max_fds; > + rcu_read_unlock(); > + return max_fds; > +} > + > /** > * __close_range() - Close all file descriptors in a given range. > * > @@ -683,27 +694,23 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ > */ > int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) > { > - unsigned int cur_max; > + unsigned int cur_max = UINT_MAX; > struct task_struct *me = current; > struct files_struct *cur_fds = me->files, *fds = NULL; > > - if (flags & ~CLOSE_RANGE_UNSHARE) > + if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) > return -EINVAL; > > if (fd > max_fd) > return -EINVAL; > > - rcu_read_lock(); > - cur_max = files_fdtable(cur_fds)->max_fds; > - rcu_read_unlock(); > - > - /* cap to last valid index into fdtable */ > - cur_max--; > - > if (flags & CLOSE_RANGE_UNSHARE) { > int ret; > unsigned int max_unshare_fds = NR_OPEN_MAX; > > + /* cap to last valid index into fdtable */ > + cur_max = __get_max_fds(cur_fds) - 1; > + > /* > * If the requested range is greater than the current maximum, > * we're closing everything so only copy all file descriptors > @@ -724,16 +731,31 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) > swap(cur_fds, fds); > } > > - max_fd = min(max_fd, cur_max); > - while (fd <= max_fd) { > - struct file *file; > + if (flags & CLOSE_RANGE_CLOEXEC) { > + struct fdtable *fdt; > > - file = pick_file(cur_fds, fd++); > - if (!file) > - continue; > + spin_lock(&cur_fds->file_lock); > + fdt = files_fdtable(cur_fds); > + cur_max = fdt->max_fds - 1; > + max_fd = min(max_fd, cur_max); > + while (fd <= max_fd) > + __set_close_on_exec(fd++, fdt); > + spin_unlock(&cur_fds->file_lock); > + } else { > + /* Initialize cur_max if needed. */ > + if (cur_max == UINT_MAX) > + cur_max = __get_max_fds(cur_fds) - 1; The separation between how cur_fd is retrieved in the two branches makes the code more difficult to follow imho. Unless there's a clear reason why you've done it that way I would think that something like the patch I appended below might be a little clearer and easier to maintain(?). > + max_fd = min(max_fd, cur_max); > + while (fd <= max_fd) { > + struct file *file; > > - filp_close(file, cur_fds); > - cond_resched(); > + file = pick_file(cur_fds, fd++); > + if (!file) > + continue; > + > + filp_close(file, cur_fds); > + cond_resched(); > + } > } I think I don't have quarrels with this patch in principle but I wonder if something like the following wouldn't be easier to follow: diff --git a/fs/file.c b/fs/file.c index 21c0893f2f1d..872a4098c3be 100644 --- a/fs/file.c +++ b/fs/file.c @@ -672,6 +672,32 @@ int __close_fd(struct files_struct *files, unsigned fd) } EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ +static inline void __range_cloexec(struct files_struct *cur_fds, + unsigned int fd, unsigned max_fd) +{ + struct fdtable *fdt; + spin_lock(&cur_fds->file_lock); + fdt = files_fdtable(cur_fds); + while (fd <= max_fd) + __set_close_on_exec(fd++, fdt); + spin_unlock(&cur_fds->file_lock); +} + +static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, + unsigned max_fd) +{ + while (fd <= max_fd) { + struct file *file; + + file = pick_file(cur_fds, fd++); + if (!file) + continue; + + filp_close(file, cur_fds); + cond_resched(); + } +} + /** * __close_range() - Close all file descriptors in a given range. * @@ -687,7 +713,7 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; - if (flags & ~CLOSE_RANGE_UNSHARE) + if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) @@ -725,16 +751,10 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) } max_fd = min(max_fd, cur_max); - while (fd <= max_fd) { - struct file *file; - - file = pick_file(cur_fds, fd++); - if (!file) - continue; - - filp_close(file, cur_fds); - cond_resched(); - } + if (flags & CLOSE_RANGE_CLOEXEC) + __range_cloexec(cur_fds, fd, max_fd); + else + __range_close(cur_fds, fd, max_fd); if (fds) { /* diff --git a/include/uapi/linux/close_range.h b/include/uapi/linux/close_range.h index 6928a9fdee3c..2d804281554c 100644 --- a/include/uapi/linux/close_range.h +++ b/include/uapi/linux/close_range.h @@ -5,5 +5,8 @@ /* Unshare the file descriptor table before closing file descriptors. */ #define CLOSE_RANGE_UNSHARE (1U << 1) +/* Set the FD_CLOEXEC bit instead of closing the file descriptor. */ +#define CLOSE_RANGE_CLOEXEC (1U << 2) + #endif /* _UAPI_LINUX_CLOSE_RANGE_H */
On 13/10/2020 22.54, Christian Brauner wrote: > On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote: > > Hey Guiseppe, > > Thanks for the patch! > >> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't >> immediately close the files but it sets the close-on-exec bit. > > Hm, please expand on the use-cases a little here so people know where > and how this is useful. Keeping the rationale for a change in the commit > log is really important. > > I think I don't have quarrels with this patch in principle but I wonder > if something like the following wouldn't be easier to follow: > > diff --git a/fs/file.c b/fs/file.c > index 21c0893f2f1d..872a4098c3be 100644 > --- a/fs/file.c > +++ b/fs/file.c > @@ -672,6 +672,32 @@ int __close_fd(struct files_struct *files, unsigned fd) > } > EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ > > +static inline void __range_cloexec(struct files_struct *cur_fds, > + unsigned int fd, unsigned max_fd) > +{ > + struct fdtable *fdt; > + spin_lock(&cur_fds->file_lock); > + fdt = files_fdtable(cur_fds); > + while (fd <= max_fd) > + __set_close_on_exec(fd++, fdt); Doesn't that want to be bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1) to do word-at-a-time? I assume this would mostly be called with (3, ~0U) as arguments or something like that. Rasmus
On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote: > + spin_lock(&cur_fds->file_lock); > + fdt = files_fdtable(cur_fds); > + cur_max = fdt->max_fds - 1; > + max_fd = min(max_fd, cur_max); > + while (fd <= max_fd) > + __set_close_on_exec(fd++, fdt); > + spin_unlock(&cur_fds->file_lock); First of all, this is an atrocious way to set all bits in a range. What's more, you don't want to set it for *all* bits - only for the ones present in open bitmap. It's probably harmless at the moment, but let's not create interesting surprises for the future.
On Tue, Oct 13, 2020 at 11:04:21PM +0200, Rasmus Villemoes wrote: > On 13/10/2020 22.54, Christian Brauner wrote: > > On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote: > > > > Hey Guiseppe, > > > > Thanks for the patch! > > > >> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't > >> immediately close the files but it sets the close-on-exec bit. > > > > Hm, please expand on the use-cases a little here so people know where > > and how this is useful. Keeping the rationale for a change in the commit > > log is really important. > > > > > I think I don't have quarrels with this patch in principle but I wonder > > if something like the following wouldn't be easier to follow: > > > > diff --git a/fs/file.c b/fs/file.c > > index 21c0893f2f1d..872a4098c3be 100644 > > --- a/fs/file.c > > +++ b/fs/file.c > > @@ -672,6 +672,32 @@ int __close_fd(struct files_struct *files, unsigned fd) > > } > > EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ > > > > +static inline void __range_cloexec(struct files_struct *cur_fds, > > + unsigned int fd, unsigned max_fd) > > +{ > > + struct fdtable *fdt; > > + spin_lock(&cur_fds->file_lock); > > + fdt = files_fdtable(cur_fds); > > + while (fd <= max_fd) > > + __set_close_on_exec(fd++, fdt); > (I should've warned that I just proposed this as a completely untested brainstorm.) > Doesn't that want to be > > bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1) > > to do word-at-a-time? I assume this would mostly be called with (3, ~0U) > as arguments or something like that. Yes, that is the common case. Thanks Rasmus, I was unaware we had that function. In that case I think we'd actually need sm like: spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); cur_max = files_fdtable(cur_fds)->max_fds - 1; max_fd = min(max_fd, cur_max); bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1) so we retrieve max_fd with the spinlock held, I think. Christian
On Tue, Oct 13, 2020 at 10:09:25PM +0100, Al Viro wrote: > On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote: > > + spin_lock(&cur_fds->file_lock); > > + fdt = files_fdtable(cur_fds); > > + cur_max = fdt->max_fds - 1; > > + max_fd = min(max_fd, cur_max); > > + while (fd <= max_fd) > > + __set_close_on_exec(fd++, fdt); > > + spin_unlock(&cur_fds->file_lock); > > First of all, this is an atrocious way to set all bits > in a range. What's more, you don't want to set it for *all* Hm, good point. Would it make sense to just use the bitmap_set() proposal since the 3 to ~0 case is most common or to actually iterate based on the open fds? Christian
On 13/10/2020 23.09, Al Viro wrote: > On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote: >> + spin_lock(&cur_fds->file_lock); >> + fdt = files_fdtable(cur_fds); >> + cur_max = fdt->max_fds - 1; >> + max_fd = min(max_fd, cur_max); >> + while (fd <= max_fd) >> + __set_close_on_exec(fd++, fdt); >> + spin_unlock(&cur_fds->file_lock); > > First of all, this is an atrocious way to set all bits > in a range. What's more, you don't want to set it for *all* > bits - only for the ones present in open bitmap. It's probably > harmless at the moment, but let's not create interesting surprises > for the future. Eh, why not? They can already be set for unallocated slots: commit 5297908270549b734c7c2556745e2385b6d4941d Author: Mateusz Guzik <mguzik@redhat.com> Date: Tue Oct 3 12:58:14 2017 +0200 vfs: stop clearing close on exec when closing a fd Codepaths allocating a fd always make sure the bit is set/unset depending on flags, thus clearing on close is redundant. And while we're on that subject, yours truly suggested exactly that two years prior [1], with a follow-up [2] in 2018 to do what wasn't done in 5297908, but (still) seems like obvious micro-optimizations, given that the close_on_exec bitmap is not maintained as a subset of the open bitmap. Mind taking a look at [2]? [1] https://lore.kernel.org/lkml/1446543679-28849-1-git-send-email-linux@rasmusvillemoes.dk/t/#u [2] https://lore.kernel.org/lkml/20181024160159.25884-1-linux@rasmusvillemoes.dk/ Rasmus
Christian Brauner <christian.brauner@ubuntu.com> writes: > On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote: > > Hey Guiseppe, > > Thanks for the patch! > >> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't >> immediately close the files but it sets the close-on-exec bit. > > Hm, please expand on the use-cases a little here so people know where > and how this is useful. Keeping the rationale for a change in the commit > log is really important. > >> >> Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com> >> --- > >> fs/file.c | 56 ++++++++++++++++++++++---------- >> include/uapi/linux/close_range.h | 3 ++ >> 2 files changed, 42 insertions(+), 17 deletions(-) >> >> diff --git a/fs/file.c b/fs/file.c >> index 21c0893f2f1d..ad4ebee41e09 100644 >> --- a/fs/file.c >> +++ b/fs/file.c >> @@ -672,6 +672,17 @@ int __close_fd(struct files_struct *files, unsigned fd) >> } >> EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ >> >> +static unsigned int __get_max_fds(struct files_struct *cur_fds) >> +{ >> + unsigned int max_fds; >> + >> + rcu_read_lock(); >> + /* cap to last valid index into fdtable */ >> + max_fds = files_fdtable(cur_fds)->max_fds; >> + rcu_read_unlock(); >> + return max_fds; >> +} >> + >> /** >> * __close_range() - Close all file descriptors in a given range. >> * >> @@ -683,27 +694,23 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ >> */ >> int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) >> { >> - unsigned int cur_max; >> + unsigned int cur_max = UINT_MAX; >> struct task_struct *me = current; >> struct files_struct *cur_fds = me->files, *fds = NULL; >> >> - if (flags & ~CLOSE_RANGE_UNSHARE) >> + if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) >> return -EINVAL; >> >> if (fd > max_fd) >> return -EINVAL; >> >> - rcu_read_lock(); >> - cur_max = files_fdtable(cur_fds)->max_fds; >> - rcu_read_unlock(); >> - >> - /* cap to last valid index into fdtable */ >> - cur_max--; >> - >> if (flags & CLOSE_RANGE_UNSHARE) { >> int ret; >> unsigned int max_unshare_fds = NR_OPEN_MAX; >> >> + /* cap to last valid index into fdtable */ >> + cur_max = __get_max_fds(cur_fds) - 1; >> + >> /* >> * If the requested range is greater than the current maximum, >> * we're closing everything so only copy all file descriptors >> @@ -724,16 +731,31 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) >> swap(cur_fds, fds); >> } >> >> - max_fd = min(max_fd, cur_max); >> - while (fd <= max_fd) { >> - struct file *file; >> + if (flags & CLOSE_RANGE_CLOEXEC) { >> + struct fdtable *fdt; >> >> - file = pick_file(cur_fds, fd++); >> - if (!file) >> - continue; >> + spin_lock(&cur_fds->file_lock); >> + fdt = files_fdtable(cur_fds); >> + cur_max = fdt->max_fds - 1; >> + max_fd = min(max_fd, cur_max); >> + while (fd <= max_fd) >> + __set_close_on_exec(fd++, fdt); >> + spin_unlock(&cur_fds->file_lock); >> + } else { >> + /* Initialize cur_max if needed. */ >> + if (cur_max == UINT_MAX) >> + cur_max = __get_max_fds(cur_fds) - 1; > > The separation between how cur_fd is retrieved in the two branches makes > the code more difficult to follow imho. Unless there's a clear reason > why you've done it that way I would think that something like the patch > I appended below might be a little clearer and easier to maintain(?). Thanks for the review! I've opted for this version as in the flags=CLOSE_RANGE_CLOEXEC case we can read max_fds directly from the fds table and avoid doing it from the RCU critical section as well. I'll change it in favor of more readable code. Giuseppe
diff --git a/fs/file.c b/fs/file.c index 21c0893f2f1d..ad4ebee41e09 100644 --- a/fs/file.c +++ b/fs/file.c @@ -672,6 +672,17 @@ int __close_fd(struct files_struct *files, unsigned fd) } EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ +static unsigned int __get_max_fds(struct files_struct *cur_fds) +{ + unsigned int max_fds; + + rcu_read_lock(); + /* cap to last valid index into fdtable */ + max_fds = files_fdtable(cur_fds)->max_fds; + rcu_read_unlock(); + return max_fds; +} + /** * __close_range() - Close all file descriptors in a given range. * @@ -683,27 +694,23 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ */ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) { - unsigned int cur_max; + unsigned int cur_max = UINT_MAX; struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; - if (flags & ~CLOSE_RANGE_UNSHARE) + if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) return -EINVAL; - rcu_read_lock(); - cur_max = files_fdtable(cur_fds)->max_fds; - rcu_read_unlock(); - - /* cap to last valid index into fdtable */ - cur_max--; - if (flags & CLOSE_RANGE_UNSHARE) { int ret; unsigned int max_unshare_fds = NR_OPEN_MAX; + /* cap to last valid index into fdtable */ + cur_max = __get_max_fds(cur_fds) - 1; + /* * If the requested range is greater than the current maximum, * we're closing everything so only copy all file descriptors @@ -724,16 +731,31 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) swap(cur_fds, fds); } - max_fd = min(max_fd, cur_max); - while (fd <= max_fd) { - struct file *file; + if (flags & CLOSE_RANGE_CLOEXEC) { + struct fdtable *fdt; - file = pick_file(cur_fds, fd++); - if (!file) - continue; + spin_lock(&cur_fds->file_lock); + fdt = files_fdtable(cur_fds); + cur_max = fdt->max_fds - 1; + max_fd = min(max_fd, cur_max); + while (fd <= max_fd) + __set_close_on_exec(fd++, fdt); + spin_unlock(&cur_fds->file_lock); + } else { + /* Initialize cur_max if needed. */ + if (cur_max == UINT_MAX) + cur_max = __get_max_fds(cur_fds) - 1; + max_fd = min(max_fd, cur_max); + while (fd <= max_fd) { + struct file *file; - filp_close(file, cur_fds); - cond_resched(); + file = pick_file(cur_fds, fd++); + if (!file) + continue; + + filp_close(file, cur_fds); + cond_resched(); + } } if (fds) { diff --git a/include/uapi/linux/close_range.h b/include/uapi/linux/close_range.h index 6928a9fdee3c..2d804281554c 100644 --- a/include/uapi/linux/close_range.h +++ b/include/uapi/linux/close_range.h @@ -5,5 +5,8 @@ /* Unshare the file descriptor table before closing file descriptors. */ #define CLOSE_RANGE_UNSHARE (1U << 1) +/* Set the FD_CLOEXEC bit instead of closing the file descriptor. */ +#define CLOSE_RANGE_CLOEXEC (1U << 2) + #endif /* _UAPI_LINUX_CLOSE_RANGE_H */
When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't immediately close the files but it sets the close-on-exec bit. Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com> --- fs/file.c | 56 ++++++++++++++++++++++---------- include/uapi/linux/close_range.h | 3 ++ 2 files changed, 42 insertions(+), 17 deletions(-)