diff mbox series

[1/2] fs, close_range: add flag CLOSE_RANGE_CLOEXEC

Message ID 20201013140609.2269319-2-gscrivan@redhat.com (mailing list archive)
State New, archived
Headers show
Series fs, close_range: add flag CLOSE_RANGE_CLOEXEC | expand

Commit Message

Giuseppe Scrivano Oct. 13, 2020, 2:06 p.m. UTC
When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
immediately close the files but it sets the close-on-exec bit.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
 fs/file.c                        | 56 ++++++++++++++++++++++----------
 include/uapi/linux/close_range.h |  3 ++
 2 files changed, 42 insertions(+), 17 deletions(-)

Comments

Christian Brauner Oct. 13, 2020, 8:54 p.m. UTC | #1
On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:

Hey Guiseppe,

Thanks for the patch!

> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
> immediately close the files but it sets the close-on-exec bit.

Hm, please expand on the use-cases a little here so people know where
and how this is useful. Keeping the rationale for a change in the commit
log is really important.

> 
> Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
> ---

>  fs/file.c                        | 56 ++++++++++++++++++++++----------
>  include/uapi/linux/close_range.h |  3 ++
>  2 files changed, 42 insertions(+), 17 deletions(-)
> 
> diff --git a/fs/file.c b/fs/file.c
> index 21c0893f2f1d..ad4ebee41e09 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -672,6 +672,17 @@ int __close_fd(struct files_struct *files, unsigned fd)
>  }
>  EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>  
> +static unsigned int __get_max_fds(struct files_struct *cur_fds)
> +{
> +	unsigned int max_fds;
> +
> +	rcu_read_lock();
> +	/* cap to last valid index into fdtable */
> +	max_fds = files_fdtable(cur_fds)->max_fds;
> +	rcu_read_unlock();
> +	return max_fds;
> +}
> +
>  /**
>   * __close_range() - Close all file descriptors in a given range.
>   *
> @@ -683,27 +694,23 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>   */
>  int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
>  {
> -	unsigned int cur_max;
> +	unsigned int cur_max = UINT_MAX;
>  	struct task_struct *me = current;
>  	struct files_struct *cur_fds = me->files, *fds = NULL;
>  
> -	if (flags & ~CLOSE_RANGE_UNSHARE)
> +	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
>  		return -EINVAL;
>  
>  	if (fd > max_fd)
>  		return -EINVAL;
>  
> -	rcu_read_lock();
> -	cur_max = files_fdtable(cur_fds)->max_fds;
> -	rcu_read_unlock();
> -
> -	/* cap to last valid index into fdtable */
> -	cur_max--;
> -
>  	if (flags & CLOSE_RANGE_UNSHARE) {
>  		int ret;
>  		unsigned int max_unshare_fds = NR_OPEN_MAX;
>  
> +		/* cap to last valid index into fdtable */
> +		cur_max = __get_max_fds(cur_fds) - 1;
> +
>  		/*
>  		 * If the requested range is greater than the current maximum,
>  		 * we're closing everything so only copy all file descriptors
> @@ -724,16 +731,31 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
>  			swap(cur_fds, fds);
>  	}
>  
> -	max_fd = min(max_fd, cur_max);
> -	while (fd <= max_fd) {
> -		struct file *file;
> +	if (flags & CLOSE_RANGE_CLOEXEC) {
> +		struct fdtable *fdt;
>  
> -		file = pick_file(cur_fds, fd++);
> -		if (!file)
> -			continue;
> +		spin_lock(&cur_fds->file_lock);
> +		fdt = files_fdtable(cur_fds);
> +		cur_max = fdt->max_fds - 1;
> +		max_fd = min(max_fd, cur_max);
> +		while (fd <= max_fd)
> +			__set_close_on_exec(fd++, fdt);
> +		spin_unlock(&cur_fds->file_lock);
> +	} else {
> +		/* Initialize cur_max if needed.  */
> +		if (cur_max == UINT_MAX)
> +			cur_max = __get_max_fds(cur_fds) - 1;

The separation between how cur_fd is retrieved in the two branches makes
the code more difficult to follow imho. Unless there's a clear reason
why you've done it that way I would think that something like the patch
I appended below might be a little clearer and easier to maintain(?).

> +		max_fd = min(max_fd, cur_max);
> +		while (fd <= max_fd) {
> +			struct file *file;
>  
> -		filp_close(file, cur_fds);
> -		cond_resched();
> +			file = pick_file(cur_fds, fd++);
> +			if (!file)
> +				continue;
> +
> +			filp_close(file, cur_fds);
> +			cond_resched();
> +		}
>  	}

I think I don't have quarrels with this patch in principle but I wonder
if something like the following wouldn't be easier to follow:

diff --git a/fs/file.c b/fs/file.c
index 21c0893f2f1d..872a4098c3be 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -672,6 +672,32 @@ int __close_fd(struct files_struct *files, unsigned fd)
 }
 EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
 
+static inline void __range_cloexec(struct files_struct *cur_fds,
+				   unsigned int fd, unsigned max_fd)
+{
+	struct fdtable *fdt;
+	spin_lock(&cur_fds->file_lock);
+	fdt = files_fdtable(cur_fds);
+	while (fd <= max_fd)
+		__set_close_on_exec(fd++, fdt);
+	spin_unlock(&cur_fds->file_lock);
+}
+
+static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
+				 unsigned max_fd)
+{
+	while (fd <= max_fd) {
+		struct file *file;
+
+		file = pick_file(cur_fds, fd++);
+		if (!file)
+			continue;
+
+		filp_close(file, cur_fds);
+		cond_resched();
+	}
+}
+
 /**
  * __close_range() - Close all file descriptors in a given range.
  *
@@ -687,7 +713,7 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 	struct task_struct *me = current;
 	struct files_struct *cur_fds = me->files, *fds = NULL;
 
-	if (flags & ~CLOSE_RANGE_UNSHARE)
+	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
 		return -EINVAL;
 
 	if (fd > max_fd)
@@ -725,16 +751,10 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 	}
 
 	max_fd = min(max_fd, cur_max);
-	while (fd <= max_fd) {
-		struct file *file;
-
-		file = pick_file(cur_fds, fd++);
-		if (!file)
-			continue;
-
-		filp_close(file, cur_fds);
-		cond_resched();
-	}
+	if (flags & CLOSE_RANGE_CLOEXEC)
+		__range_cloexec(cur_fds, fd, max_fd);
+	else
+		__range_close(cur_fds, fd, max_fd);
 
 	if (fds) {
 		/*
diff --git a/include/uapi/linux/close_range.h b/include/uapi/linux/close_range.h
index 6928a9fdee3c..2d804281554c 100644
--- a/include/uapi/linux/close_range.h
+++ b/include/uapi/linux/close_range.h
@@ -5,5 +5,8 @@
 /* Unshare the file descriptor table before closing file descriptors. */
 #define CLOSE_RANGE_UNSHARE	(1U << 1)
 
+/* Set the FD_CLOEXEC bit instead of closing the file descriptor. */
+#define CLOSE_RANGE_CLOEXEC	(1U << 2)
+
 #endif /* _UAPI_LINUX_CLOSE_RANGE_H */
Rasmus Villemoes Oct. 13, 2020, 9:04 p.m. UTC | #2
On 13/10/2020 22.54, Christian Brauner wrote:
> On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
> 
> Hey Guiseppe,
> 
> Thanks for the patch!
> 
>> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
>> immediately close the files but it sets the close-on-exec bit.
> 
> Hm, please expand on the use-cases a little here so people know where
> and how this is useful. Keeping the rationale for a change in the commit
> log is really important.
> 

> I think I don't have quarrels with this patch in principle but I wonder
> if something like the following wouldn't be easier to follow:
> 
> diff --git a/fs/file.c b/fs/file.c
> index 21c0893f2f1d..872a4098c3be 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -672,6 +672,32 @@ int __close_fd(struct files_struct *files, unsigned fd)
>  }
>  EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>  
> +static inline void __range_cloexec(struct files_struct *cur_fds,
> +				   unsigned int fd, unsigned max_fd)
> +{
> +	struct fdtable *fdt;
> +	spin_lock(&cur_fds->file_lock);
> +	fdt = files_fdtable(cur_fds);
> +	while (fd <= max_fd)
> +		__set_close_on_exec(fd++, fdt);

Doesn't that want to be

  bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1)

to do word-at-a-time? I assume this would mostly be called with (3, ~0U)
as arguments or something like that.

Rasmus
Al Viro Oct. 13, 2020, 9:09 p.m. UTC | #3
On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
> +		spin_lock(&cur_fds->file_lock);
> +		fdt = files_fdtable(cur_fds);
> +		cur_max = fdt->max_fds - 1;
> +		max_fd = min(max_fd, cur_max);
> +		while (fd <= max_fd)
> +			__set_close_on_exec(fd++, fdt);
> +		spin_unlock(&cur_fds->file_lock);

	First of all, this is an atrocious way to set all bits
in a range.  What's more, you don't want to set it for *all*
bits - only for the ones present in open bitmap.  It's probably
harmless at the moment, but let's not create interesting surprises
for the future.
Christian Brauner Oct. 13, 2020, 9:22 p.m. UTC | #4
On Tue, Oct 13, 2020 at 11:04:21PM +0200, Rasmus Villemoes wrote:
> On 13/10/2020 22.54, Christian Brauner wrote:
> > On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
> > 
> > Hey Guiseppe,
> > 
> > Thanks for the patch!
> > 
> >> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
> >> immediately close the files but it sets the close-on-exec bit.
> > 
> > Hm, please expand on the use-cases a little here so people know where
> > and how this is useful. Keeping the rationale for a change in the commit
> > log is really important.
> > 
> 
> > I think I don't have quarrels with this patch in principle but I wonder
> > if something like the following wouldn't be easier to follow:
> > 
> > diff --git a/fs/file.c b/fs/file.c
> > index 21c0893f2f1d..872a4098c3be 100644
> > --- a/fs/file.c
> > +++ b/fs/file.c
> > @@ -672,6 +672,32 @@ int __close_fd(struct files_struct *files, unsigned fd)
> >  }
> >  EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
> >  
> > +static inline void __range_cloexec(struct files_struct *cur_fds,
> > +				   unsigned int fd, unsigned max_fd)
> > +{
> > +	struct fdtable *fdt;
> > +	spin_lock(&cur_fds->file_lock);
> > +	fdt = files_fdtable(cur_fds);
> > +	while (fd <= max_fd)
> > +		__set_close_on_exec(fd++, fdt);
> 

(I should've warned that I just proposed this as a completely untested
brainstorm.)

> Doesn't that want to be
> 
>   bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1)
> 
> to do word-at-a-time? I assume this would mostly be called with (3, ~0U)
> as arguments or something like that.

Yes, that is the common case.

Thanks Rasmus, I was unaware we had that function.

In that case I think we'd actually need sm like:
spin_lock(&cur_fds->file_lock);
fdt = files_fdtable(cur_fds);
cur_max = files_fdtable(cur_fds)->max_fds - 1;
max_fd = min(max_fd, cur_max);
bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1)

so we retrieve max_fd with the spinlock held, I think.

Christian
Christian Brauner Oct. 13, 2020, 9:32 p.m. UTC | #5
On Tue, Oct 13, 2020 at 10:09:25PM +0100, Al Viro wrote:
> On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
> > +		spin_lock(&cur_fds->file_lock);
> > +		fdt = files_fdtable(cur_fds);
> > +		cur_max = fdt->max_fds - 1;
> > +		max_fd = min(max_fd, cur_max);
> > +		while (fd <= max_fd)
> > +			__set_close_on_exec(fd++, fdt);
> > +		spin_unlock(&cur_fds->file_lock);
> 
> 	First of all, this is an atrocious way to set all bits
> in a range.  What's more, you don't want to set it for *all*

Hm, good point.

Would it make sense to just use the bitmap_set() proposal since the 3 to
~0 case is most common or to actually iterate based on the open fds?


Christian
Rasmus Villemoes Oct. 13, 2020, 9:49 p.m. UTC | #6
On 13/10/2020 23.09, Al Viro wrote:
> On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
>> +		spin_lock(&cur_fds->file_lock);
>> +		fdt = files_fdtable(cur_fds);
>> +		cur_max = fdt->max_fds - 1;
>> +		max_fd = min(max_fd, cur_max);
>> +		while (fd <= max_fd)
>> +			__set_close_on_exec(fd++, fdt);
>> +		spin_unlock(&cur_fds->file_lock);
> 
> 	First of all, this is an atrocious way to set all bits
> in a range.  What's more, you don't want to set it for *all*
> bits - only for the ones present in open bitmap.  It's probably
> harmless at the moment, but let's not create interesting surprises
> for the future.

Eh, why not? They can already be set for unallocated slots:

commit 5297908270549b734c7c2556745e2385b6d4941d
Author: Mateusz Guzik <mguzik@redhat.com>
Date:   Tue Oct 3 12:58:14 2017 +0200

    vfs: stop clearing close on exec when closing a fd

    Codepaths allocating a fd always make sure the bit is set/unset
    depending on flags, thus clearing on close is redundant.

And while we're on that subject, yours truly suggested exactly that two
years prior [1], with a follow-up [2] in 2018 to do what wasn't done in
5297908, but (still) seems like obvious micro-optimizations, given that
the close_on_exec bitmap is not maintained as a subset of the open
bitmap. Mind taking a look at [2]?

[1]
https://lore.kernel.org/lkml/1446543679-28849-1-git-send-email-linux@rasmusvillemoes.dk/t/#u
[2]
https://lore.kernel.org/lkml/20181024160159.25884-1-linux@rasmusvillemoes.dk/

Rasmus
Giuseppe Scrivano Oct. 13, 2020, 10:45 p.m. UTC | #7
Christian Brauner <christian.brauner@ubuntu.com> writes:

> On Tue, Oct 13, 2020 at 04:06:08PM +0200, Giuseppe Scrivano wrote:
>
> Hey Guiseppe,
>
> Thanks for the patch!
>
>> When the flag CLOSE_RANGE_CLOEXEC is set, close_range doesn't
>> immediately close the files but it sets the close-on-exec bit.
>
> Hm, please expand on the use-cases a little here so people know where
> and how this is useful. Keeping the rationale for a change in the commit
> log is really important.
>
>> 
>> Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
>> ---
>
>>  fs/file.c                        | 56 ++++++++++++++++++++++----------
>>  include/uapi/linux/close_range.h |  3 ++
>>  2 files changed, 42 insertions(+), 17 deletions(-)
>> 
>> diff --git a/fs/file.c b/fs/file.c
>> index 21c0893f2f1d..ad4ebee41e09 100644
>> --- a/fs/file.c
>> +++ b/fs/file.c
>> @@ -672,6 +672,17 @@ int __close_fd(struct files_struct *files, unsigned fd)
>>  }
>>  EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>>  
>> +static unsigned int __get_max_fds(struct files_struct *cur_fds)
>> +{
>> +	unsigned int max_fds;
>> +
>> +	rcu_read_lock();
>> +	/* cap to last valid index into fdtable */
>> +	max_fds = files_fdtable(cur_fds)->max_fds;
>> +	rcu_read_unlock();
>> +	return max_fds;
>> +}
>> +
>>  /**
>>   * __close_range() - Close all file descriptors in a given range.
>>   *
>> @@ -683,27 +694,23 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
>>   */
>>  int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
>>  {
>> -	unsigned int cur_max;
>> +	unsigned int cur_max = UINT_MAX;
>>  	struct task_struct *me = current;
>>  	struct files_struct *cur_fds = me->files, *fds = NULL;
>>  
>> -	if (flags & ~CLOSE_RANGE_UNSHARE)
>> +	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
>>  		return -EINVAL;
>>  
>>  	if (fd > max_fd)
>>  		return -EINVAL;
>>  
>> -	rcu_read_lock();
>> -	cur_max = files_fdtable(cur_fds)->max_fds;
>> -	rcu_read_unlock();
>> -
>> -	/* cap to last valid index into fdtable */
>> -	cur_max--;
>> -
>>  	if (flags & CLOSE_RANGE_UNSHARE) {
>>  		int ret;
>>  		unsigned int max_unshare_fds = NR_OPEN_MAX;
>>  
>> +		/* cap to last valid index into fdtable */
>> +		cur_max = __get_max_fds(cur_fds) - 1;
>> +
>>  		/*
>>  		 * If the requested range is greater than the current maximum,
>>  		 * we're closing everything so only copy all file descriptors
>> @@ -724,16 +731,31 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
>>  			swap(cur_fds, fds);
>>  	}
>>  
>> -	max_fd = min(max_fd, cur_max);
>> -	while (fd <= max_fd) {
>> -		struct file *file;
>> +	if (flags & CLOSE_RANGE_CLOEXEC) {
>> +		struct fdtable *fdt;
>>  
>> -		file = pick_file(cur_fds, fd++);
>> -		if (!file)
>> -			continue;
>> +		spin_lock(&cur_fds->file_lock);
>> +		fdt = files_fdtable(cur_fds);
>> +		cur_max = fdt->max_fds - 1;
>> +		max_fd = min(max_fd, cur_max);
>> +		while (fd <= max_fd)
>> +			__set_close_on_exec(fd++, fdt);
>> +		spin_unlock(&cur_fds->file_lock);
>> +	} else {
>> +		/* Initialize cur_max if needed.  */
>> +		if (cur_max == UINT_MAX)
>> +			cur_max = __get_max_fds(cur_fds) - 1;
>
> The separation between how cur_fd is retrieved in the two branches makes
> the code more difficult to follow imho. Unless there's a clear reason
> why you've done it that way I would think that something like the patch
> I appended below might be a little clearer and easier to maintain(?).

Thanks for the review!

I've opted for this version as in the flags=CLOSE_RANGE_CLOEXEC case we
can read max_fds directly from the fds table and avoid doing it from the
RCU critical section as well.  I'll change it in favor of more readable
code.

Giuseppe
diff mbox series

Patch

diff --git a/fs/file.c b/fs/file.c
index 21c0893f2f1d..ad4ebee41e09 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -672,6 +672,17 @@  int __close_fd(struct files_struct *files, unsigned fd)
 }
 EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
 
+static unsigned int __get_max_fds(struct files_struct *cur_fds)
+{
+	unsigned int max_fds;
+
+	rcu_read_lock();
+	/* cap to last valid index into fdtable */
+	max_fds = files_fdtable(cur_fds)->max_fds;
+	rcu_read_unlock();
+	return max_fds;
+}
+
 /**
  * __close_range() - Close all file descriptors in a given range.
  *
@@ -683,27 +694,23 @@  EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
  */
 int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 {
-	unsigned int cur_max;
+	unsigned int cur_max = UINT_MAX;
 	struct task_struct *me = current;
 	struct files_struct *cur_fds = me->files, *fds = NULL;
 
-	if (flags & ~CLOSE_RANGE_UNSHARE)
+	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
 		return -EINVAL;
 
 	if (fd > max_fd)
 		return -EINVAL;
 
-	rcu_read_lock();
-	cur_max = files_fdtable(cur_fds)->max_fds;
-	rcu_read_unlock();
-
-	/* cap to last valid index into fdtable */
-	cur_max--;
-
 	if (flags & CLOSE_RANGE_UNSHARE) {
 		int ret;
 		unsigned int max_unshare_fds = NR_OPEN_MAX;
 
+		/* cap to last valid index into fdtable */
+		cur_max = __get_max_fds(cur_fds) - 1;
+
 		/*
 		 * If the requested range is greater than the current maximum,
 		 * we're closing everything so only copy all file descriptors
@@ -724,16 +731,31 @@  int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 			swap(cur_fds, fds);
 	}
 
-	max_fd = min(max_fd, cur_max);
-	while (fd <= max_fd) {
-		struct file *file;
+	if (flags & CLOSE_RANGE_CLOEXEC) {
+		struct fdtable *fdt;
 
-		file = pick_file(cur_fds, fd++);
-		if (!file)
-			continue;
+		spin_lock(&cur_fds->file_lock);
+		fdt = files_fdtable(cur_fds);
+		cur_max = fdt->max_fds - 1;
+		max_fd = min(max_fd, cur_max);
+		while (fd <= max_fd)
+			__set_close_on_exec(fd++, fdt);
+		spin_unlock(&cur_fds->file_lock);
+	} else {
+		/* Initialize cur_max if needed.  */
+		if (cur_max == UINT_MAX)
+			cur_max = __get_max_fds(cur_fds) - 1;
+		max_fd = min(max_fd, cur_max);
+		while (fd <= max_fd) {
+			struct file *file;
 
-		filp_close(file, cur_fds);
-		cond_resched();
+			file = pick_file(cur_fds, fd++);
+			if (!file)
+				continue;
+
+			filp_close(file, cur_fds);
+			cond_resched();
+		}
 	}
 
 	if (fds) {
diff --git a/include/uapi/linux/close_range.h b/include/uapi/linux/close_range.h
index 6928a9fdee3c..2d804281554c 100644
--- a/include/uapi/linux/close_range.h
+++ b/include/uapi/linux/close_range.h
@@ -5,5 +5,8 @@ 
 /* Unshare the file descriptor table before closing file descriptors. */
 #define CLOSE_RANGE_UNSHARE	(1U << 1)
 
+/* Set the FD_CLOEXEC bit instead of closing the file descriptor. */
+#define CLOSE_RANGE_CLOEXEC	(1U << 2)
+
 #endif /* _UAPI_LINUX_CLOSE_RANGE_H */