diff mbox series

[v7,1/8] fs: introduce kernel_pread_file* support

Message ID 20200606050458.17281-2-scott.branden@broadcom.com (mailing list archive)
State New, archived
Headers show
Series firmware: add partial read support in request_firmware_into_buf | expand

Commit Message

Scott Branden June 6, 2020, 5:04 a.m. UTC
Add kernel_pread_file* support to kernel to allow for partial read
of files with an offset into the file.  Existing kernel_read_file
functions call new kernel_pread_file functions with offset=0 and
opt=KERNEL_PREAD_WHOLE.

Signed-off-by: Scott Branden <scott.branden@broadcom.com>
---
 fs/exec.c          | 95 ++++++++++++++++++++++++++++++++++++----------
 include/linux/fs.h | 29 ++++++++++++++
 2 files changed, 103 insertions(+), 21 deletions(-)

Comments

Matthew Wilcox (Oracle) June 6, 2020, 3:52 p.m. UTC | #1
On Fri, Jun 05, 2020 at 10:04:51PM -0700, Scott Branden wrote:
> -int kernel_read_file(struct file *file, void **buf, loff_t *size,
> -		     loff_t max_size, enum kernel_read_file_id id)
> -{
> -	loff_t i_size, pos;
> +int kernel_pread_file(struct file *file, void **buf, loff_t *size,
> +		      loff_t pos, loff_t max_size,
> +		      enum kernel_pread_opt opt,
> +		      enum kernel_read_file_id id)
> +{
> +	loff_t alloc_size;
> +	loff_t buf_pos;
> +	loff_t read_end;
> +	loff_t i_size;
>  	ssize_t bytes = 0;
>  	int ret;
>  

Look, it's not your fault, but this is a great example of how we end
up with atrocious interfaces.  Someone comes along and implements a
simple DWIM interface that solves their problem.  Then somebody else
adds a slight variant that solves their problem, and so on and so on,
and we end up with this bonkers API where the arguments literally change
meaning depending on other arguments.

> @@ -950,21 +955,31 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
>  		ret = -EINVAL;
>  		goto out;
>  	}
> -	if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
> +
> +	/* Default read to end of file */
> +	read_end = i_size;
> +
> +	/* Allow reading partial portion of file */
> +	if ((opt == KERNEL_PREAD_PART) &&
> +	    (i_size > (pos + max_size)))
> +		read_end = pos + max_size;
> +
> +	alloc_size = read_end - pos;
> +	if (i_size > SIZE_MAX || (max_size > 0 && alloc_size > max_size)) {
>  		ret = -EFBIG;
>  		goto out;

... like that.

I think what we actually want is:

ssize_t vmap_file_range(struct file *, loff_t start, loff_t end, void **bufp);
void vunmap_file_range(struct file *, void *buf);

If end > i_size, limit the allocation to i_size.  Returns the number
of bytes allocated, or a negative errno.  Writes the pointer allocated
to *bufp.  Internally, it should use the page cache to read in the pages
(taking appropriate reference counts).  Then it maps them using vmap()
instead of copying them to a private vmalloc() array.

kernel_read_file() can be converted to use this API.  The users will
need to be changed to call kernel_read_end(struct file *file, void *buf)
instead of vfree() so it can call allow_write_access() for them.

vmap_file_range() has a lot of potential uses.  I'm surprised we don't
have it already, to be honest.
Mimi Zohar June 8, 2020, 1:03 p.m. UTC | #2
On Sat, 2020-06-06 at 08:52 -0700, Matthew Wilcox wrote:
> On Fri, Jun 05, 2020 at 10:04:51PM -0700, Scott Branden wrote:
> > -int kernel_read_file(struct file *file, void **buf, loff_t *size,
> > -		     loff_t max_size, enum kernel_read_file_id id)
> > -{
> > -	loff_t i_size, pos;
> > +int kernel_pread_file(struct file *file, void **buf, loff_t *size,
> > +		      loff_t pos, loff_t max_size,
> > +		      enum kernel_pread_opt opt,
> > +		      enum kernel_read_file_id id)
> > +{
> > +	loff_t alloc_size;
> > +	loff_t buf_pos;
> > +	loff_t read_end;
> > +	loff_t i_size;
> >  	ssize_t bytes = 0;
> >  	int ret;
> >  
> 
> Look, it's not your fault, but this is a great example of how we end
> up with atrocious interfaces.  Someone comes along and implements a
> simple DWIM interface that solves their problem.  Then somebody else
> adds a slight variant that solves their problem, and so on and so on,
> and we end up with this bonkers API where the arguments literally change
> meaning depending on other arguments.
> 
> > @@ -950,21 +955,31 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
> >  		ret = -EINVAL;
> >  		goto out;
> >  	}
> > -	if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
> > +
> > +	/* Default read to end of file */
> > +	read_end = i_size;
> > +
> > +	/* Allow reading partial portion of file */
> > +	if ((opt == KERNEL_PREAD_PART) &&
> > +	    (i_size > (pos + max_size)))
> > +		read_end = pos + max_size;
> > +
> > +	alloc_size = read_end - pos;
> > +	if (i_size > SIZE_MAX || (max_size > 0 && alloc_size > max_size)) {
> >  		ret = -EFBIG;
> >  		goto out;
> 
> ... like that.
> 
> I think what we actually want is:
> 
> ssize_t vmap_file_range(struct file *, loff_t start, loff_t end, void **bufp);
> void vunmap_file_range(struct file *, void *buf);
> 
> If end > i_size, limit the allocation to i_size.  Returns the number
> of bytes allocated, or a negative errno.  Writes the pointer allocated
> to *bufp.  Internally, it should use the page cache to read in the pages
> (taking appropriate reference counts).  Then it maps them using vmap()
> instead of copying them to a private vmalloc() array.
> 
> kernel_read_file() can be converted to use this API.  The users will
> need to be changed to call kernel_read_end(struct file *file, void *buf)
> instead of vfree() so it can call allow_write_access() for them.
> 
> vmap_file_range() has a lot of potential uses.  I'm surprised we don't
> have it already, to be honest.

Prior to kernel_read_file() the same or very similar code existed in
multiple places in the kernel.  The kernel_read_file() API
consolidated the existing code adding the pre and post security hooks.

With this new design of not using a private vmalloc, will the file
data be accessible prior to the post security hooks?  From an IMA
perspective, the hooks are used for measuring and/or verifying the
integrity of the file.

Mimi
Matthew Wilcox (Oracle) June 8, 2020, 1:16 p.m. UTC | #3
On Mon, Jun 08, 2020 at 09:03:21AM -0400, Mimi Zohar wrote:
> On Sat, 2020-06-06 at 08:52 -0700, Matthew Wilcox wrote:
> > On Fri, Jun 05, 2020 at 10:04:51PM -0700, Scott Branden wrote:
> > > -int kernel_read_file(struct file *file, void **buf, loff_t *size,
> > > -		     loff_t max_size, enum kernel_read_file_id id)
> > > -{
> > > -	loff_t i_size, pos;
> > > +int kernel_pread_file(struct file *file, void **buf, loff_t *size,
> > > +		      loff_t pos, loff_t max_size,
> > > +		      enum kernel_pread_opt opt,
> > > +		      enum kernel_read_file_id id)
> > > +{
> > > +	loff_t alloc_size;
> > > +	loff_t buf_pos;
> > > +	loff_t read_end;
> > > +	loff_t i_size;
> > >  	ssize_t bytes = 0;
> > >  	int ret;
> > >  
> > 
> > Look, it's not your fault, but this is a great example of how we end
> > up with atrocious interfaces.  Someone comes along and implements a
> > simple DWIM interface that solves their problem.  Then somebody else
> > adds a slight variant that solves their problem, and so on and so on,
> > and we end up with this bonkers API where the arguments literally change
> > meaning depending on other arguments.
> > 
> > > @@ -950,21 +955,31 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
> > >  		ret = -EINVAL;
> > >  		goto out;
> > >  	}
> > > -	if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
> > > +
> > > +	/* Default read to end of file */
> > > +	read_end = i_size;
> > > +
> > > +	/* Allow reading partial portion of file */
> > > +	if ((opt == KERNEL_PREAD_PART) &&
> > > +	    (i_size > (pos + max_size)))
> > > +		read_end = pos + max_size;
> > > +
> > > +	alloc_size = read_end - pos;
> > > +	if (i_size > SIZE_MAX || (max_size > 0 && alloc_size > max_size)) {
> > >  		ret = -EFBIG;
> > >  		goto out;
> > 
> > ... like that.
> > 
> > I think what we actually want is:
> > 
> > ssize_t vmap_file_range(struct file *, loff_t start, loff_t end, void **bufp);
> > void vunmap_file_range(struct file *, void *buf);
> > 
> > If end > i_size, limit the allocation to i_size.  Returns the number
> > of bytes allocated, or a negative errno.  Writes the pointer allocated
> > to *bufp.  Internally, it should use the page cache to read in the pages
> > (taking appropriate reference counts).  Then it maps them using vmap()
> > instead of copying them to a private vmalloc() array.
> > 
> > kernel_read_file() can be converted to use this API.  The users will
> > need to be changed to call kernel_read_end(struct file *file, void *buf)
> > instead of vfree() so it can call allow_write_access() for them.
> > 
> > vmap_file_range() has a lot of potential uses.  I'm surprised we don't
> > have it already, to be honest.
> 
> Prior to kernel_read_file() the same or verify similar code existed in
> multiple places in the kernel.  The kernel_read_file() API
> consolidated the existing code adding the pre and post security hooks.
> 
> With this new design of not using a private vmalloc, will the file
> data be accessible prior to the post security hooks?  From an IMA
> perspective, the hooks are used for measuring and/or verifying the
> integrity of the file.

File data is already accessible prior to the post security hooks.
Look how kernel_read_file works:

        ret = deny_write_access(file);
        ret = security_kernel_read_file(file, id);
                *buf = vmalloc(i_size);
                bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
        ret = security_kernel_post_read_file(file, *buf, i_size, id);

kernel_read() will read the data into the page cache and then copy it
into the vmalloc'd buffer.  There's nothing here to prevent read accesses
to the file.
Mimi Zohar June 8, 2020, 1:22 p.m. UTC | #4
On Mon, 2020-06-08 at 06:16 -0700, Matthew Wilcox wrote:
> On Mon, Jun 08, 2020 at 09:03:21AM -0400, Mimi Zohar wrote:
> > On Sat, 2020-06-06 at 08:52 -0700, Matthew Wilcox wrote:
> > > On Fri, Jun 05, 2020 at 10:04:51PM -0700, Scott Branden wrote:
> > > > -int kernel_read_file(struct file *file, void **buf, loff_t *size,
> > > > -		     loff_t max_size, enum kernel_read_file_id id)
> > > > -{
> > > > -	loff_t i_size, pos;
> > > > +int kernel_pread_file(struct file *file, void **buf, loff_t *size,
> > > > +		      loff_t pos, loff_t max_size,
> > > > +		      enum kernel_pread_opt opt,
> > > > +		      enum kernel_read_file_id id)
> > > > +{
> > > > +	loff_t alloc_size;
> > > > +	loff_t buf_pos;
> > > > +	loff_t read_end;
> > > > +	loff_t i_size;
> > > >  	ssize_t bytes = 0;
> > > >  	int ret;
> > > >  
> > > 
> > > Look, it's not your fault, but this is a great example of how we end
> > > up with atrocious interfaces.  Someone comes along and implements a
> > > simple DWIM interface that solves their problem.  Then somebody else
> > > adds a slight variant that solves their problem, and so on and so on,
> > > and we end up with this bonkers API where the arguments literally change
> > > meaning depending on other arguments.
> > > 
> > > > @@ -950,21 +955,31 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
> > > >  		ret = -EINVAL;
> > > >  		goto out;
> > > >  	}
> > > > -	if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
> > > > +
> > > > +	/* Default read to end of file */
> > > > +	read_end = i_size;
> > > > +
> > > > +	/* Allow reading partial portion of file */
> > > > +	if ((opt == KERNEL_PREAD_PART) &&
> > > > +	    (i_size > (pos + max_size)))
> > > > +		read_end = pos + max_size;
> > > > +
> > > > +	alloc_size = read_end - pos;
> > > > +	if (i_size > SIZE_MAX || (max_size > 0 && alloc_size > max_size)) {
> > > >  		ret = -EFBIG;
> > > >  		goto out;
> > > 
> > > ... like that.
> > > 
> > > I think what we actually want is:
> > > 
> > > ssize_t vmap_file_range(struct file *, loff_t start, loff_t end, void **bufp);
> > > void vunmap_file_range(struct file *, void *buf);
> > > 
> > > If end > i_size, limit the allocation to i_size.  Returns the number
> > > of bytes allocated, or a negative errno.  Writes the pointer allocated
> > > to *bufp.  Internally, it should use the page cache to read in the pages
> > > (taking appropriate reference counts).  Then it maps them using vmap()
> > > instead of copying them to a private vmalloc() array.
> > > 
> > > kernel_read_file() can be converted to use this API.  The users will
> > > need to be changed to call kernel_read_end(struct file *file, void *buf)
> > > instead of vfree() so it can call allow_write_access() for them.
> > > 
> > > vmap_file_range() has a lot of potential uses.  I'm surprised we don't
> > > have it already, to be honest.
> > 
> > Prior to kernel_read_file() the same or verify similar code existed in
> > multiple places in the kernel.  The kernel_read_file() API
> > consolidated the existing code adding the pre and post security hooks.
> > 
> > With this new design of not using a private vmalloc, will the file
> > data be accessible prior to the post security hooks?  From an IMA
> > perspective, the hooks are used for measuring and/or verifying the
> > integrity of the file.
> 
> File data is already accessible prior to the post security hooks.
> Look how kernel_read_file works:
> 
>         ret = deny_write_access(file);
>         ret = security_kernel_read_file(file, id);
>                 *buf = vmalloc(i_size);
>                 bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
>         ret = security_kernel_post_read_file(file, *buf, i_size, id);
> 
> kernel_read() will read the data into the page cache and then copy it
> into the vmalloc'd buffer.  There's nothing here to prevent read accesses
> to the file.

The post security hook needs access to the file data in order to
calculate the file hash.  The question is whether prior to returning
from kernel_read_file() the caller can access the file data.

Mimi
Mimi Zohar June 8, 2020, 1:27 p.m. UTC | #5
On Mon, 2020-06-08 at 09:22 -0400, Mimi Zohar wrote:
> On Mon, 2020-06-08 at 06:16 -0700, Matthew Wilcox wrote:
> > On Mon, Jun 08, 2020 at 09:03:21AM -0400, Mimi Zohar wrote:
> > > On Sat, 2020-06-06 at 08:52 -0700, Matthew Wilcox wrote:
> > > > On Fri, Jun 05, 2020 at 10:04:51PM -0700, Scott Branden wrote:
> > > > > -int kernel_read_file(struct file *file, void **buf, loff_t *size,
> > > > > -		     loff_t max_size, enum kernel_read_file_id id)
> > > > > -{
> > > > > -	loff_t i_size, pos;
> > > > > +int kernel_pread_file(struct file *file, void **buf, loff_t *size,
> > > > > +		      loff_t pos, loff_t max_size,
> > > > > +		      enum kernel_pread_opt opt,
> > > > > +		      enum kernel_read_file_id id)
> > > > > +{
> > > > > +	loff_t alloc_size;
> > > > > +	loff_t buf_pos;
> > > > > +	loff_t read_end;
> > > > > +	loff_t i_size;
> > > > >  	ssize_t bytes = 0;
> > > > >  	int ret;
> > > > >  
> > > > 
> > > > Look, it's not your fault, but this is a great example of how we end
> > > > up with atrocious interfaces.  Someone comes along and implements a
> > > > simple DWIM interface that solves their problem.  Then somebody else
> > > > adds a slight variant that solves their problem, and so on and so on,
> > > > and we end up with this bonkers API where the arguments literally change
> > > > meaning depending on other arguments.
> > > > 
> > > > > @@ -950,21 +955,31 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
> > > > >  		ret = -EINVAL;
> > > > >  		goto out;
> > > > >  	}
> > > > > -	if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
> > > > > +
> > > > > +	/* Default read to end of file */
> > > > > +	read_end = i_size;
> > > > > +
> > > > > +	/* Allow reading partial portion of file */
> > > > > +	if ((opt == KERNEL_PREAD_PART) &&
> > > > > +	    (i_size > (pos + max_size)))
> > > > > +		read_end = pos + max_size;
> > > > > +
> > > > > +	alloc_size = read_end - pos;
> > > > > +	if (i_size > SIZE_MAX || (max_size > 0 && alloc_size > max_size)) {
> > > > >  		ret = -EFBIG;
> > > > >  		goto out;
> > > > 
> > > > ... like that.
> > > > 
> > > > I think what we actually want is:
> > > > 
> > > > ssize_t vmap_file_range(struct file *, loff_t start, loff_t end, void **bufp);
> > > > void vunmap_file_range(struct file *, void *buf);
> > > > 
> > > > If end > i_size, limit the allocation to i_size.  Returns the number
> > > > of bytes allocated, or a negative errno.  Writes the pointer allocated
> > > > to *bufp.  Internally, it should use the page cache to read in the pages
> > > > (taking appropriate reference counts).  Then it maps them using vmap()
> > > > instead of copying them to a private vmalloc() array.
> > > > 
> > > > kernel_read_file() can be converted to use this API.  The users will
> > > > need to be changed to call kernel_read_end(struct file *file, void *buf)
> > > > instead of vfree() so it can call allow_write_access() for them.
> > > > 
> > > > vmap_file_range() has a lot of potential uses.  I'm surprised we don't
> > > > have it already, to be honest.
> > > 
> > > Prior to kernel_read_file() the same or verify similar code existed in
> > > multiple places in the kernel.  The kernel_read_file() API
> > > consolidated the existing code adding the pre and post security hooks.
> > > 
> > > With this new design of not using a private vmalloc, will the file
> > > data be accessible prior to the post security hooks?  From an IMA
> > > perspective, the hooks are used for measuring and/or verifying the
> > > integrity of the file.
> > 
> > File data is already accessible prior to the post security hooks.
> > Look how kernel_read_file works:
> > 
> >         ret = deny_write_access(file);
> >         ret = security_kernel_read_file(file, id);
> >                 *buf = vmalloc(i_size);
> >                 bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
> >         ret = security_kernel_post_read_file(file, *buf, i_size, id);
> > 
> > kernel_read() will read the data into the page cache and then copy it
> > into the vmalloc'd buffer.  There's nothing here to prevent read accesses
> > to the file.
> 
> The post security hook needs to access to the file data in order to
> calculate the file hash.  The question is whether prior to returning
> from kernel_read_file() the caller can access the file data.

In the case of firmware, I'm asking if the device will be able to
access the file data before kernel_read_file() returns.

Mimi
Matthew Wilcox (Oracle) June 8, 2020, 1:32 p.m. UTC | #6
On Mon, Jun 08, 2020 at 09:22:06AM -0400, Mimi Zohar wrote:
> On Mon, 2020-06-08 at 06:16 -0700, Matthew Wilcox wrote:
> > On Mon, Jun 08, 2020 at 09:03:21AM -0400, Mimi Zohar wrote:
> > > With this new design of not using a private vmalloc, will the file
> > > data be accessible prior to the post security hooks?  From an IMA
> > > perspective, the hooks are used for measuring and/or verifying the
> > > integrity of the file.
> > 
> > File data is already accessible prior to the post security hooks.
> > Look how kernel_read_file works:
> > 
> >         ret = deny_write_access(file);
> >         ret = security_kernel_read_file(file, id);
> >                 *buf = vmalloc(i_size);
> >                 bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
> >         ret = security_kernel_post_read_file(file, *buf, i_size, id);
> > 
> > kernel_read() will read the data into the page cache and then copy it
> > into the vmalloc'd buffer.  There's nothing here to prevent read accesses
> > to the file.
> 
> The post security hook needs to access to the file data in order to
> calculate the file hash.  The question is whether prior to returning
> from kernel_read_file() the caller can access the file data.

Whether you copy the data (as today) or map it (as I'm proposing),
the data goes into the page cache.  It's up to the security system to
block access to the page cache until it's been verified.
Scott Branden June 8, 2020, 10:29 p.m. UTC | #7
Hi Matthew,

I am requesting the experts in the filesystem subsystem to come to a 
consensus here.
This is not my area of expertise at all but every time I have addressed 
all of the
outstanding concerns someone else comes along and raises another one.

Please see my comments below.

On 2020-06-06 8:52 a.m., Matthew Wilcox wrote:
> On Fri, Jun 05, 2020 at 10:04:51PM -0700, Scott Branden wrote:
>> -int kernel_read_file(struct file *file, void **buf, loff_t *size,
>> -		     loff_t max_size, enum kernel_read_file_id id)
>> -{
>> -	loff_t i_size, pos;
Please note how checkpatch generated the diff here.  The code 
modifications
below are for a new function kernel_pread_file, they do not modify the 
existing API
kernel_read_file.  kernel_read_file requests the ENTIRE file is read.  
So we need to be
able to differentiate whether it is ok to read just a portion of the 
file or not.
>> +int kernel_pread_file(struct file *file, void **buf, loff_t *size,
>> +		      loff_t pos, loff_t max_size,
>> +		      enum kernel_pread_opt opt,
>> +		      enum kernel_read_file_id id)
So, to share common code a new kernel_pread_opt needed to be added in 
order to specify whether
it was ok to read a partial file or not, and provide an offset into the 
file where to begin reading.
The meaning of parameters doesn't change in the bonkers API. max_size 
still means max size, etc.
These options are needed so common code can be shared with 
kernel_read_file api.

The partial read option is then needed further in the depths of the 
kernel read for IMA operation as IMA does
things differently for optimization of whether it is OK to do a partial 
read of the file or not.
>> +{
>> +	loff_t alloc_size;
>> +	loff_t buf_pos;
>> +	loff_t read_end;
>> +	loff_t i_size;
>>   	ssize_t bytes = 0;
>>   	int ret;
>>   
> Look, it's not your fault, but this is a great example of how we end
> up with atrocious interfaces.  Someone comes along and implements a
> simple DWIM interface that solves their problem.  Then somebody else
> adds a slight variant that solves their problem, and so on and so on,
> and we end up with this bonkers API where the arguments literally change
> meaning depending on other arguments.
I don't see what arguments are changing meaning.  Please explain what is 
changing meaning.
The diff below is for kernel_pread_file, not kernel_read_file. Perhaps 
that is where your confusion is.
>
>> @@ -950,21 +955,31 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
>>   		ret = -EINVAL;
>>   		goto out;
>>   	}
>> -	if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
>> +
>> +	/* Default read to end of file */
>> +	read_end = i_size;
>> +
>> +	/* Allow reading partial portion of file */
>> +	if ((opt == KERNEL_PREAD_PART) &&
>> +	    (i_size > (pos + max_size)))
>> +		read_end = pos + max_size;
>> +
>> +	alloc_size = read_end - pos;
>> +	if (i_size > SIZE_MAX || (max_size > 0 && alloc_size > max_size)) {
>>   		ret = -EFBIG;
>>   		goto out;
> ... like that.
like what?  We need to determine how much of the file to read based on 
size of file, position in file, and max size we can read.
>
> I think what we actually want is:
>
> ssize_t vmap_file_range(struct file *, loff_t start, loff_t end, void **bufp);
> void vunmap_file_range(struct file *, void *buf);
>
> If end > i_size, limit the allocation to i_size.  Returns the number
> of bytes allocated, or a negative errno.  Writes the pointer allocated
> to *bufp.  Internally, it should use the page cache to read in the pages
> (taking appropriate reference counts).  Then it maps them using vmap()
> instead of copying them to a private vmalloc() array.
> kernel_read_file() can be converted to use this API.  The users will
> need to be changed to call kernel_read_end(struct file *file, void *buf)
> instead of vfree() so it can call allow_write_access() for them.
>
> vmap_file_range() has a lot of potential uses.  I'm surprised we don't
> have it already, to be honest.
Such a change sounds like it could be done in a later patch series.
It's an incomplete solution.  It would work for some of the needed 
operations but not others.
For kernel_read_file, I don't see how in your new API it indicates if 
the end of the file was reached or not.
Also, please note that buffers may be preallocated  and shouldn't be 
freed by the kernel in some cases and
allocated and freed by the kernel in others.

Your proposed change doesn't exist, is not as simple as it sounds, and
does not meet all the needs of the existing kernel_read_file
function, IMA, and the new partial kernel_pread_file.

Patch v7 does not break existing functions or rearchitect things in a 
dramatic way.  They fit into existing code,
will not break the existing codepaths (which some didn't even have a 
test case until I added one), and can
be improved upon as needed with your vmap_file_range or others once those 
have been developed, tested, and
proven by someone.

I would like the experts here to decide on what needs to be done so we 
can move forward
and get kernel_pread_file support added soon.
Thanks,
Scott
Matthew Wilcox (Oracle) June 9, 2020, 1:21 p.m. UTC | #8
On Mon, Jun 08, 2020 at 03:29:22PM -0700, Scott Branden wrote:
> Hi Matthew,
> 
> I am requesting the experts in the filesystem subsystem to come to a
> consensus here.
> This is not my area of expertise at all but every time I have addressed all
> of the
> outstanding concerns someone else comes along and raises another one.

I appreciate it's frustrating for you, but this is the nature of
patch review.  I haven't even read the first five or so submissions.
I can see them in my inbox and they look like long threads.  I'm not
particularly inclined to read them.  I happened to read v6, and reacted
to the API being ugly.

> Please see me comments below.
> 
> On 2020-06-06 8:52 a.m., Matthew Wilcox wrote:
> > On Fri, Jun 05, 2020 at 10:04:51PM -0700, Scott Branden wrote:
> > > -int kernel_read_file(struct file *file, void **buf, loff_t *size,
> > > -		     loff_t max_size, enum kernel_read_file_id id)
> > > -{
> > > -	loff_t i_size, pos;
> Please note that how checkpatch generated the diff here.  The code
> modifications
> below are for a new function kernel_pread_file, they do not modify the
> existing API
> kernel_read_file.  kernel_read_file requests the ENTIRE file is read.  So we
> need to be
> able to differentiate whether it is ok to read just a portion of the file or
> not.

You've gone about this in entirely the wrong way though.  This enum to
read the entire file or a partial is just bad design.

> > > +int kernel_pread_file(struct file *file, void **buf, loff_t *size,
> > > +		      loff_t pos, loff_t max_size,
> > > +		      enum kernel_pread_opt opt,
> > > +		      enum kernel_read_file_id id)
> So, to share common code a new kernel_pread_opt needed to be added in order
> to specify whether
> it was ok to read a partial file or not, and provide an offset into the file
> where to begin reading.
> The meaning of parameters doesn't change in the bonkers API. max_size still
> means max size, etc.
> These options are needed so common code can be shared with kernel_read_file
> api.

Does pread() in userspace take seven parameters?  No.  It takes four.
What you're doing is taking all the complexity of all of the interfaces
and stuffing it all down into the bottom function instead of handling
some of the complexity in the wrapper functions.  For example, you
could support the functionality of 'max_size' in kernel_read_file()
and leave it out of the kernel_pread_file() interface.

> > I think what we actually want is:
> > 
> > ssize_t vmap_file_range(struct file *, loff_t start, loff_t end, void **bufp);
> > void vunmap_file_range(struct file *, void *buf);
> > 
> > If end > i_size, limit the allocation to i_size.  Returns the number
> > of bytes allocated, or a negative errno.  Writes the pointer allocated
> > to *bufp.  Internally, it should use the page cache to read in the pages
> > (taking appropriate reference counts).  Then it maps them using vmap()
> > instead of copying them to a private vmalloc() array.
> > kernel_read_file() can be converted to use this API.  The users will
> > need to be changed to call kernel_read_end(struct file *file, void *buf)
> > instead of vfree() so it can call allow_write_access() for them.
> > 
> > vmap_file_range() has a lot of potential uses.  I'm surprised we don't
> > have it already, to be honest.
> Such a change sounds like it could be done in a later patch series.
> It's an incomplete solution.  It would work for some of the needed
> operations but not others.
> For kernel_read_file, I don't see how in your new API it indicates if the
> end of the file was reached or not.

That's the point.  It doesn't.  If a caller needs that, then they can
figure that out themselves.

> Also, please note that buffers may be preallocated  and shouldn't be freed
> by the kernel in some cases and
> allocated and freed by the kernel in others.

You're trying to build the swiss army knife of functions.  Swiss army
knives are useful, but they're no good for carving a steak.

> I would like the experts here to decide on what needs to be done so we can
> move forward
> and get kernel_pread_file support added soon.

You know, you haven't even said _why_ you want this.  The cover letter
just says "I want this", and doesn't say why it's needed.
Scott Branden June 9, 2020, 10:55 p.m. UTC | #9
Hi Matthew,

On 2020-06-09 6:21 a.m., Matthew Wilcox wrote:
> On Mon, Jun 08, 2020 at 03:29:22PM -0700, Scott Branden wrote:
>> Hi Matthew,
>>
>> I am requesting the experts in the filesystem subsystem to come to a
>> consensus here.
>> This is not my area of expertise at all but every time I have addressed all
>> of the
>> outstanding concerns someone else comes along and raises another one.
> I appreciate it's frustrating for you, but this is the nature of
> patch review.  I haven't even read the first five or so submissions.
> I can see them in my inbox and they look like long threads.  I'm not
> particularly inclined to read them.  I happened to read v6, and reacted
> to the API being ugly.
Thanks for the review.  Yes, I do see the enum being ugly now
and have removed it in v8 of the patch.  Hopefully it addresses
your concerns.  More comments below.
>
>> Please see me comments below.
>>
>> On 2020-06-06 8:52 a.m., Matthew Wilcox wrote:
>>> On Fri, Jun 05, 2020 at 10:04:51PM -0700, Scott Branden wrote:
>>>> -int kernel_read_file(struct file *file, void **buf, loff_t *size,
>>>> -		     loff_t max_size, enum kernel_read_file_id id)
>>>> -{
>>>> -	loff_t i_size, pos;
>> Please note that how checkpatch generated the diff here.  The code
>> modifications
>> below are for a new function kernel_pread_file, they do not modify the
>> existing API
>> kernel_read_file.  kernel_read_file requests the ENTIRE file is read.  So we
>> need to be
>> able to differentiate whether it is ok to read just a portion of the file or
>> not.
> You've gone about this in entirely the wrong way though.  This enum to
> read the entire file or a partial is just bad design.
Your point on the enum is valid.
I've removed it from design.  Hopefully it is cleaner now.
>
>>>> +int kernel_pread_file(struct file *file, void **buf, loff_t *size,
>>>> +		      loff_t pos, loff_t max_size,
>>>> +		      enum kernel_pread_opt opt,
>>>> +		      enum kernel_read_file_id id)
>> So, to share common code a new kernel_pread_opt needed to be added in order
>> to specify whether
>> it was ok to read a partial file or not, and provide an offset into the file
>> where to begin reading.
>> The meaning of parameters doesn't change in the bonkers API. max_size still
>> means max size, etc.
>> These options are needed so common code can be shared with kernel_read_file
>> api.
> Does pread() in userspace take seven parameters?  No.  It takes four.
> What you're doing is taking all the complexity of all of the interfaces
> and stuffing it all down into the bottom function instead of handling
> some of the complexity in the wrapper functions.  For example, you
> could support the functionality of 'max_size' in kernel_read_file()
> and leave it out of the kernel_pread_file() interface.
I have now removed the enum parameter from the kernel pread call,
so it is down to 6.
The other 2 parameters are necessary as they are in kernel read.

max_size makes no sense to remove - it serves the same purpose
as in userspace pread and read functions.  To specify the max size
to read.
>>> I think what we actually want is:
>>>
>>> ssize_t vmap_file_range(struct file *, loff_t start, loff_t end, void **bufp);
>>> void vunmap_file_range(struct file *, void *buf);
>>>
>>> If end > i_size, limit the allocation to i_size.  Returns the number
>>> of bytes allocated, or a negative errno.  Writes the pointer allocated
>>> to *bufp.  Internally, it should use the page cache to read in the pages
>>> (taking appropriate reference counts).  Then it maps them using vmap()
>>> instead of copying them to a private vmalloc() array.
>>> kernel_read_file() can be converted to use this API.  The users will
>>> need to be changed to call kernel_read_end(struct file *file, void *buf)
>>> instead of vfree() so it can call allow_write_access() for them.
>>>
>>> vmap_file_range() has a lot of potential uses.  I'm surprised we don't
>>> have it already, to be honest.
>> Such a change sounds like it could be done in a later patch series.
>> It's an incomplete solution.  It would work for some of the needed
>> operations but not others.
>> For kernel_read_file, I don't see how in your new API it indicates if the
>> end of the file was reached or not.
> That's the point.  It doesn't.  If a caller needs that, then they can
> figure that out themselves.
No, they can't.  The caller only calls kernel_read_file once and expects
the whole file to be read.  The kernel_read_file doesn't work like 
userspace.
There is no tracking, as there is in userspace, of where in the file you have read.
>
>> Also, please note that buffers may be preallocated  and shouldn't be freed
>> by the kernel in some cases and
>> allocated and freed by the kernel in others.
> You're trying to build the swiss army knife of functions.  Swiss army
> knives are useful, but they're no good for carving a steak.
Hopefully I'm carving steak now.
>> I would like the experts here to decide on what needs to be done so we can
>> move forward
>> and get kernel_pread_file support added soon.
> You know, you haven't even said _why_ you want this.  The cover letter
> just says "I want this", and doesn't say why it's needed.
Cover letter updated.

Thanks,
Scott
diff mbox series

Patch

diff --git a/fs/exec.c b/fs/exec.c
index 306fc566171e..751f5ddc7538 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -927,10 +927,15 @@  struct file *open_exec(const char *name)
 }
 EXPORT_SYMBOL(open_exec);
 
-int kernel_read_file(struct file *file, void **buf, loff_t *size,
-		     loff_t max_size, enum kernel_read_file_id id)
-{
-	loff_t i_size, pos;
+int kernel_pread_file(struct file *file, void **buf, loff_t *size,
+		      loff_t pos, loff_t max_size,
+		      enum kernel_pread_opt opt,
+		      enum kernel_read_file_id id)
+{
+	loff_t alloc_size;
+	loff_t buf_pos;
+	loff_t read_end;
+	loff_t i_size;
 	ssize_t bytes = 0;
 	int ret;
 
@@ -950,21 +955,31 @@  int kernel_read_file(struct file *file, void **buf, loff_t *size,
 		ret = -EINVAL;
 		goto out;
 	}
-	if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
+
+	/* Default read to end of file */
+	read_end = i_size;
+
+	/* Allow reading partial portion of file */
+	if ((opt == KERNEL_PREAD_PART) &&
+	    (i_size > (pos + max_size)))
+		read_end = pos + max_size;
+
+	alloc_size = read_end - pos;
+	if (i_size > SIZE_MAX || (max_size > 0 && alloc_size > max_size)) {
 		ret = -EFBIG;
 		goto out;
 	}
 
 	if (id != READING_FIRMWARE_PREALLOC_BUFFER)
-		*buf = vmalloc(i_size);
+		*buf = vmalloc(alloc_size);
 	if (!*buf) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	pos = 0;
-	while (pos < i_size) {
-		bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
+	buf_pos = 0;
+	while (pos < read_end) {
+		bytes = kernel_read(file, *buf + buf_pos, read_end - pos, &pos);
 		if (bytes < 0) {
 			ret = bytes;
 			goto out_free;
@@ -972,14 +987,16 @@  int kernel_read_file(struct file *file, void **buf, loff_t *size,
 
 		if (bytes == 0)
 			break;
+
+		buf_pos += bytes;
 	}
 
-	if (pos != i_size) {
+	if (pos != read_end) {
 		ret = -EIO;
 		goto out_free;
 	}
 
-	ret = security_kernel_post_read_file(file, *buf, i_size, id);
+	ret = security_kernel_post_read_file(file, *buf, alloc_size, id);
 	if (!ret)
 		*size = pos;
 
@@ -995,10 +1012,20 @@  int kernel_read_file(struct file *file, void **buf, loff_t *size,
 	allow_write_access(file);
 	return ret;
 }
+
+int kernel_read_file(struct file *file, void **buf, loff_t *size,
+		     loff_t max_size, enum kernel_read_file_id id)
+{
+	return kernel_pread_file(file, buf, size, 0, max_size,
+				 KERNEL_PREAD_WHOLE, id);
+}
 EXPORT_SYMBOL_GPL(kernel_read_file);
 
-int kernel_read_file_from_path(const char *path, void **buf, loff_t *size,
-			       loff_t max_size, enum kernel_read_file_id id)
+int kernel_pread_file_from_path(const char *path, void **buf,
+				loff_t *size, loff_t pos,
+				loff_t max_size,
+				enum kernel_pread_opt opt,
+				enum kernel_read_file_id id)
 {
 	struct file *file;
 	int ret;
@@ -1010,15 +1037,24 @@  int kernel_read_file_from_path(const char *path, void **buf, loff_t *size,
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	ret = kernel_read_file(file, buf, size, max_size, id);
+	ret = kernel_pread_file(file, buf, size, pos, max_size, opt, id);
 	fput(file);
 	return ret;
 }
+
+int kernel_read_file_from_path(const char *path, void **buf, loff_t *size,
+			       loff_t max_size, enum kernel_read_file_id id)
+{
+	return kernel_pread_file_from_path(path, buf, size, 0, max_size,
+					   KERNEL_PREAD_WHOLE, id);
+}
 EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
 
-int kernel_read_file_from_path_initns(const char *path, void **buf,
-				      loff_t *size, loff_t max_size,
-				      enum kernel_read_file_id id)
+int kernel_pread_file_from_path_initns(const char *path, void **buf,
+				       loff_t *size, loff_t pos,
+				       loff_t max_size,
+				       enum kernel_pread_opt opt,
+				       enum kernel_read_file_id id)
 {
 	struct file *file;
 	struct path root;
@@ -1036,14 +1072,24 @@  int kernel_read_file_from_path_initns(const char *path, void **buf,
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	ret = kernel_read_file(file, buf, size, max_size, id);
+	ret = kernel_pread_file(file, buf, size, pos, max_size, opt, id);
 	fput(file);
 	return ret;
 }
+
+int kernel_read_file_from_path_initns(const char *path, void **buf,
+				      loff_t *size, loff_t max_size,
+				      enum kernel_read_file_id id)
+{
+	return kernel_pread_file_from_path_initns(path, buf, size, 0, max_size,
+						  KERNEL_PREAD_WHOLE, id);
+}
 EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns);
 
-int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
-			     enum kernel_read_file_id id)
+int kernel_pread_file_from_fd(int fd, void **buf, loff_t *size, loff_t pos,
+			      loff_t max_size,
+			      enum kernel_pread_opt opt,
+			      enum kernel_read_file_id id)
 {
 	struct fd f = fdget(fd);
 	int ret = -EBADF;
@@ -1051,11 +1097,18 @@  int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
 	if (!f.file)
 		goto out;
 
-	ret = kernel_read_file(f.file, buf, size, max_size, id);
+	ret = kernel_pread_file(f.file, buf, size, pos, max_size, opt, id);
 out:
 	fdput(f);
 	return ret;
 }
+
+int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
+			     enum kernel_read_file_id id)
+{
+	return kernel_pread_file_from_fd(fd, buf, size, 0, max_size,
+					 KERNEL_PREAD_WHOLE, id);
+}
 EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
 
 #if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
diff --git a/include/linux/fs.h b/include/linux/fs.h
index db604c7bd79c..aee7600958ef 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3048,12 +3048,41 @@  static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id)
 	return kernel_read_file_str[id];
 }
 
+/**
+ * enum kernel_pread_opt - options to control pread file loading behaviour
+ *
+ * @KERNEL_PREAD_WHOLE: Only allow reading of whole file.
+ * @KERNEL_PREAD_PART: Allow reading part of file.
+ */
+enum kernel_pread_opt {
+	KERNEL_PREAD_WHOLE = 0,
+	KERNEL_PREAD_PART = BIT(0),
+};
+
+int kernel_pread_file(struct file *file, void **buf, loff_t *size,
+		      loff_t pos, loff_t max_size,
+		      enum kernel_pread_opt opt,
+		      enum kernel_read_file_id id);
 extern int kernel_read_file(struct file *, void **, loff_t *, loff_t,
 			    enum kernel_read_file_id);
+int kernel_pread_file_from_path(const char *path, void **buf,
+				loff_t *size, loff_t pos,
+				loff_t max_size,
+				enum kernel_pread_opt opt,
+				enum kernel_read_file_id id);
 extern int kernel_read_file_from_path(const char *, void **, loff_t *, loff_t,
 				      enum kernel_read_file_id);
+int kernel_pread_file_from_path_initns(const char *path, void **buf,
+				       loff_t *size, loff_t pos,
+				       loff_t max_size,
+				       enum kernel_pread_opt opt,
+				       enum kernel_read_file_id id);
 extern int kernel_read_file_from_path_initns(const char *, void **, loff_t *, loff_t,
 					     enum kernel_read_file_id);
+int kernel_pread_file_from_fd(int fd, void **buf, loff_t *size,
+			      loff_t pos, loff_t max_size,
+			      enum kernel_pread_opt opt,
+			      enum kernel_read_file_id id);
 extern int kernel_read_file_from_fd(int, void **, loff_t *, loff_t,
 				    enum kernel_read_file_id);
 extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);