diff mbox series

[v2,1/1] NFSD: Simplify READ_PLUS

Message ID 20220907195259.926736-2-anna@kernel.org (mailing list archive)
State New, archived
Headers show
Series NFSD: Simplify READ_PLUS | expand

Commit Message

Anna Schumaker Sept. 7, 2022, 7:52 p.m. UTC
From: Anna Schumaker <Anna.Schumaker@Netapp.com>

Chuck had suggested reverting READ_PLUS so it returns a single DATA
segment covering the requested read range. This prepares the server for
a future "sparse read" function so support can easily be added without
needing to rip out the old READ_PLUS code at the same time.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfsd/nfs4xdr.c | 139 +++++++++++-----------------------------------
 1 file changed, 32 insertions(+), 107 deletions(-)

Comments

Chuck Lever III Sept. 7, 2022, 8:29 p.m. UTC | #1
Be sure to Cc: Jeff on these. Thanks!


> On Sep 7, 2022, at 3:52 PM, Anna Schumaker <anna@kernel.org> wrote:
> 
> From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> 
> Chuck had suggested reverting READ_PLUS so it returns a single DATA
> segment covering the requested read range. This prepares the server for
> a future "sparse read" function so support can easily be added without
> needing to rip out the old READ_PLUS code at the same time.
> 
> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> ---
> fs/nfsd/nfs4xdr.c | 139 +++++++++++-----------------------------------
> 1 file changed, 32 insertions(+), 107 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 1e9690a061ec..bcc8c385faf2 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
> 
> static __be32
> nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
> -			    struct nfsd4_read *read,
> -			    unsigned long *maxcount, u32 *eof,
> -			    loff_t *pos)
> +			    struct nfsd4_read *read)
> {
> -	struct xdr_stream *xdr = resp->xdr;
> +	bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
> 	struct file *file = read->rd_nf->nf_file;
> -	int starting_len = xdr->buf->len;
> -	loff_t hole_pos;
> -	__be32 nfserr;
> -	__be32 *p, tmp;
> -	__be64 tmp64;
> -
> -	hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> -	if (hole_pos > read->rd_offset)
> -		*maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
> -	*maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
> +	struct xdr_stream *xdr = resp->xdr;
> +	unsigned long maxcount;
> +	__be32 nfserr, *p;
> 
> 	/* Content type, offset, byte count */
> 	p = xdr_reserve_space(xdr, 4 + 8 + 4);
> 	if (!p)
> -		return nfserr_resource;
> +		return nfserr_io;

Wouldn't nfserr_rep_too_big be a more appropriate status for running
off the end of the send buffer? I'm not 100% sure, but I would expect
that exhausting send buffer space would imply the reply has grown too
large.


> +	if (resp->xdr->buf->page_len && splice_ok) {
> +		WARN_ON_ONCE(splice_ok);
> +		return nfserr_io;
> +	}

I wish I understood why this test was needed. It seems to have been
copied and pasted from historic code into nfsd4_encode_read(), and
there have been recent mechanical changes to it, but there's no
comment explaining it there...

In any event, this seems to be checking for a server software bug,
so maybe this should return nfserr_serverfault. Oddly that status
code isn't defined yet.


Do you have some performance results for v2?


> -	read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
> -	if (read->rd_vlen < 0)
> -		return nfserr_resource;
> +	maxcount = min_t(unsigned long, read->rd_length,
> +			 (xdr->buf->buflen - xdr->buf->len));
> 
> -	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
> -			    resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
> +	if (file->f_op->splice_read && splice_ok)
> +		nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
> +	else
> +		nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
> 	if (nfserr)
> 		return nfserr;
> -	xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
> 
> -	tmp = htonl(NFS4_CONTENT_DATA);
> -	write_bytes_to_xdr_buf(xdr->buf, starting_len,      &tmp,   4);
> -	tmp64 = cpu_to_be64(read->rd_offset);
> -	write_bytes_to_xdr_buf(xdr->buf, starting_len + 4,  &tmp64, 8);
> -	tmp = htonl(*maxcount);
> -	write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp,   4);
> -
> -	tmp = xdr_zero;
> -	write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
> -			       xdr_pad_size(*maxcount));
> -	return nfs_ok;
> -}
> -
> -static __be32
> -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
> -			    struct nfsd4_read *read,
> -			    unsigned long *maxcount, u32 *eof)
> -{
> -	struct file *file = read->rd_nf->nf_file;
> -	loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
> -	loff_t f_size = i_size_read(file_inode(file));
> -	unsigned long count;
> -	__be32 *p;
> -
> -	if (data_pos == -ENXIO)
> -		data_pos = f_size;
> -	else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
> -		return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
> -	count = data_pos - read->rd_offset;
> -
> -	/* Content type, offset, byte count */
> -	p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
> -	if (!p)
> -		return nfserr_resource;
> -
> -	*p++ = htonl(NFS4_CONTENT_HOLE);
> +	*p++ = cpu_to_be32(NFS4_CONTENT_DATA);
> 	p = xdr_encode_hyper(p, read->rd_offset);
> -	p = xdr_encode_hyper(p, count);
> +	*p = cpu_to_be32(read->rd_length);
> 
> -	*eof = (read->rd_offset + count) >= f_size;
> -	*maxcount = min_t(unsigned long, count, *maxcount);
> 	return nfs_ok;
> }
> 
> @@ -4811,69 +4769,36 @@ static __be32
> nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
> 		       struct nfsd4_read *read)
> {
> -	unsigned long maxcount, count;
> +	struct file *file = read->rd_nf->nf_file;
> 	struct xdr_stream *xdr = resp->xdr;
> -	struct file *file;
> 	int starting_len = xdr->buf->len;
> -	int last_segment = xdr->buf->len;
> -	int segments = 0;
> -	__be32 *p, tmp;
> -	bool is_data;
> -	loff_t pos;
> -	u32 eof;
> +	u32 segments = 0;
> +	__be32 *p;
> 
> 	if (nfserr)
> 		return nfserr;
> -	file = read->rd_nf->nf_file;
> 
> 	/* eof flag, segment count */
> 	p = xdr_reserve_space(xdr, 4 + 4);
> 	if (!p)
> -		return nfserr_resource;
> +		return nfserr_io;
> 	xdr_commit_encode(xdr);
> 
> -	maxcount = min_t(unsigned long, read->rd_length,
> -			 (xdr->buf->buflen - xdr->buf->len));
> -	count    = maxcount;
> -
> -	eof = read->rd_offset >= i_size_read(file_inode(file));
> -	if (eof)
> +	read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
> +	if (read->rd_eof)
> 		goto out;
> 
> -	pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> -	is_data = pos > read->rd_offset;
> -
> -	while (count > 0 && !eof) {
> -		maxcount = count;
> -		if (is_data)
> -			nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
> -						segments == 0 ? &pos : NULL);
> -		else
> -			nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
> -		if (nfserr)
> -			goto out;
> -		count -= maxcount;
> -		read->rd_offset += maxcount;
> -		is_data = !is_data;
> -		last_segment = xdr->buf->len;
> -		segments++;
> -	}
> -
> -out:
> -	if (nfserr && segments == 0)
> +	nfserr = nfsd4_encode_read_plus_data(resp, read);
> +	if (nfserr) {
> 		xdr_truncate_encode(xdr, starting_len);
> -	else {
> -		if (nfserr) {
> -			xdr_truncate_encode(xdr, last_segment);
> -			nfserr = nfs_ok;
> -			eof = 0;
> -		}
> -		tmp = htonl(eof);
> -		write_bytes_to_xdr_buf(xdr->buf, starting_len,     &tmp, 4);
> -		tmp = htonl(segments);
> -		write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
> +		return nfserr;
> 	}
> 
> +	segments++;
> +
> +out:
> +	p = xdr_encode_bool(p, read->rd_eof);
> +	*p = cpu_to_be32(segments);
> 	return nfserr;
> }
> 
> -- 
> 2.37.2
> 

--
Chuck Lever
Anna Schumaker Sept. 7, 2022, 8:37 p.m. UTC | #2
On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III <chuck.lever@oracle.com> wrote:
>
> Be sure to Cc: Jeff on these. Thanks!
>
>
> > On Sep 7, 2022, at 3:52 PM, Anna Schumaker <anna@kernel.org> wrote:
> >
> > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> >
> > Chuck had suggested reverting READ_PLUS so it returns a single DATA
> > segment covering the requested read range. This prepares the server for
> > a future "sparse read" function so support can easily be added without
> > needing to rip out the old READ_PLUS code at the same time.
> >
> > Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > ---
> > fs/nfsd/nfs4xdr.c | 139 +++++++++++-----------------------------------
> > 1 file changed, 32 insertions(+), 107 deletions(-)
> >
> > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > index 1e9690a061ec..bcc8c385faf2 100644
> > --- a/fs/nfsd/nfs4xdr.c
> > +++ b/fs/nfsd/nfs4xdr.c
> > @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
> >
> > static __be32
> > nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
> > -                         struct nfsd4_read *read,
> > -                         unsigned long *maxcount, u32 *eof,
> > -                         loff_t *pos)
> > +                         struct nfsd4_read *read)
> > {
> > -     struct xdr_stream *xdr = resp->xdr;
> > +     bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
> >       struct file *file = read->rd_nf->nf_file;
> > -     int starting_len = xdr->buf->len;
> > -     loff_t hole_pos;
> > -     __be32 nfserr;
> > -     __be32 *p, tmp;
> > -     __be64 tmp64;
> > -
> > -     hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> > -     if (hole_pos > read->rd_offset)
> > -             *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
> > -     *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
> > +     struct xdr_stream *xdr = resp->xdr;
> > +     unsigned long maxcount;
> > +     __be32 nfserr, *p;
> >
> >       /* Content type, offset, byte count */
> >       p = xdr_reserve_space(xdr, 4 + 8 + 4);
> >       if (!p)
> > -             return nfserr_resource;
> > +             return nfserr_io;
>
> Wouldn't nfserr_rep_too_big be a more appropriate status for running
> off the end of the send buffer? I'm not 100% sure, but I would expect
> that exhausting send buffer space would imply the reply has grown too
> large.

I can switch it to that, no problem.

>
>
> > +     if (resp->xdr->buf->page_len && splice_ok) {
> > +             WARN_ON_ONCE(splice_ok);
> > +             return nfserr_io;
> > +     }
>
> I wish I understood why this test was needed. It seems to have been
> copied and pasted from historic code into nfsd4_encode_read(), and
> there have been recent mechanical changes to it, but there's no
> comment explaining it there...

Yeah, I saw this was in the read code and assumed it was an important
check so I added it here too.
>
> In any event, this seems to be checking for a server software bug,
> so maybe this should return nfserr_serverfault. Oddly that status
> code isn't defined yet.

Do you want me to add that code and return it in this patch?

>
>
> Do you have some performance results for v2?

Not yet, I have it running now so hopefully I'll have something ready
by tomorrow morning.

Anna
>
>
> > -     read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
> > -     if (read->rd_vlen < 0)
> > -             return nfserr_resource;
> > +     maxcount = min_t(unsigned long, read->rd_length,
> > +                      (xdr->buf->buflen - xdr->buf->len));
> >
> > -     nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
> > -                         resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
> > +     if (file->f_op->splice_read && splice_ok)
> > +             nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
> > +     else
> > +             nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
> >       if (nfserr)
> >               return nfserr;
> > -     xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
> >
> > -     tmp = htonl(NFS4_CONTENT_DATA);
> > -     write_bytes_to_xdr_buf(xdr->buf, starting_len,      &tmp,   4);
> > -     tmp64 = cpu_to_be64(read->rd_offset);
> > -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 4,  &tmp64, 8);
> > -     tmp = htonl(*maxcount);
> > -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp,   4);
> > -
> > -     tmp = xdr_zero;
> > -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
> > -                            xdr_pad_size(*maxcount));
> > -     return nfs_ok;
> > -}
> > -
> > -static __be32
> > -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
> > -                         struct nfsd4_read *read,
> > -                         unsigned long *maxcount, u32 *eof)
> > -{
> > -     struct file *file = read->rd_nf->nf_file;
> > -     loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
> > -     loff_t f_size = i_size_read(file_inode(file));
> > -     unsigned long count;
> > -     __be32 *p;
> > -
> > -     if (data_pos == -ENXIO)
> > -             data_pos = f_size;
> > -     else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
> > -             return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
> > -     count = data_pos - read->rd_offset;
> > -
> > -     /* Content type, offset, byte count */
> > -     p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
> > -     if (!p)
> > -             return nfserr_resource;
> > -
> > -     *p++ = htonl(NFS4_CONTENT_HOLE);
> > +     *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
> >       p = xdr_encode_hyper(p, read->rd_offset);
> > -     p = xdr_encode_hyper(p, count);
> > +     *p = cpu_to_be32(read->rd_length);
> >
> > -     *eof = (read->rd_offset + count) >= f_size;
> > -     *maxcount = min_t(unsigned long, count, *maxcount);
> >       return nfs_ok;
> > }
> >
> > @@ -4811,69 +4769,36 @@ static __be32
> > nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
> >                      struct nfsd4_read *read)
> > {
> > -     unsigned long maxcount, count;
> > +     struct file *file = read->rd_nf->nf_file;
> >       struct xdr_stream *xdr = resp->xdr;
> > -     struct file *file;
> >       int starting_len = xdr->buf->len;
> > -     int last_segment = xdr->buf->len;
> > -     int segments = 0;
> > -     __be32 *p, tmp;
> > -     bool is_data;
> > -     loff_t pos;
> > -     u32 eof;
> > +     u32 segments = 0;
> > +     __be32 *p;
> >
> >       if (nfserr)
> >               return nfserr;
> > -     file = read->rd_nf->nf_file;
> >
> >       /* eof flag, segment count */
> >       p = xdr_reserve_space(xdr, 4 + 4);
> >       if (!p)
> > -             return nfserr_resource;
> > +             return nfserr_io;
> >       xdr_commit_encode(xdr);
> >
> > -     maxcount = min_t(unsigned long, read->rd_length,
> > -                      (xdr->buf->buflen - xdr->buf->len));
> > -     count    = maxcount;
> > -
> > -     eof = read->rd_offset >= i_size_read(file_inode(file));
> > -     if (eof)
> > +     read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
> > +     if (read->rd_eof)
> >               goto out;
> >
> > -     pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> > -     is_data = pos > read->rd_offset;
> > -
> > -     while (count > 0 && !eof) {
> > -             maxcount = count;
> > -             if (is_data)
> > -                     nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
> > -                                             segments == 0 ? &pos : NULL);
> > -             else
> > -                     nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
> > -             if (nfserr)
> > -                     goto out;
> > -             count -= maxcount;
> > -             read->rd_offset += maxcount;
> > -             is_data = !is_data;
> > -             last_segment = xdr->buf->len;
> > -             segments++;
> > -     }
> > -
> > -out:
> > -     if (nfserr && segments == 0)
> > +     nfserr = nfsd4_encode_read_plus_data(resp, read);
> > +     if (nfserr) {
> >               xdr_truncate_encode(xdr, starting_len);
> > -     else {
> > -             if (nfserr) {
> > -                     xdr_truncate_encode(xdr, last_segment);
> > -                     nfserr = nfs_ok;
> > -                     eof = 0;
> > -             }
> > -             tmp = htonl(eof);
> > -             write_bytes_to_xdr_buf(xdr->buf, starting_len,     &tmp, 4);
> > -             tmp = htonl(segments);
> > -             write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
> > +             return nfserr;
> >       }
> >
> > +     segments++;
> > +
> > +out:
> > +     p = xdr_encode_bool(p, read->rd_eof);
> > +     *p = cpu_to_be32(segments);
> >       return nfserr;
> > }
> >
> > --
> > 2.37.2
> >
>
> --
> Chuck Lever
>
>
>
Chuck Lever III Sept. 7, 2022, 8:51 p.m. UTC | #3
> On Sep 7, 2022, at 4:37 PM, Anna Schumaker <anna@kernel.org> wrote:
> 
> On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III <chuck.lever@oracle.com> wrote:
>> 
>> Be sure to Cc: Jeff on these. Thanks!
>> 
>> 
>>> On Sep 7, 2022, at 3:52 PM, Anna Schumaker <anna@kernel.org> wrote:
>>> 
>>> From: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>> 
>>> Chuck had suggested reverting READ_PLUS so it returns a single DATA
>>> segment covering the requested read range. This prepares the server for
>>> a future "sparse read" function so support can easily be added without
>>> needing to rip out the old READ_PLUS code at the same time.
>>> 
>>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>> ---
>>> fs/nfsd/nfs4xdr.c | 139 +++++++++++-----------------------------------
>>> 1 file changed, 32 insertions(+), 107 deletions(-)
>>> 
>>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>>> index 1e9690a061ec..bcc8c385faf2 100644
>>> --- a/fs/nfsd/nfs4xdr.c
>>> +++ b/fs/nfsd/nfs4xdr.c
>>> @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
>>> 
>>> static __be32
>>> nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
>>> -                         struct nfsd4_read *read,
>>> -                         unsigned long *maxcount, u32 *eof,
>>> -                         loff_t *pos)
>>> +                         struct nfsd4_read *read)
>>> {
>>> -     struct xdr_stream *xdr = resp->xdr;
>>> +     bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
>>>      struct file *file = read->rd_nf->nf_file;
>>> -     int starting_len = xdr->buf->len;
>>> -     loff_t hole_pos;
>>> -     __be32 nfserr;
>>> -     __be32 *p, tmp;
>>> -     __be64 tmp64;
>>> -
>>> -     hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
>>> -     if (hole_pos > read->rd_offset)
>>> -             *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
>>> -     *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
>>> +     struct xdr_stream *xdr = resp->xdr;
>>> +     unsigned long maxcount;
>>> +     __be32 nfserr, *p;
>>> 
>>>      /* Content type, offset, byte count */
>>>      p = xdr_reserve_space(xdr, 4 + 8 + 4);
>>>      if (!p)
>>> -             return nfserr_resource;
>>> +             return nfserr_io;
>> 
>> Wouldn't nfserr_rep_too_big be a more appropriate status for running
>> off the end of the send buffer? I'm not 100% sure, but I would expect
>> that exhausting send buffer space would imply the reply has grown too
>> large.
> 
> I can switch it to that, no problem.

I would like to hear opinions from protocol experts before we go
with that choice.


>>> +     if (resp->xdr->buf->page_len && splice_ok) {
>>> +             WARN_ON_ONCE(splice_ok);
>>> +             return nfserr_io;
>>> +     }
>> 
>> I wish I understood why this test was needed. It seems to have been
>> copied and pasted from historic code into nfsd4_encode_read(), and
>> there have been recent mechanical changes to it, but there's no
>> comment explaining it there...
> 
> Yeah, I saw this was in the read code and assumed it was an important
> check so I added it here too.
>> 
>> In any event, this seems to be checking for a server software bug,
>> so maybe this should return nfserr_serverfault. Oddly that status
>> code isn't defined yet.
> 
> Do you want me to add that code and return it in this patch?

Sure. Make that a predecessor patch and fix up the code in
nfsd4_encode_read() before using it for READ_PLUS in this patch.

Suggested patch description:

RESOURCE is the wrong status code here because

a) This encoder is shared between NFSv4.0 and NFSv4.1+, and
   NFSv4.1+ doesn't support RESOURCE, and
b) That status code might cause the client to retry, in which
   case it will get the same failure because this check seems
   to be looking for a server-side coding mistake.


>> Do you have some performance results for v2?
> 
> Not yet, I have it running now so hopefully I'll have something ready
> by tomorrow morning.

Great, thanks!


> Anna
>> 
>> 
>>> -     read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
>>> -     if (read->rd_vlen < 0)
>>> -             return nfserr_resource;
>>> +     maxcount = min_t(unsigned long, read->rd_length,
>>> +                      (xdr->buf->buflen - xdr->buf->len));
>>> 
>>> -     nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
>>> -                         resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
>>> +     if (file->f_op->splice_read && splice_ok)
>>> +             nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
>>> +     else
>>> +             nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
>>>      if (nfserr)
>>>              return nfserr;
>>> -     xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
>>> 
>>> -     tmp = htonl(NFS4_CONTENT_DATA);
>>> -     write_bytes_to_xdr_buf(xdr->buf, starting_len,      &tmp,   4);
>>> -     tmp64 = cpu_to_be64(read->rd_offset);
>>> -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 4,  &tmp64, 8);
>>> -     tmp = htonl(*maxcount);
>>> -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp,   4);
>>> -
>>> -     tmp = xdr_zero;
>>> -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
>>> -                            xdr_pad_size(*maxcount));
>>> -     return nfs_ok;
>>> -}
>>> -
>>> -static __be32
>>> -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
>>> -                         struct nfsd4_read *read,
>>> -                         unsigned long *maxcount, u32 *eof)
>>> -{
>>> -     struct file *file = read->rd_nf->nf_file;
>>> -     loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
>>> -     loff_t f_size = i_size_read(file_inode(file));
>>> -     unsigned long count;
>>> -     __be32 *p;
>>> -
>>> -     if (data_pos == -ENXIO)
>>> -             data_pos = f_size;
>>> -     else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
>>> -             return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
>>> -     count = data_pos - read->rd_offset;
>>> -
>>> -     /* Content type, offset, byte count */
>>> -     p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
>>> -     if (!p)
>>> -             return nfserr_resource;
>>> -
>>> -     *p++ = htonl(NFS4_CONTENT_HOLE);
>>> +     *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
>>>      p = xdr_encode_hyper(p, read->rd_offset);
>>> -     p = xdr_encode_hyper(p, count);
>>> +     *p = cpu_to_be32(read->rd_length);
>>> 
>>> -     *eof = (read->rd_offset + count) >= f_size;
>>> -     *maxcount = min_t(unsigned long, count, *maxcount);
>>>      return nfs_ok;
>>> }
>>> 
>>> @@ -4811,69 +4769,36 @@ static __be32
>>> nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
>>>                     struct nfsd4_read *read)
>>> {
>>> -     unsigned long maxcount, count;
>>> +     struct file *file = read->rd_nf->nf_file;
>>>      struct xdr_stream *xdr = resp->xdr;
>>> -     struct file *file;
>>>      int starting_len = xdr->buf->len;
>>> -     int last_segment = xdr->buf->len;
>>> -     int segments = 0;
>>> -     __be32 *p, tmp;
>>> -     bool is_data;
>>> -     loff_t pos;
>>> -     u32 eof;
>>> +     u32 segments = 0;
>>> +     __be32 *p;
>>> 
>>>      if (nfserr)
>>>              return nfserr;
>>> -     file = read->rd_nf->nf_file;
>>> 
>>>      /* eof flag, segment count */
>>>      p = xdr_reserve_space(xdr, 4 + 4);
>>>      if (!p)
>>> -             return nfserr_resource;
>>> +             return nfserr_io;
>>>      xdr_commit_encode(xdr);
>>> 
>>> -     maxcount = min_t(unsigned long, read->rd_length,
>>> -                      (xdr->buf->buflen - xdr->buf->len));
>>> -     count    = maxcount;
>>> -
>>> -     eof = read->rd_offset >= i_size_read(file_inode(file));
>>> -     if (eof)
>>> +     read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
>>> +     if (read->rd_eof)
>>>              goto out;
>>> 
>>> -     pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
>>> -     is_data = pos > read->rd_offset;
>>> -
>>> -     while (count > 0 && !eof) {
>>> -             maxcount = count;
>>> -             if (is_data)
>>> -                     nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
>>> -                                             segments == 0 ? &pos : NULL);
>>> -             else
>>> -                     nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
>>> -             if (nfserr)
>>> -                     goto out;
>>> -             count -= maxcount;
>>> -             read->rd_offset += maxcount;
>>> -             is_data = !is_data;
>>> -             last_segment = xdr->buf->len;
>>> -             segments++;
>>> -     }
>>> -
>>> -out:
>>> -     if (nfserr && segments == 0)
>>> +     nfserr = nfsd4_encode_read_plus_data(resp, read);
>>> +     if (nfserr) {
>>>              xdr_truncate_encode(xdr, starting_len);
>>> -     else {
>>> -             if (nfserr) {
>>> -                     xdr_truncate_encode(xdr, last_segment);
>>> -                     nfserr = nfs_ok;
>>> -                     eof = 0;
>>> -             }
>>> -             tmp = htonl(eof);
>>> -             write_bytes_to_xdr_buf(xdr->buf, starting_len,     &tmp, 4);
>>> -             tmp = htonl(segments);
>>> -             write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
>>> +             return nfserr;
>>>      }
>>> 
>>> +     segments++;
>>> +
>>> +out:
>>> +     p = xdr_encode_bool(p, read->rd_eof);
>>> +     *p = cpu_to_be32(segments);
>>>      return nfserr;
>>> }
>>> 
>>> --
>>> 2.37.2
>>> 
>> 
>> --
>> Chuck Lever

--
Chuck Lever
Trond Myklebust Sept. 7, 2022, 10:33 p.m. UTC | #4
On Wed, 2022-09-07 at 20:51 +0000, Chuck Lever III wrote:
> 
> 
> > On Sep 7, 2022, at 4:37 PM, Anna Schumaker <anna@kernel.org> wrote:
> > 
> > On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III
> > <chuck.lever@oracle.com> wrote:
> > > 
> > > Be sure to Cc: Jeff on these. Thanks!
> > > 
> > > 
> > > > On Sep 7, 2022, at 3:52 PM, Anna Schumaker <anna@kernel.org>
> > > > wrote:
> > > > 
> > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > > 
> > > > Chuck had suggested reverting READ_PLUS so it returns a single
> > > > DATA
> > > > segment covering the requested read range. This prepares the
> > > > server for
> > > > a future "sparse read" function so support can easily be added
> > > > without
> > > > needing to rip out the old READ_PLUS code at the same time.
> > > > 
> > > > Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > > ---
> > > > fs/nfsd/nfs4xdr.c | 139 +++++++++++----------------------------
> > > > -------
> > > > 1 file changed, 32 insertions(+), 107 deletions(-)
> > > > 
> > > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > > index 1e9690a061ec..bcc8c385faf2 100644
> > > > --- a/fs/nfsd/nfs4xdr.c
> > > > +++ b/fs/nfsd/nfs4xdr.c
> > > > @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct
> > > > nfsd4_compoundres *resp, __be32 nfserr,
> > > > 
> > > > static __be32
> > > > nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
> > > > -                         struct nfsd4_read *read,
> > > > -                         unsigned long *maxcount, u32 *eof,
> > > > -                         loff_t *pos)
> > > > +                         struct nfsd4_read *read)
> > > > {
> > > > -     struct xdr_stream *xdr = resp->xdr;
> > > > +     bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp-
> > > > >rq_flags);
> > > >      struct file *file = read->rd_nf->nf_file;
> > > > -     int starting_len = xdr->buf->len;
> > > > -     loff_t hole_pos;
> > > > -     __be32 nfserr;
> > > > -     __be32 *p, tmp;
> > > > -     __be64 tmp64;
> > > > -
> > > > -     hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset,
> > > > SEEK_HOLE);
> > > > -     if (hole_pos > read->rd_offset)
> > > > -             *maxcount = min_t(unsigned long, *maxcount,
> > > > hole_pos - read->rd_offset);
> > > > -     *maxcount = min_t(unsigned long, *maxcount, (xdr->buf-
> > > > >buflen - xdr->buf->len));
> > > > +     struct xdr_stream *xdr = resp->xdr;
> > > > +     unsigned long maxcount;
> > > > +     __be32 nfserr, *p;
> > > > 
> > > >      /* Content type, offset, byte count */
> > > >      p = xdr_reserve_space(xdr, 4 + 8 + 4);
> > > >      if (!p)
> > > > -             return nfserr_resource;
> > > > +             return nfserr_io;
> > > 
> > > Wouldn't nfserr_rep_too_big be a more appropriate status for
> > > running
> > > off the end of the send buffer? I'm not 100% sure, but I would
> > > expect
> > > that exhausting send buffer space would imply the reply has grown
> > > too
> > > large.
> > 
> > I can switch it to that, no problem.
> 
> I would like to hear opinions from protocol experts before we go
> with that choice.

I'd agree that NFS4ERR_REP_TOO_BIG is correct if you're not able to
even return a short read. However if you can return a short read, then
that's better than returning an error.

It looks to me as if this function can bit hit in both cases, so
perhaps some care is in order.

> 
> > > > +     if (resp->xdr->buf->page_len && splice_ok) {
> > > > +             WARN_ON_ONCE(splice_ok);
> > > > +             return nfserr_io;
> > > > +     }
> > > 
> > > I wish I understood why this test was needed. It seems to have
> > > been
> > > copied and pasted from historic code into nfsd4_encode_read(),
> > > and
> > > there have been recent mechanical changes to it, but there's no
> > > comment explaining it there...
> > 
> > Yeah, I saw this was in the read code and assumed it was an
> > important
> > check so I added it here too.
> > > 
> > > In any event, this seems to be checking for a server software
> > > bug,
> > > so maybe this should return nfserr_serverfault. Oddly that status
> > > code isn't defined yet.
> > 
> > Do you want me to add that code and return it in this patch?
> 
> Sure. Make that a predecessor patch and fix up the code in
> nfsd4_encode_read() before using it for READ_PLUS in this patch.
> 
> Suggested patch description:
> 
> RESOURCE is the wrong status code here because
> 
> a) This encoder is shared between NFSv4.0 and NFSv4.1+, and
>    NFSv4.1+ doesn't support RESOURCE, and

Is it? I thought it was READ_PLUS only. That's NFSv4.2 or higher.

If the encoder is to be shared with both READ and READ_PLUS, then
perhaps it might be wiser to choose a name other than
nfsd4_encode_read_plus_data()?

> b) That status code might cause the client to retry, in which
>    case it will get the same failure because this check seems
>    to be looking for a server-side coding mistake.
> 
> 
> > > Do you have some performance results for v2?
> > 
> > Not yet, I have it running now so hopefully I'll have something
> > ready
> > by tomorrow morning.
> 
> Great, thanks!
> 
> 
> > Anna
> > > 
> > > 
> > > > -     read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp-
> > > > >rq_vec, *maxcount);
> > > > -     if (read->rd_vlen < 0)
> > > > -             return nfserr_resource;
> > > > +     maxcount = min_t(unsigned long, read->rd_length,
> > > > +                      (xdr->buf->buflen - xdr->buf->len));
> > > > 
> > > > -     nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file,
> > > > read->rd_offset,
> > > > -                         resp->rqstp->rq_vec, read->rd_vlen,
> > > > maxcount, eof);
> > > > +     if (file->f_op->splice_read && splice_ok)
> > > > +             nfserr = nfsd4_encode_splice_read(resp, read,
> > > > file, maxcount);
> > > > +     else
> > > > +             nfserr = nfsd4_encode_readv(resp, read, file,
> > > > maxcount);
> > > >      if (nfserr)
> > > >              return nfserr;
> > > > -     xdr_truncate_encode(xdr, starting_len + 16 +
> > > > xdr_align_size(*maxcount));
> > > > 
> > > > -     tmp = htonl(NFS4_CONTENT_DATA);
> > > > -     write_bytes_to_xdr_buf(xdr->buf, starting_len,     
> > > > &tmp,   4);
> > > > -     tmp64 = cpu_to_be64(read->rd_offset);
> > > > -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, 
> > > > &tmp64, 8);
> > > > -     tmp = htonl(*maxcount);
> > > > -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 12,
> > > > &tmp,   4);
> > > > -
> > > > -     tmp = xdr_zero;
> > > > -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 +
> > > > *maxcount, &tmp,
> > > > -                            xdr_pad_size(*maxcount));
> > > > -     return nfs_ok;
> > > > -}
> > > > -
> > > > -static __be32
> > > > -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
> > > > -                         struct nfsd4_read *read,
> > > > -                         unsigned long *maxcount, u32 *eof)
> > > > -{
> > > > -     struct file *file = read->rd_nf->nf_file;
> > > > -     loff_t data_pos = vfs_llseek(file, read->rd_offset,
> > > > SEEK_DATA);
> > > > -     loff_t f_size = i_size_read(file_inode(file));
> > > > -     unsigned long count;
> > > > -     __be32 *p;
> > > > -
> > > > -     if (data_pos == -ENXIO)
> > > > -             data_pos = f_size;
> > > > -     else if (data_pos <= read->rd_offset || (data_pos <
> > > > f_size && data_pos % PAGE_SIZE))
> > > > -             return nfsd4_encode_read_plus_data(resp, read,
> > > > maxcount, eof, &f_size);
> > > > -     count = data_pos - read->rd_offset;
> > > > -
> > > > -     /* Content type, offset, byte count */
> > > > -     p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
> > > > -     if (!p)
> > > > -             return nfserr_resource;
> > > > -
> > > > -     *p++ = htonl(NFS4_CONTENT_HOLE);
> > > > +     *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
> > > >      p = xdr_encode_hyper(p, read->rd_offset);
> > > > -     p = xdr_encode_hyper(p, count);
> > > > +     *p = cpu_to_be32(read->rd_length);
> > > > 
> > > > -     *eof = (read->rd_offset + count) >= f_size;
> > > > -     *maxcount = min_t(unsigned long, count, *maxcount);
> > > >      return nfs_ok;
> > > > }
> > > > 
> > > > @@ -4811,69 +4769,36 @@ static __be32
> > > > nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32
> > > > nfserr,
> > > >                     struct nfsd4_read *read)
> > > > {
> > > > -     unsigned long maxcount, count;
> > > > +     struct file *file = read->rd_nf->nf_file;
> > > >      struct xdr_stream *xdr = resp->xdr;
> > > > -     struct file *file;
> > > >      int starting_len = xdr->buf->len;
> > > > -     int last_segment = xdr->buf->len;
> > > > -     int segments = 0;
> > > > -     __be32 *p, tmp;
> > > > -     bool is_data;
> > > > -     loff_t pos;
> > > > -     u32 eof;
> > > > +     u32 segments = 0;
> > > > +     __be32 *p;
> > > > 
> > > >      if (nfserr)
> > > >              return nfserr;
> > > > -     file = read->rd_nf->nf_file;
> > > > 
> > > >      /* eof flag, segment count */
> > > >      p = xdr_reserve_space(xdr, 4 + 4);
> > > >      if (!p)
> > > > -             return nfserr_resource;
> > > > +             return nfserr_io;
> > > >      xdr_commit_encode(xdr);
> > > > 
> > > > -     maxcount = min_t(unsigned long, read->rd_length,
> > > > -                      (xdr->buf->buflen - xdr->buf->len));
> > > > -     count    = maxcount;
> > > > -
> > > > -     eof = read->rd_offset >= i_size_read(file_inode(file));
> > > > -     if (eof)
> > > > +     read->rd_eof = read->rd_offset >=
> > > > i_size_read(file_inode(file));
> > > > +     if (read->rd_eof)
> > > >              goto out;
> > > > 
> > > > -     pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> > > > -     is_data = pos > read->rd_offset;
> > > > -
> > > > -     while (count > 0 && !eof) {
> > > > -             maxcount = count;
> > > > -             if (is_data)
> > > > -                     nfserr =
> > > > nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
> > > > -                                             segments == 0 ?
> > > > &pos : NULL);
> > > > -             else
> > > > -                     nfserr =
> > > > nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
> > > > -             if (nfserr)
> > > > -                     goto out;
> > > > -             count -= maxcount;
> > > > -             read->rd_offset += maxcount;
> > > > -             is_data = !is_data;
> > > > -             last_segment = xdr->buf->len;
> > > > -             segments++;
> > > > -     }
> > > > -
> > > > -out:
> > > > -     if (nfserr && segments == 0)
> > > > +     nfserr = nfsd4_encode_read_plus_data(resp, read);
> > > > +     if (nfserr) {
> > > >              xdr_truncate_encode(xdr, starting_len);
> > > > -     else {
> > > > -             if (nfserr) {
> > > > -                     xdr_truncate_encode(xdr, last_segment);
> > > > -                     nfserr = nfs_ok;
> > > > -                     eof = 0;
> > > > -             }
> > > > -             tmp = htonl(eof);
> > > > -             write_bytes_to_xdr_buf(xdr->buf,
> > > > starting_len,     &tmp, 4);
> > > > -             tmp = htonl(segments);
> > > > -             write_bytes_to_xdr_buf(xdr->buf, starting_len +
> > > > 4, &tmp, 4);
> > > > +             return nfserr;
> > > >      }
> > > > 
> > > > +     segments++;
> > > > +
> > > > +out:
> > > > +     p = xdr_encode_bool(p, read->rd_eof);
> > > > +     *p = cpu_to_be32(segments);
> > > >      return nfserr;
> > > > }
> > > > 
> > > > --
> > > > 2.37.2
> > > > 
> > > 
> > > --
> > > Chuck Lever
> 
> --
> Chuck Lever
> 
> 
>
Chuck Lever III Sept. 8, 2022, 3:36 p.m. UTC | #5
> On Sep 7, 2022, at 6:33 PM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
> On Wed, 2022-09-07 at 20:51 +0000, Chuck Lever III wrote:
>> 
>> 
>>> On Sep 7, 2022, at 4:37 PM, Anna Schumaker <anna@kernel.org> wrote:
>>> 
>>> On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III
>>> <chuck.lever@oracle.com> wrote:
>>>> 
>>>> Be sure to Cc: Jeff on these. Thanks!
>>>> 
>>>> 
>>>>> On Sep 7, 2022, at 3:52 PM, Anna Schumaker <anna@kernel.org>
>>>>> wrote:
>>>>> 
>>>>> From: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>>>> 
>>>>> Chuck had suggested reverting READ_PLUS so it returns a single
>>>>> DATA
>>>>> segment covering the requested read range. This prepares the
>>>>> server for
>>>>> a future "sparse read" function so support can easily be added
>>>>> without
>>>>> needing to rip out the old READ_PLUS code at the same time.
>>>>> 
>>>>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>>>> ---
>>>>> fs/nfsd/nfs4xdr.c | 139 +++++++++++----------------------------
>>>>> -------
>>>>> 1 file changed, 32 insertions(+), 107 deletions(-)
>>>>> 
>>>>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>>>>> index 1e9690a061ec..bcc8c385faf2 100644
>>>>> --- a/fs/nfsd/nfs4xdr.c
>>>>> +++ b/fs/nfsd/nfs4xdr.c
>>>>> @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct
>>>>> nfsd4_compoundres *resp, __be32 nfserr,
>>>>> 
>>>>> static __be32
>>>>> nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
>>>>> -                         struct nfsd4_read *read,
>>>>> -                         unsigned long *maxcount, u32 *eof,
>>>>> -                         loff_t *pos)
>>>>> +                         struct nfsd4_read *read)
>>>>> {
>>>>> -     struct xdr_stream *xdr = resp->xdr;
>>>>> +     bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp-
>>>>>> rq_flags);
>>>>>      struct file *file = read->rd_nf->nf_file;
>>>>> -     int starting_len = xdr->buf->len;
>>>>> -     loff_t hole_pos;
>>>>> -     __be32 nfserr;
>>>>> -     __be32 *p, tmp;
>>>>> -     __be64 tmp64;
>>>>> -
>>>>> -     hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset,
>>>>> SEEK_HOLE);
>>>>> -     if (hole_pos > read->rd_offset)
>>>>> -             *maxcount = min_t(unsigned long, *maxcount,
>>>>> hole_pos - read->rd_offset);
>>>>> -     *maxcount = min_t(unsigned long, *maxcount, (xdr->buf-
>>>>>> buflen - xdr->buf->len));
>>>>> +     struct xdr_stream *xdr = resp->xdr;
>>>>> +     unsigned long maxcount;
>>>>> +     __be32 nfserr, *p;
>>>>> 
>>>>>      /* Content type, offset, byte count */
>>>>>      p = xdr_reserve_space(xdr, 4 + 8 + 4);
>>>>>      if (!p)
>>>>> -             return nfserr_resource;
>>>>> +             return nfserr_io;
>>>> 
>>>> Wouldn't nfserr_rep_too_big be a more appropriate status for
>>>> running
>>>> off the end of the send buffer? I'm not 100% sure, but I would
>>>> expect
>>>> that exhausting send buffer space would imply the reply has grown
>>>> too
>>>> large.
>>> 
>>> I can switch it to that, no problem.
>> 
>> I would like to hear opinions from protocol experts before we go
>> with that choice.
> 
> I'd agree that NFS4ERR_REP_TOO_BIG is correct if you're not able to
> even return a short read. However if you can return a short read, then
> that's better than returning an error.

Many thanks for reviewing!

In general, I might want to convert all NFSv4 encoders to return
either RESOURCE or REP_TOO_BIG when xdr_reserve_space() fails. I
can post a patch or two that makes that attempt so the special
cases can be sorted on the mailing list.


> It looks to me as if this function can bit hit in both cases, so
> perhaps some care is in order.

Intriguing idea.

For READ, if xdr_reserve_space() fails, that's all she wrote.
I think all these calls happen before the data payload is encoded.

For READ_PLUS, if xdr_reserve_space() fails, it might be possible
to use xdr_truncate_encode() or simply start over with a limit on
the number of segments to be encoded. We're not really there yet,
as currently we want to return just a single segment at this
point.

I feel the question is whether it's practical or a frequent
enough occurrence to bother with the special cases. Encoding a
READ_PLUS response is already challenging.


>>>>> +     if (resp->xdr->buf->page_len && splice_ok) {
>>>>> +             WARN_ON_ONCE(splice_ok);
>>>>> +             return nfserr_io;
>>>>> +     }
>>>> 
>>>> I wish I understood why this test was needed. It seems to have
>>>> been
>>>> copied and pasted from historic code into nfsd4_encode_read(),
>>>> and
>>>> there have been recent mechanical changes to it, but there's no
>>>> comment explaining it there...
>>> 
>>> Yeah, I saw this was in the read code and assumed it was an
>>> important
>>> check so I added it here too.
>>>> 
>>>> In any event, this seems to be checking for a server software
>>>> bug,
>>>> so maybe this should return nfserr_serverfault. Oddly that status
>>>> code isn't defined yet.
>>> 
>>> Do you want me to add that code and return it in this patch?
>> 
>> Sure. Make that a predecessor patch and fix up the code in
>> nfsd4_encode_read() before using it for READ_PLUS in this patch.
>> 
>> Suggested patch description:
>> 
>> RESOURCE is the wrong status code here because
>> 
>> a) This encoder is shared between NFSv4.0 and NFSv4.1+, and
>>    NFSv4.1+ doesn't support RESOURCE, and
> 
> Is it? I thought it was READ_PLUS only. That's NFSv4.2 or higher.

For the initial patch that adds nfserr_serverfault, I was speaking
only about nfsd4_encode_read(), which is shared amongst all minor
versions. That's where the page_len && splice_ok test originally
came from. Sorry that wasn't clear.


> If the encoder is to be shared with both READ and READ_PLUS, then
> perhaps it might be wiser to choose a name other than
> nfsd4_encode_read_plus_data()?

Although the encoded streams are very similar, there are still
separate encoders for a READ_PLUS data segment and a full READ
response. The common pieces for both of those operations are
nfsd4_encode_readv() and nfsd4_encode_splice_read().

IIUC nfsd4_encode_read_plus_data() has to deal with encoding the
the NFS4_CONTENT_DATA enumerator -- it handles a single
READ_PLUS segment. nfsd4_encode_read() encodes one complete READ
response. I'm comfortable with the current function names.


>> b) That status code might cause the client to retry, in which
>>    case it will get the same failure because this check seems
>>    to be looking for a server-side coding mistake.
>> 
>> 
>>>> Do you have some performance results for v2?
>>> 
>>> Not yet, I have it running now so hopefully I'll have something
>>> ready
>>> by tomorrow morning.
>> 
>> Great, thanks!
>> 
>> 
>>> Anna
>>>> 
>>>> 
>>>>> -     read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp-
>>>>>> rq_vec, *maxcount);
>>>>> -     if (read->rd_vlen < 0)
>>>>> -             return nfserr_resource;
>>>>> +     maxcount = min_t(unsigned long, read->rd_length,
>>>>> +                      (xdr->buf->buflen - xdr->buf->len));
>>>>> 
>>>>> -     nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file,
>>>>> read->rd_offset,
>>>>> -                         resp->rqstp->rq_vec, read->rd_vlen,
>>>>> maxcount, eof);
>>>>> +     if (file->f_op->splice_read && splice_ok)
>>>>> +             nfserr = nfsd4_encode_splice_read(resp, read,
>>>>> file, maxcount);
>>>>> +     else
>>>>> +             nfserr = nfsd4_encode_readv(resp, read, file,
>>>>> maxcount);
>>>>>      if (nfserr)
>>>>>              return nfserr;
>>>>> -     xdr_truncate_encode(xdr, starting_len + 16 +
>>>>> xdr_align_size(*maxcount));
>>>>> 
>>>>> -     tmp = htonl(NFS4_CONTENT_DATA);
>>>>> -     write_bytes_to_xdr_buf(xdr->buf, starting_len,     
>>>>> &tmp,   4);
>>>>> -     tmp64 = cpu_to_be64(read->rd_offset);
>>>>> -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, 
>>>>> &tmp64, 8);
>>>>> -     tmp = htonl(*maxcount);
>>>>> -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 12,
>>>>> &tmp,   4);
>>>>> -
>>>>> -     tmp = xdr_zero;
>>>>> -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 +
>>>>> *maxcount, &tmp,
>>>>> -                            xdr_pad_size(*maxcount));
>>>>> -     return nfs_ok;
>>>>> -}
>>>>> -
>>>>> -static __be32
>>>>> -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
>>>>> -                         struct nfsd4_read *read,
>>>>> -                         unsigned long *maxcount, u32 *eof)
>>>>> -{
>>>>> -     struct file *file = read->rd_nf->nf_file;
>>>>> -     loff_t data_pos = vfs_llseek(file, read->rd_offset,
>>>>> SEEK_DATA);
>>>>> -     loff_t f_size = i_size_read(file_inode(file));
>>>>> -     unsigned long count;
>>>>> -     __be32 *p;
>>>>> -
>>>>> -     if (data_pos == -ENXIO)
>>>>> -             data_pos = f_size;
>>>>> -     else if (data_pos <= read->rd_offset || (data_pos <
>>>>> f_size && data_pos % PAGE_SIZE))
>>>>> -             return nfsd4_encode_read_plus_data(resp, read,
>>>>> maxcount, eof, &f_size);
>>>>> -     count = data_pos - read->rd_offset;
>>>>> -
>>>>> -     /* Content type, offset, byte count */
>>>>> -     p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
>>>>> -     if (!p)
>>>>> -             return nfserr_resource;
>>>>> -
>>>>> -     *p++ = htonl(NFS4_CONTENT_HOLE);
>>>>> +     *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
>>>>>      p = xdr_encode_hyper(p, read->rd_offset);
>>>>> -     p = xdr_encode_hyper(p, count);
>>>>> +     *p = cpu_to_be32(read->rd_length);
>>>>> 
>>>>> -     *eof = (read->rd_offset + count) >= f_size;
>>>>> -     *maxcount = min_t(unsigned long, count, *maxcount);
>>>>>      return nfs_ok;
>>>>> }
>>>>> 
>>>>> @@ -4811,69 +4769,36 @@ static __be32
>>>>> nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32
>>>>> nfserr,
>>>>>                     struct nfsd4_read *read)
>>>>> {
>>>>> -     unsigned long maxcount, count;
>>>>> +     struct file *file = read->rd_nf->nf_file;
>>>>>      struct xdr_stream *xdr = resp->xdr;
>>>>> -     struct file *file;
>>>>>      int starting_len = xdr->buf->len;
>>>>> -     int last_segment = xdr->buf->len;
>>>>> -     int segments = 0;
>>>>> -     __be32 *p, tmp;
>>>>> -     bool is_data;
>>>>> -     loff_t pos;
>>>>> -     u32 eof;
>>>>> +     u32 segments = 0;
>>>>> +     __be32 *p;
>>>>> 
>>>>>      if (nfserr)
>>>>>              return nfserr;
>>>>> -     file = read->rd_nf->nf_file;
>>>>> 
>>>>>      /* eof flag, segment count */
>>>>>      p = xdr_reserve_space(xdr, 4 + 4);
>>>>>      if (!p)
>>>>> -             return nfserr_resource;
>>>>> +             return nfserr_io;
>>>>>      xdr_commit_encode(xdr);
>>>>> 
>>>>> -     maxcount = min_t(unsigned long, read->rd_length,
>>>>> -                      (xdr->buf->buflen - xdr->buf->len));
>>>>> -     count    = maxcount;
>>>>> -
>>>>> -     eof = read->rd_offset >= i_size_read(file_inode(file));
>>>>> -     if (eof)
>>>>> +     read->rd_eof = read->rd_offset >=
>>>>> i_size_read(file_inode(file));
>>>>> +     if (read->rd_eof)
>>>>>              goto out;
>>>>> 
>>>>> -     pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
>>>>> -     is_data = pos > read->rd_offset;
>>>>> -
>>>>> -     while (count > 0 && !eof) {
>>>>> -             maxcount = count;
>>>>> -             if (is_data)
>>>>> -                     nfserr =
>>>>> nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
>>>>> -                                             segments == 0 ?
>>>>> &pos : NULL);
>>>>> -             else
>>>>> -                     nfserr =
>>>>> nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
>>>>> -             if (nfserr)
>>>>> -                     goto out;
>>>>> -             count -= maxcount;
>>>>> -             read->rd_offset += maxcount;
>>>>> -             is_data = !is_data;
>>>>> -             last_segment = xdr->buf->len;
>>>>> -             segments++;
>>>>> -     }
>>>>> -
>>>>> -out:
>>>>> -     if (nfserr && segments == 0)
>>>>> +     nfserr = nfsd4_encode_read_plus_data(resp, read);
>>>>> +     if (nfserr) {
>>>>>              xdr_truncate_encode(xdr, starting_len);
>>>>> -     else {
>>>>> -             if (nfserr) {
>>>>> -                     xdr_truncate_encode(xdr, last_segment);
>>>>> -                     nfserr = nfs_ok;
>>>>> -                     eof = 0;
>>>>> -             }
>>>>> -             tmp = htonl(eof);
>>>>> -             write_bytes_to_xdr_buf(xdr->buf,
>>>>> starting_len,     &tmp, 4);
>>>>> -             tmp = htonl(segments);
>>>>> -             write_bytes_to_xdr_buf(xdr->buf, starting_len +
>>>>> 4, &tmp, 4);
>>>>> +             return nfserr;
>>>>>      }
>>>>> 
>>>>> +     segments++;
>>>>> +
>>>>> +out:
>>>>> +     p = xdr_encode_bool(p, read->rd_eof);
>>>>> +     *p = cpu_to_be32(segments);
>>>>>      return nfserr;
>>>>> }
>>>>> 
>>>>> --
>>>>> 2.37.2
>>>>> 
>>>> 
>>>> --
>>>> Chuck Lever
>> 
>> --
>> Chuck Lever
>> 
>> 
>> 
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com

--
Chuck Lever
Trond Myklebust Sept. 8, 2022, 4:25 p.m. UTC | #6
On Thu, 2022-09-08 at 15:36 +0000, Chuck Lever III wrote:
> 
> 
> > On Sep 7, 2022, at 6:33 PM, Trond Myklebust
> > <trondmy@hammerspace.com> wrote:
> > 
> > On Wed, 2022-09-07 at 20:51 +0000, Chuck Lever III wrote:
> > > 
> > > 
> > > > On Sep 7, 2022, at 4:37 PM, Anna Schumaker <anna@kernel.org>
> > > > wrote:
> > > > 
> > > > On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III
> > > > <chuck.lever@oracle.com> wrote:
> > > > > 
> > > > > Be sure to Cc: Jeff on these. Thanks!
> > > > > 
> > > > > 
> > > > > > On Sep 7, 2022, at 3:52 PM, Anna Schumaker
> > > > > > <anna@kernel.org>
> > > > > > wrote:
> > > > > > 
> > > > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > > > > 
> > > > > > Chuck had suggested reverting READ_PLUS so it returns a
> > > > > > single
> > > > > > DATA
> > > > > > segment covering the requested read range. This prepares
> > > > > > the
> > > > > > server for
> > > > > > a future "sparse read" function so support can easily be
> > > > > > added
> > > > > > without
> > > > > > needing to rip out the old READ_PLUS code at the same time.
> > > > > > 
> > > > > > Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > > > > ---
> > > > > > fs/nfsd/nfs4xdr.c | 139 +++++++++++------------------------
> > > > > > ----
> > > > > > -------
> > > > > > 1 file changed, 32 insertions(+), 107 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > > > > index 1e9690a061ec..bcc8c385faf2 100644
> > > > > > --- a/fs/nfsd/nfs4xdr.c
> > > > > > +++ b/fs/nfsd/nfs4xdr.c
> > > > > > @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct
> > > > > > nfsd4_compoundres *resp, __be32 nfserr,
> > > > > > 
> > > > > > static __be32
> > > > > > nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
> > > > > > -                         struct nfsd4_read *read,
> > > > > > -                         unsigned long *maxcount, u32
> > > > > > *eof,
> > > > > > -                         loff_t *pos)
> > > > > > +                         struct nfsd4_read *read)
> > > > > > {
> > > > > > -     struct xdr_stream *xdr = resp->xdr;
> > > > > > +     bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp-
> > > > > > > rq_flags);
> > > > > >      struct file *file = read->rd_nf->nf_file;
> > > > > > -     int starting_len = xdr->buf->len;
> > > > > > -     loff_t hole_pos;
> > > > > > -     __be32 nfserr;
> > > > > > -     __be32 *p, tmp;
> > > > > > -     __be64 tmp64;
> > > > > > -
> > > > > > -     hole_pos = pos ? *pos : vfs_llseek(file, read-
> > > > > > >rd_offset,
> > > > > > SEEK_HOLE);
> > > > > > -     if (hole_pos > read->rd_offset)
> > > > > > -             *maxcount = min_t(unsigned long, *maxcount,
> > > > > > hole_pos - read->rd_offset);
> > > > > > -     *maxcount = min_t(unsigned long, *maxcount, (xdr-
> > > > > > >buf-
> > > > > > > buflen - xdr->buf->len));
> > > > > > +     struct xdr_stream *xdr = resp->xdr;
> > > > > > +     unsigned long maxcount;
> > > > > > +     __be32 nfserr, *p;
> > > > > > 
> > > > > >      /* Content type, offset, byte count */
> > > > > >      p = xdr_reserve_space(xdr, 4 + 8 + 4);
> > > > > >      if (!p)
> > > > > > -             return nfserr_resource;
> > > > > > +             return nfserr_io;
> > > > > 
> > > > > Wouldn't nfserr_rep_too_big be a more appropriate status for
> > > > > running
> > > > > off the end of the send buffer? I'm not 100% sure, but I
> > > > > would
> > > > > expect
> > > > > that exhausting send buffer space would imply the reply has
> > > > > grown
> > > > > too
> > > > > large.
> > > > 
> > > > I can switch it to that, no problem.
> > > 
> > > I would like to hear opinions from protocol experts before we go
> > > with that choice.
> > 
> > I'd agree that NFS4ERR_REP_TOO_BIG is correct if you're not able to
> > even return a short read. However if you can return a short read,
> > then
> > that's better than returning an error.
> 
> Many thanks for reviewing!
> 
> In general, I might want to convert all NFSv4 encoders to return
> either RESOURCE or REP_TOO_BIG when xdr_reserve_space() fails. I
> can post a patch or two that makes that attempt so the special
> cases can be sorted on the mailing list.
> 
> 
> > It looks to me as if this function can bit hit in both cases, so
> > perhaps some care is in order.
> 
> Intriguing idea.
> 
> For READ, if xdr_reserve_space() fails, that's all she wrote.
> I think all these calls happen before the data payload is encoded.
> 
> For READ_PLUS, if xdr_reserve_space() fails, it might be possible
> to use xdr_truncate_encode() or simply start over with a limit on
> the number of segments to be encoded. We're not really there yet,
> as currently we want to return just a single segment at this
> point.
> 
> I feel the question is whether it's practical or a frequent
> enough occurrence to bother with the special cases. Encoding a
> READ_PLUS response is already challenging.

What I'm saying is that you are more likely to hit this issue when you
have a reply with something like "<data><hole><data>"...

If the second "<data>" chunk hits the above xdr_reserve_space() limit
(i.e. the first call to nfsd4_encode_read_plus_data() succeeds as does
the call to nfsd4_encode_read_plus_hole()), then you just want to
return a short read of the form "<data><hole>" instead of returning an
error on the whole READ_PLUS operation.
Chuck Lever III Sept. 8, 2022, 4:39 p.m. UTC | #7
> On Sep 7, 2022, at 4:37 PM, Anna Schumaker <anna@kernel.org> wrote:
> 
> On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III <chuck.lever@oracle.com> wrote:
>> 
>> Be sure to Cc: Jeff on these. Thanks!
>> 
>> 
>>> On Sep 7, 2022, at 3:52 PM, Anna Schumaker <anna@kernel.org> wrote:
>>> 
>>> From: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>> 
>>> Chuck had suggested reverting READ_PLUS so it returns a single DATA
>>> segment covering the requested read range. This prepares the server for
>>> a future "sparse read" function so support can easily be added without
>>> needing to rip out the old READ_PLUS code at the same time.
>>> 
>>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>> ---
>>> fs/nfsd/nfs4xdr.c | 139 +++++++++++-----------------------------------
>>> 1 file changed, 32 insertions(+), 107 deletions(-)
>>> 
>>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>>> index 1e9690a061ec..bcc8c385faf2 100644
>>> --- a/fs/nfsd/nfs4xdr.c
>>> +++ b/fs/nfsd/nfs4xdr.c
>>> @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
>>> 
>>> static __be32
>>> nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
>>> -                         struct nfsd4_read *read,
>>> -                         unsigned long *maxcount, u32 *eof,
>>> -                         loff_t *pos)
>>> +                         struct nfsd4_read *read)
>>> {
>>> -     struct xdr_stream *xdr = resp->xdr;
>>> +     bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
>>>      struct file *file = read->rd_nf->nf_file;
>>> -     int starting_len = xdr->buf->len;
>>> -     loff_t hole_pos;
>>> -     __be32 nfserr;
>>> -     __be32 *p, tmp;
>>> -     __be64 tmp64;
>>> -
>>> -     hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
>>> -     if (hole_pos > read->rd_offset)
>>> -             *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
>>> -     *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
>>> +     struct xdr_stream *xdr = resp->xdr;
>>> +     unsigned long maxcount;
>>> +     __be32 nfserr, *p;
>>> 
>>>      /* Content type, offset, byte count */
>>>      p = xdr_reserve_space(xdr, 4 + 8 + 4);
>>>      if (!p)
>>> -             return nfserr_resource;
>>> +             return nfserr_io;
>> 
>> Wouldn't nfserr_rep_too_big be a more appropriate status for running
>> off the end of the send buffer? I'm not 100% sure, but I would expect
>> that exhausting send buffer space would imply the reply has grown too
>> large.
> 
> I can switch it to that, no problem.

OK, never mind. I see that nfsd4_encode_compound() handles the status
code conversion for every encoder, and deals with reply caching too:

5349         if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) {
5350                 struct nfsd4_slot *slot = resp->cstate.slot;
5351 
5352                 if (slot->sl_flags & NFSD4_SLOT_CACHETHIS)
5353                         op->status = nfserr_rep_too_big_to_cache;
5354                 else
5355                         op->status = nfserr_rep_too_big;
5356         }

So returning nfserr_resource from the READ_PLUS encoder when
xdr_reserve_space() fails is copacetic and preferred.

Then, once READ_PLUS can return multiple segments again, it should
deal with send buffer space exhaustion by truncating the reply at
the last properly encoded segment, as Trond suggested.


>>> +     if (resp->xdr->buf->page_len && splice_ok) {
>>> +             WARN_ON_ONCE(splice_ok);
>>> +             return nfserr_io;
>>> +     }
>> 
>> I wish I understood why this test was needed. It seems to have been
>> copied and pasted from historic code into nfsd4_encode_read(), and
>> there have been recent mechanical changes to it, but there's no
>> comment explaining it there...
> 
> Yeah, I saw this was in the read code and assumed it was an important
> check so I added it here too.
>> 
>> In any event, this seems to be checking for a server software bug,
>> so maybe this should return nfserr_serverfault. Oddly that status
>> code isn't defined yet.
> 
> Do you want me to add that code and return it in this patch?

I would still like to return serverfault if the splice check
fails.


>> Do you have some performance results for v2?
> 
> Not yet, I have it running now so hopefully I'll have something ready
> by tomorrow morning.
> 
> Anna
>> 
>> 
>>> -     read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
>>> -     if (read->rd_vlen < 0)
>>> -             return nfserr_resource;
>>> +     maxcount = min_t(unsigned long, read->rd_length,
>>> +                      (xdr->buf->buflen - xdr->buf->len));
>>> 
>>> -     nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
>>> -                         resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
>>> +     if (file->f_op->splice_read && splice_ok)
>>> +             nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
>>> +     else
>>> +             nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
>>>      if (nfserr)
>>>              return nfserr;
>>> -     xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
>>> 
>>> -     tmp = htonl(NFS4_CONTENT_DATA);
>>> -     write_bytes_to_xdr_buf(xdr->buf, starting_len,      &tmp,   4);
>>> -     tmp64 = cpu_to_be64(read->rd_offset);
>>> -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 4,  &tmp64, 8);
>>> -     tmp = htonl(*maxcount);
>>> -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp,   4);
>>> -
>>> -     tmp = xdr_zero;
>>> -     write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
>>> -                            xdr_pad_size(*maxcount));
>>> -     return nfs_ok;
>>> -}
>>> -
>>> -static __be32
>>> -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
>>> -                         struct nfsd4_read *read,
>>> -                         unsigned long *maxcount, u32 *eof)
>>> -{
>>> -     struct file *file = read->rd_nf->nf_file;
>>> -     loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
>>> -     loff_t f_size = i_size_read(file_inode(file));
>>> -     unsigned long count;
>>> -     __be32 *p;
>>> -
>>> -     if (data_pos == -ENXIO)
>>> -             data_pos = f_size;
>>> -     else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
>>> -             return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
>>> -     count = data_pos - read->rd_offset;
>>> -
>>> -     /* Content type, offset, byte count */
>>> -     p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
>>> -     if (!p)
>>> -             return nfserr_resource;
>>> -
>>> -     *p++ = htonl(NFS4_CONTENT_HOLE);
>>> +     *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
>>>      p = xdr_encode_hyper(p, read->rd_offset);
>>> -     p = xdr_encode_hyper(p, count);
>>> +     *p = cpu_to_be32(read->rd_length);
>>> 
>>> -     *eof = (read->rd_offset + count) >= f_size;
>>> -     *maxcount = min_t(unsigned long, count, *maxcount);
>>>      return nfs_ok;
>>> }
>>> 
>>> @@ -4811,69 +4769,36 @@ static __be32
>>> nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
>>>                     struct nfsd4_read *read)
>>> {
>>> -     unsigned long maxcount, count;
>>> +     struct file *file = read->rd_nf->nf_file;
>>>      struct xdr_stream *xdr = resp->xdr;
>>> -     struct file *file;
>>>      int starting_len = xdr->buf->len;
>>> -     int last_segment = xdr->buf->len;
>>> -     int segments = 0;
>>> -     __be32 *p, tmp;
>>> -     bool is_data;
>>> -     loff_t pos;
>>> -     u32 eof;
>>> +     u32 segments = 0;
>>> +     __be32 *p;
>>> 
>>>      if (nfserr)
>>>              return nfserr;
>>> -     file = read->rd_nf->nf_file;
>>> 
>>>      /* eof flag, segment count */
>>>      p = xdr_reserve_space(xdr, 4 + 4);
>>>      if (!p)
>>> -             return nfserr_resource;
>>> +             return nfserr_io;
>>>      xdr_commit_encode(xdr);
>>> 
>>> -     maxcount = min_t(unsigned long, read->rd_length,
>>> -                      (xdr->buf->buflen - xdr->buf->len));
>>> -     count    = maxcount;
>>> -
>>> -     eof = read->rd_offset >= i_size_read(file_inode(file));
>>> -     if (eof)
>>> +     read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
>>> +     if (read->rd_eof)
>>>              goto out;
>>> 
>>> -     pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
>>> -     is_data = pos > read->rd_offset;
>>> -
>>> -     while (count > 0 && !eof) {
>>> -             maxcount = count;
>>> -             if (is_data)
>>> -                     nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
>>> -                                             segments == 0 ? &pos : NULL);
>>> -             else
>>> -                     nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
>>> -             if (nfserr)
>>> -                     goto out;
>>> -             count -= maxcount;
>>> -             read->rd_offset += maxcount;
>>> -             is_data = !is_data;
>>> -             last_segment = xdr->buf->len;
>>> -             segments++;
>>> -     }
>>> -
>>> -out:
>>> -     if (nfserr && segments == 0)
>>> +     nfserr = nfsd4_encode_read_plus_data(resp, read);
>>> +     if (nfserr) {
>>>              xdr_truncate_encode(xdr, starting_len);
>>> -     else {
>>> -             if (nfserr) {
>>> -                     xdr_truncate_encode(xdr, last_segment);
>>> -                     nfserr = nfs_ok;
>>> -                     eof = 0;
>>> -             }
>>> -             tmp = htonl(eof);
>>> -             write_bytes_to_xdr_buf(xdr->buf, starting_len,     &tmp, 4);
>>> -             tmp = htonl(segments);
>>> -             write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
>>> +             return nfserr;
>>>      }
>>> 
>>> +     segments++;
>>> +
>>> +out:
>>> +     p = xdr_encode_bool(p, read->rd_eof);
>>> +     *p = cpu_to_be32(segments);
>>>      return nfserr;
>>> }
>>> 
>>> --
>>> 2.37.2
>>> 
>> 
>> --
>> Chuck Lever

--
Chuck Lever
diff mbox series

Patch

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1e9690a061ec..bcc8c385faf2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4731,79 +4731,37 @@  nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
 
 static __be32
 nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
-			    struct nfsd4_read *read,
-			    unsigned long *maxcount, u32 *eof,
-			    loff_t *pos)
+			    struct nfsd4_read *read)
 {
-	struct xdr_stream *xdr = resp->xdr;
+	bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
 	struct file *file = read->rd_nf->nf_file;
-	int starting_len = xdr->buf->len;
-	loff_t hole_pos;
-	__be32 nfserr;
-	__be32 *p, tmp;
-	__be64 tmp64;
-
-	hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
-	if (hole_pos > read->rd_offset)
-		*maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
-	*maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
+	struct xdr_stream *xdr = resp->xdr;
+	unsigned long maxcount;
+	__be32 nfserr, *p;
 
 	/* Content type, offset, byte count */
 	p = xdr_reserve_space(xdr, 4 + 8 + 4);
 	if (!p)
-		return nfserr_resource;
+		return nfserr_io;
+	if (resp->xdr->buf->page_len && splice_ok) {
+		WARN_ON_ONCE(splice_ok);
+		return nfserr_io;
+	}
 
-	read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
-	if (read->rd_vlen < 0)
-		return nfserr_resource;
+	maxcount = min_t(unsigned long, read->rd_length,
+			 (xdr->buf->buflen - xdr->buf->len));
 
-	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
-			    resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
+	if (file->f_op->splice_read && splice_ok)
+		nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
+	else
+		nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
 	if (nfserr)
 		return nfserr;
-	xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
 
-	tmp = htonl(NFS4_CONTENT_DATA);
-	write_bytes_to_xdr_buf(xdr->buf, starting_len,      &tmp,   4);
-	tmp64 = cpu_to_be64(read->rd_offset);
-	write_bytes_to_xdr_buf(xdr->buf, starting_len + 4,  &tmp64, 8);
-	tmp = htonl(*maxcount);
-	write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp,   4);
-
-	tmp = xdr_zero;
-	write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
-			       xdr_pad_size(*maxcount));
-	return nfs_ok;
-}
-
-static __be32
-nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
-			    struct nfsd4_read *read,
-			    unsigned long *maxcount, u32 *eof)
-{
-	struct file *file = read->rd_nf->nf_file;
-	loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
-	loff_t f_size = i_size_read(file_inode(file));
-	unsigned long count;
-	__be32 *p;
-
-	if (data_pos == -ENXIO)
-		data_pos = f_size;
-	else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
-		return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
-	count = data_pos - read->rd_offset;
-
-	/* Content type, offset, byte count */
-	p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
-	if (!p)
-		return nfserr_resource;
-
-	*p++ = htonl(NFS4_CONTENT_HOLE);
+	*p++ = cpu_to_be32(NFS4_CONTENT_DATA);
 	p = xdr_encode_hyper(p, read->rd_offset);
-	p = xdr_encode_hyper(p, count);
+	*p = cpu_to_be32(read->rd_length);
 
-	*eof = (read->rd_offset + count) >= f_size;
-	*maxcount = min_t(unsigned long, count, *maxcount);
 	return nfs_ok;
 }
 
@@ -4811,69 +4769,36 @@  static __be32
 nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
 		       struct nfsd4_read *read)
 {
-	unsigned long maxcount, count;
+	struct file *file = read->rd_nf->nf_file;
 	struct xdr_stream *xdr = resp->xdr;
-	struct file *file;
 	int starting_len = xdr->buf->len;
-	int last_segment = xdr->buf->len;
-	int segments = 0;
-	__be32 *p, tmp;
-	bool is_data;
-	loff_t pos;
-	u32 eof;
+	u32 segments = 0;
+	__be32 *p;
 
 	if (nfserr)
 		return nfserr;
-	file = read->rd_nf->nf_file;
 
 	/* eof flag, segment count */
 	p = xdr_reserve_space(xdr, 4 + 4);
 	if (!p)
-		return nfserr_resource;
+		return nfserr_io;
 	xdr_commit_encode(xdr);
 
-	maxcount = min_t(unsigned long, read->rd_length,
-			 (xdr->buf->buflen - xdr->buf->len));
-	count    = maxcount;
-
-	eof = read->rd_offset >= i_size_read(file_inode(file));
-	if (eof)
+	read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
+	if (read->rd_eof)
 		goto out;
 
-	pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
-	is_data = pos > read->rd_offset;
-
-	while (count > 0 && !eof) {
-		maxcount = count;
-		if (is_data)
-			nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
-						segments == 0 ? &pos : NULL);
-		else
-			nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
-		if (nfserr)
-			goto out;
-		count -= maxcount;
-		read->rd_offset += maxcount;
-		is_data = !is_data;
-		last_segment = xdr->buf->len;
-		segments++;
-	}
-
-out:
-	if (nfserr && segments == 0)
+	nfserr = nfsd4_encode_read_plus_data(resp, read);
+	if (nfserr) {
 		xdr_truncate_encode(xdr, starting_len);
-	else {
-		if (nfserr) {
-			xdr_truncate_encode(xdr, last_segment);
-			nfserr = nfs_ok;
-			eof = 0;
-		}
-		tmp = htonl(eof);
-		write_bytes_to_xdr_buf(xdr->buf, starting_len,     &tmp, 4);
-		tmp = htonl(segments);
-		write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
+		return nfserr;
 	}
 
+	segments++;
+
+out:
+	p = xdr_encode_bool(p, read->rd_eof);
+	*p = cpu_to_be32(segments);
 	return nfserr;
 }