Message ID | 20220907195259.926736-2-anna@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | NFSD: Simplify READ_PLUS | expand |
Be sure to Cc: Jeff on these. Thanks! > On Sep 7, 2022, at 3:52 PM, Anna Schumaker <anna@kernel.org> wrote: > > From: Anna Schumaker <Anna.Schumaker@Netapp.com> > > Chuck had suggested reverting READ_PLUS so it returns a single DATA > segment covering the requested read range. This prepares the server for > a future "sparse read" function so support can easily be added without > needing to rip out the old READ_PLUS code at the same time. > > Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> > --- > fs/nfsd/nfs4xdr.c | 139 +++++++++++----------------------------------- > 1 file changed, 32 insertions(+), 107 deletions(-) > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c > index 1e9690a061ec..bcc8c385faf2 100644 > --- a/fs/nfsd/nfs4xdr.c > +++ b/fs/nfsd/nfs4xdr.c > @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, > > static __be32 > nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, > - struct nfsd4_read *read, > - unsigned long *maxcount, u32 *eof, > - loff_t *pos) > + struct nfsd4_read *read) > { > - struct xdr_stream *xdr = resp->xdr; > + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); > struct file *file = read->rd_nf->nf_file; > - int starting_len = xdr->buf->len; > - loff_t hole_pos; > - __be32 nfserr; > - __be32 *p, tmp; > - __be64 tmp64; > - > - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE); > - if (hole_pos > read->rd_offset) > - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset); > - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len)); > + struct xdr_stream *xdr = resp->xdr; > + unsigned long maxcount; > + __be32 nfserr, *p; > > /* Content type, offset, byte count */ > p = xdr_reserve_space(xdr, 4 + 8 + 4); > if (!p) > - return nfserr_resource; > + return nfserr_io; Wouldn't nfserr_rep_too_big be a more appropriate status for running off the end of the send buffer? I'm not 100% sure, but I would expect that exhausting send buffer space would imply the reply has grown too large. > + if (resp->xdr->buf->page_len && splice_ok) { > + WARN_ON_ONCE(splice_ok); > + return nfserr_io; > + } I wish I understood why this test was needed. It seems to have been copied and pasted from historic code into nfsd4_encode_read(), and there have been recent mechanical changes to it, but there's no comment explaining it there... In any event, this seems to be checking for a server software bug, so maybe this should return nfserr_serverfault. Oddly that status code isn't defined yet. Do you have some performance results for v2? > - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount); > - if (read->rd_vlen < 0) > - return nfserr_resource; > + maxcount = min_t(unsigned long, read->rd_length, > + (xdr->buf->buflen - xdr->buf->len)); > > - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, > - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof); > + if (file->f_op->splice_read && splice_ok) > + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); > + else > + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); > if (nfserr) > return nfserr; > - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount)); > > - tmp = htonl(NFS4_CONTENT_DATA); > - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); > - tmp64 = cpu_to_be64(read->rd_offset); > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8); > - tmp = htonl(*maxcount); > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4); > - > - tmp = xdr_zero; > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp, > - xdr_pad_size(*maxcount)); > - return nfs_ok; > -} > - > -static __be32 > -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, > - struct nfsd4_read *read, > - unsigned long *maxcount, u32 *eof) > -{ > - struct file *file = read->rd_nf->nf_file; > - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA); > - loff_t f_size = i_size_read(file_inode(file)); > - unsigned long count; > - __be32 *p; > - > - if (data_pos == -ENXIO) > - data_pos = f_size; > - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE)) > - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size); > - count = data_pos - read->rd_offset; > - > - /* Content type, offset, byte count */ > - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); > - if (!p) > - return nfserr_resource; > - > - *p++ = htonl(NFS4_CONTENT_HOLE); > + *p++ = cpu_to_be32(NFS4_CONTENT_DATA); > p = xdr_encode_hyper(p, read->rd_offset); > - p = xdr_encode_hyper(p, count); > + *p = cpu_to_be32(read->rd_length); > > - *eof = (read->rd_offset + count) >= f_size; > - *maxcount = min_t(unsigned long, count, *maxcount); > return nfs_ok; > } > > @@ -4811,69 +4769,36 @@ static __be32 > nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, > struct nfsd4_read *read) > { > - unsigned long maxcount, count; > + struct file *file = read->rd_nf->nf_file; > struct xdr_stream *xdr = resp->xdr; > - struct file *file; > int starting_len = xdr->buf->len; > - int last_segment = xdr->buf->len; > - int segments = 0; > - __be32 *p, tmp; > - bool is_data; > - loff_t pos; > - u32 eof; > + u32 segments = 0; > + __be32 *p; > > if (nfserr) > return nfserr; > - file = read->rd_nf->nf_file; > > /* eof flag, segment count */ > p = xdr_reserve_space(xdr, 4 + 4); > if (!p) > - return nfserr_resource; > + return nfserr_io; > xdr_commit_encode(xdr); > > - maxcount = min_t(unsigned long, read->rd_length, > - (xdr->buf->buflen - xdr->buf->len)); > - count = maxcount; > - > - eof = read->rd_offset >= i_size_read(file_inode(file)); > - if (eof) > + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file)); > + if (read->rd_eof) > goto out; > > - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); > - is_data = pos > read->rd_offset; > - > - while (count > 0 && !eof) { > - maxcount = count; > - if (is_data) > - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, > - segments == 0 ? &pos : NULL); > - else > - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); > - if (nfserr) > - goto out; > - count -= maxcount; > - read->rd_offset += maxcount; > - is_data = !is_data; > - last_segment = xdr->buf->len; > - segments++; > - } > - > -out: > - if (nfserr && segments == 0) > + nfserr = nfsd4_encode_read_plus_data(resp, read); > + if (nfserr) { > xdr_truncate_encode(xdr, starting_len); > - else { > - if (nfserr) { > - xdr_truncate_encode(xdr, last_segment); > - nfserr = nfs_ok; > - eof = 0; > - } > - tmp = htonl(eof); > - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); > - tmp = htonl(segments); > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); > + return nfserr; > } > > + segments++; > + > +out: > + p = xdr_encode_bool(p, read->rd_eof); > + *p = cpu_to_be32(segments); > return nfserr; > } > > -- > 2.37.2 > -- Chuck Lever
On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III <chuck.lever@oracle.com> wrote: > > Be sure to Cc: Jeff on these. Thanks! > > > > On Sep 7, 2022, at 3:52 PM, Anna Schumaker <anna@kernel.org> wrote: > > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com> > > > > Chuck had suggested reverting READ_PLUS so it returns a single DATA > > segment covering the requested read range. This prepares the server for > > a future "sparse read" function so support can easily be added without > > needing to rip out the old READ_PLUS code at the same time. > > > > Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> > > --- > > fs/nfsd/nfs4xdr.c | 139 +++++++++++----------------------------------- > > 1 file changed, 32 insertions(+), 107 deletions(-) > > > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c > > index 1e9690a061ec..bcc8c385faf2 100644 > > --- a/fs/nfsd/nfs4xdr.c > > +++ b/fs/nfsd/nfs4xdr.c > > @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, > > > > static __be32 > > nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, > > - struct nfsd4_read *read, > > - unsigned long *maxcount, u32 *eof, > > - loff_t *pos) > > + struct nfsd4_read *read) > > { > > - struct xdr_stream *xdr = resp->xdr; > > + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); > > struct file *file = read->rd_nf->nf_file; > > - int starting_len = xdr->buf->len; > > - loff_t hole_pos; > > - __be32 nfserr; > > - __be32 *p, tmp; > > - __be64 tmp64; > > - > > - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE); > > - if (hole_pos > read->rd_offset) > > - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset); > > - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len)); > > + struct xdr_stream *xdr = resp->xdr; > > + unsigned long maxcount; > > + __be32 nfserr, *p; > > > > /* Content type, offset, byte count */ > > p = xdr_reserve_space(xdr, 4 + 8 + 4); > > if (!p) > > - return nfserr_resource; > > + return nfserr_io; > > Wouldn't nfserr_rep_too_big be a more appropriate status for running > off the end of the send buffer? I'm not 100% sure, but I would expect > that exhausting send buffer space would imply the reply has grown too > large. I can switch it to that, no problem. > > > > + if (resp->xdr->buf->page_len && splice_ok) { > > + WARN_ON_ONCE(splice_ok); > > + return nfserr_io; > > + } > > I wish I understood why this test was needed. It seems to have been > copied and pasted from historic code into nfsd4_encode_read(), and > there have been recent mechanical changes to it, but there's no > comment explaining it there... Yeah, I saw this was in the read code and assumed it was an important check so I added it here too. > > In any event, this seems to be checking for a server software bug, > so maybe this should return nfserr_serverfault. Oddly that status > code isn't defined yet. Do you want me to add that code and return it in this patch? > > > Do you have some performance results for v2? Not yet, I have it running now so hopefully I'll have something ready by tomorrow morning. Anna > > > > - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount); > > - if (read->rd_vlen < 0) > > - return nfserr_resource; > > + maxcount = min_t(unsigned long, read->rd_length, > > + (xdr->buf->buflen - xdr->buf->len)); > > > > - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, > > - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof); > > + if (file->f_op->splice_read && splice_ok) > > + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); > > + else > > + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); > > if (nfserr) > > return nfserr; > > - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount)); > > > > - tmp = htonl(NFS4_CONTENT_DATA); > > - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); > > - tmp64 = cpu_to_be64(read->rd_offset); > > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8); > > - tmp = htonl(*maxcount); > > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4); > > - > > - tmp = xdr_zero; > > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp, > > - xdr_pad_size(*maxcount)); > > - return nfs_ok; > > -} > > - > > -static __be32 > > -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, > > - struct nfsd4_read *read, > > - unsigned long *maxcount, u32 *eof) > > -{ > > - struct file *file = read->rd_nf->nf_file; > > - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA); > > - loff_t f_size = i_size_read(file_inode(file)); > > - unsigned long count; > > - __be32 *p; > > - > > - if (data_pos == -ENXIO) > > - data_pos = f_size; > > - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE)) > > - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size); > > - count = data_pos - read->rd_offset; > > - > > - /* Content type, offset, byte count */ > > - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); > > - if (!p) > > - return nfserr_resource; > > - > > - *p++ = htonl(NFS4_CONTENT_HOLE); > > + *p++ = cpu_to_be32(NFS4_CONTENT_DATA); > > p = xdr_encode_hyper(p, read->rd_offset); > > - p = xdr_encode_hyper(p, count); > > + *p = cpu_to_be32(read->rd_length); > > > > - *eof = (read->rd_offset + count) >= f_size; > > - *maxcount = min_t(unsigned long, count, *maxcount); > > return nfs_ok; > > } > > > > @@ -4811,69 +4769,36 @@ static __be32 > > nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, > > struct nfsd4_read *read) > > { > > - unsigned long maxcount, count; > > + struct file *file = read->rd_nf->nf_file; > > struct xdr_stream *xdr = resp->xdr; > > - struct file *file; > > int starting_len = xdr->buf->len; > > - int last_segment = xdr->buf->len; > > - int segments = 0; > > - __be32 *p, tmp; > > - bool is_data; > > - loff_t pos; > > - u32 eof; > > + u32 segments = 0; > > + __be32 *p; > > > > if (nfserr) > > return nfserr; > > - file = read->rd_nf->nf_file; > > > > /* eof flag, segment count */ > > p = xdr_reserve_space(xdr, 4 + 4); > > if (!p) > > - return nfserr_resource; > > + return nfserr_io; > > xdr_commit_encode(xdr); > > > > - maxcount = min_t(unsigned long, read->rd_length, > > - (xdr->buf->buflen - xdr->buf->len)); > > - count = maxcount; > > - > > - eof = read->rd_offset >= i_size_read(file_inode(file)); > > - if (eof) > > + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file)); > > + if (read->rd_eof) > > goto out; > > > > - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); > > - is_data = pos > read->rd_offset; > > - > > - while (count > 0 && !eof) { > > - maxcount = count; > > - if (is_data) > > - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, > > - segments == 0 ? &pos : NULL); > > - else > > - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); > > - if (nfserr) > > - goto out; > > - count -= maxcount; > > - read->rd_offset += maxcount; > > - is_data = !is_data; > > - last_segment = xdr->buf->len; > > - segments++; > > - } > > - > > -out: > > - if (nfserr && segments == 0) > > + nfserr = nfsd4_encode_read_plus_data(resp, read); > > + if (nfserr) { > > xdr_truncate_encode(xdr, starting_len); > > - else { > > - if (nfserr) { > > - xdr_truncate_encode(xdr, last_segment); > > - nfserr = nfs_ok; > > - eof = 0; > > - } > > - tmp = htonl(eof); > > - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); > > - tmp = htonl(segments); > > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); > > + return nfserr; > > } > > > > + segments++; > > + > > +out: > > + p = xdr_encode_bool(p, read->rd_eof); > > + *p = cpu_to_be32(segments); > > return nfserr; > > } > > > > -- > > 2.37.2 > > > > -- > Chuck Lever > > >
> On Sep 7, 2022, at 4:37 PM, Anna Schumaker <anna@kernel.org> wrote: > > On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III <chuck.lever@oracle.com> wrote: >> >> Be sure to Cc: Jeff on these. Thanks! >> >> >>> On Sep 7, 2022, at 3:52 PM, Anna Schumaker <anna@kernel.org> wrote: >>> >>> From: Anna Schumaker <Anna.Schumaker@Netapp.com> >>> >>> Chuck had suggested reverting READ_PLUS so it returns a single DATA >>> segment covering the requested read range. This prepares the server for >>> a future "sparse read" function so support can easily be added without >>> needing to rip out the old READ_PLUS code at the same time. >>> >>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> >>> --- >>> fs/nfsd/nfs4xdr.c | 139 +++++++++++----------------------------------- >>> 1 file changed, 32 insertions(+), 107 deletions(-) >>> >>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c >>> index 1e9690a061ec..bcc8c385faf2 100644 >>> --- a/fs/nfsd/nfs4xdr.c >>> +++ b/fs/nfsd/nfs4xdr.c >>> @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, >>> >>> static __be32 >>> nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, >>> - struct nfsd4_read *read, >>> - unsigned long *maxcount, u32 *eof, >>> - loff_t *pos) >>> + struct nfsd4_read *read) >>> { >>> - struct xdr_stream *xdr = resp->xdr; >>> + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); >>> struct file *file = read->rd_nf->nf_file; >>> - int starting_len = xdr->buf->len; >>> - loff_t hole_pos; >>> - __be32 nfserr; >>> - __be32 *p, tmp; >>> - __be64 tmp64; >>> - >>> - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE); >>> - if (hole_pos > read->rd_offset) >>> - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset); >>> - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len)); >>> + struct xdr_stream *xdr = resp->xdr; >>> + unsigned long maxcount; >>> + __be32 nfserr, *p; >>> >>> /* Content type, offset, byte count */ >>> p = xdr_reserve_space(xdr, 4 + 8 + 4); >>> if (!p) >>> - return nfserr_resource; >>> + return nfserr_io; >> >> Wouldn't nfserr_rep_too_big be a more appropriate status for running >> off the end of the send buffer? I'm not 100% sure, but I would expect >> that exhausting send buffer space would imply the reply has grown too >> large. > > I can switch it to that, no problem. I would like to hear opinions from protocol experts before we go with that choice. >>> + if (resp->xdr->buf->page_len && splice_ok) { >>> + WARN_ON_ONCE(splice_ok); >>> + return nfserr_io; >>> + } >> >> I wish I understood why this test was needed. It seems to have been >> copied and pasted from historic code into nfsd4_encode_read(), and >> there have been recent mechanical changes to it, but there's no >> comment explaining it there... > > Yeah, I saw this was in the read code and assumed it was an important > check so I added it here too. >> >> In any event, this seems to be checking for a server software bug, >> so maybe this should return nfserr_serverfault. Oddly that status >> code isn't defined yet. > > Do you want me to add that code and return it in this patch? Sure. Make that a predecessor patch and fix up the code in nfsd4_encode_read() before using it for READ_PLUS in this patch. Suggested patch description: RESOURCE is the wrong status code here because a) This encoder is shared between NFSv4.0 and NFSv4.1+, and NFSv4.1+ doesn't support RESOURCE, and b) That status code might cause the client to retry, in which case it will get the same failure because this check seems to be looking for a server-side coding mistake. >> Do you have some performance results for v2? > > Not yet, I have it running now so hopefully I'll have something ready > by tomorrow morning. Great, thanks! > Anna >> >> >>> - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount); >>> - if (read->rd_vlen < 0) >>> - return nfserr_resource; >>> + maxcount = min_t(unsigned long, read->rd_length, >>> + (xdr->buf->buflen - xdr->buf->len)); >>> >>> - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, >>> - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof); >>> + if (file->f_op->splice_read && splice_ok) >>> + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); >>> + else >>> + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); >>> if (nfserr) >>> return nfserr; >>> - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount)); >>> >>> - tmp = htonl(NFS4_CONTENT_DATA); >>> - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); >>> - tmp64 = cpu_to_be64(read->rd_offset); >>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8); >>> - tmp = htonl(*maxcount); >>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4); >>> - >>> - tmp = xdr_zero; >>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp, >>> - xdr_pad_size(*maxcount)); >>> - return nfs_ok; >>> -} >>> - >>> -static __be32 >>> -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, >>> - struct nfsd4_read *read, >>> - unsigned long *maxcount, u32 *eof) >>> -{ >>> - struct file *file = read->rd_nf->nf_file; >>> - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA); >>> - loff_t f_size = i_size_read(file_inode(file)); >>> - unsigned long count; >>> - __be32 *p; >>> - >>> - if (data_pos == -ENXIO) >>> - data_pos = f_size; >>> - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE)) >>> - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size); >>> - count = data_pos - read->rd_offset; >>> - >>> - /* Content type, offset, byte count */ >>> - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); >>> - if (!p) >>> - return nfserr_resource; >>> - >>> - *p++ = htonl(NFS4_CONTENT_HOLE); >>> + *p++ = cpu_to_be32(NFS4_CONTENT_DATA); >>> p = xdr_encode_hyper(p, read->rd_offset); >>> - p = xdr_encode_hyper(p, count); >>> + *p = cpu_to_be32(read->rd_length); >>> >>> - *eof = (read->rd_offset + count) >= f_size; >>> - *maxcount = min_t(unsigned long, count, *maxcount); >>> return nfs_ok; >>> } >>> >>> @@ -4811,69 +4769,36 @@ static __be32 >>> nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, >>> struct nfsd4_read *read) >>> { >>> - unsigned long maxcount, count; >>> + struct file *file = read->rd_nf->nf_file; >>> struct xdr_stream *xdr = resp->xdr; >>> - struct file *file; >>> int starting_len = xdr->buf->len; >>> - int last_segment = xdr->buf->len; >>> - int segments = 0; >>> - __be32 *p, tmp; >>> - bool is_data; >>> - loff_t pos; >>> - u32 eof; >>> + u32 segments = 0; >>> + __be32 *p; >>> >>> if (nfserr) >>> return nfserr; >>> - file = read->rd_nf->nf_file; >>> >>> /* eof flag, segment count */ >>> p = xdr_reserve_space(xdr, 4 + 4); >>> if (!p) >>> - return nfserr_resource; >>> + return nfserr_io; >>> xdr_commit_encode(xdr); >>> >>> - maxcount = min_t(unsigned long, read->rd_length, >>> - (xdr->buf->buflen - xdr->buf->len)); >>> - count = maxcount; >>> - >>> - eof = read->rd_offset >= i_size_read(file_inode(file)); >>> - if (eof) >>> + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file)); >>> + if (read->rd_eof) >>> goto out; >>> >>> - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); >>> - is_data = pos > read->rd_offset; >>> - >>> - while (count > 0 && !eof) { >>> - maxcount = count; >>> - if (is_data) >>> - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, >>> - segments == 0 ? &pos : NULL); >>> - else >>> - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); >>> - if (nfserr) >>> - goto out; >>> - count -= maxcount; >>> - read->rd_offset += maxcount; >>> - is_data = !is_data; >>> - last_segment = xdr->buf->len; >>> - segments++; >>> - } >>> - >>> -out: >>> - if (nfserr && segments == 0) >>> + nfserr = nfsd4_encode_read_plus_data(resp, read); >>> + if (nfserr) { >>> xdr_truncate_encode(xdr, starting_len); >>> - else { >>> - if (nfserr) { >>> - xdr_truncate_encode(xdr, last_segment); >>> - nfserr = nfs_ok; >>> - eof = 0; >>> - } >>> - tmp = htonl(eof); >>> - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); >>> - tmp = htonl(segments); >>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); >>> + return nfserr; >>> } >>> >>> + segments++; >>> + >>> +out: >>> + p = xdr_encode_bool(p, read->rd_eof); >>> + *p = cpu_to_be32(segments); >>> return nfserr; >>> } >>> >>> -- >>> 2.37.2 >>> >> >> -- >> Chuck Lever -- Chuck Lever
On Wed, 2022-09-07 at 20:51 +0000, Chuck Lever III wrote: > > > > On Sep 7, 2022, at 4:37 PM, Anna Schumaker <anna@kernel.org> wrote: > > > > On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III > > <chuck.lever@oracle.com> wrote: > > > > > > Be sure to Cc: Jeff on these. Thanks! > > > > > > > > > > On Sep 7, 2022, at 3:52 PM, Anna Schumaker <anna@kernel.org> > > > > wrote: > > > > > > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com> > > > > > > > > Chuck had suggested reverting READ_PLUS so it returns a single > > > > DATA > > > > segment covering the requested read range. This prepares the > > > > server for > > > > a future "sparse read" function so support can easily be added > > > > without > > > > needing to rip out the old READ_PLUS code at the same time. > > > > > > > > Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> > > > > --- > > > > fs/nfsd/nfs4xdr.c | 139 +++++++++++---------------------------- > > > > ------- > > > > 1 file changed, 32 insertions(+), 107 deletions(-) > > > > > > > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c > > > > index 1e9690a061ec..bcc8c385faf2 100644 > > > > --- a/fs/nfsd/nfs4xdr.c > > > > +++ b/fs/nfsd/nfs4xdr.c > > > > @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct > > > > nfsd4_compoundres *resp, __be32 nfserr, > > > > > > > > static __be32 > > > > nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, > > > > - struct nfsd4_read *read, > > > > - unsigned long *maxcount, u32 *eof, > > > > - loff_t *pos) > > > > + struct nfsd4_read *read) > > > > { > > > > - struct xdr_stream *xdr = resp->xdr; > > > > + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp- > > > > >rq_flags); > > > > struct file *file = read->rd_nf->nf_file; > > > > - int starting_len = xdr->buf->len; > > > > - loff_t hole_pos; > > > > - __be32 nfserr; > > > > - __be32 *p, tmp; > > > > - __be64 tmp64; > > > > - > > > > - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, > > > > SEEK_HOLE); > > > > - if (hole_pos > read->rd_offset) > > > > - *maxcount = min_t(unsigned long, *maxcount, > > > > hole_pos - read->rd_offset); > > > > - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf- > > > > >buflen - xdr->buf->len)); > > > > + struct xdr_stream *xdr = resp->xdr; > > > > + unsigned long maxcount; > > > > + __be32 nfserr, *p; > > > > > > > > /* Content type, offset, byte count */ > > > > p = xdr_reserve_space(xdr, 4 + 8 + 4); > > > > if (!p) > > > > - return nfserr_resource; > > > > + return nfserr_io; > > > > > > Wouldn't nfserr_rep_too_big be a more appropriate status for > > > running > > > off the end of the send buffer? I'm not 100% sure, but I would > > > expect > > > that exhausting send buffer space would imply the reply has grown > > > too > > > large. > > > > I can switch it to that, no problem. > > I would like to hear opinions from protocol experts before we go > with that choice. I'd agree that NFS4ERR_REP_TOO_BIG is correct if you're not able to even return a short read. However if you can return a short read, then that's better than returning an error. It looks to me as if this function can bit hit in both cases, so perhaps some care is in order. > > > > > + if (resp->xdr->buf->page_len && splice_ok) { > > > > + WARN_ON_ONCE(splice_ok); > > > > + return nfserr_io; > > > > + } > > > > > > I wish I understood why this test was needed. It seems to have > > > been > > > copied and pasted from historic code into nfsd4_encode_read(), > > > and > > > there have been recent mechanical changes to it, but there's no > > > comment explaining it there... > > > > Yeah, I saw this was in the read code and assumed it was an > > important > > check so I added it here too. > > > > > > In any event, this seems to be checking for a server software > > > bug, > > > so maybe this should return nfserr_serverfault. Oddly that status > > > code isn't defined yet. > > > > Do you want me to add that code and return it in this patch? > > Sure. Make that a predecessor patch and fix up the code in > nfsd4_encode_read() before using it for READ_PLUS in this patch. > > Suggested patch description: > > RESOURCE is the wrong status code here because > > a) This encoder is shared between NFSv4.0 and NFSv4.1+, and > NFSv4.1+ doesn't support RESOURCE, and Is it? I thought it was READ_PLUS only. That's NFSv4.2 or higher. If the encoder is to be shared with both READ and READ_PLUS, then perhaps it might be wiser to choose a name other than nfsd4_encode_read_plus_data()? > b) That status code might cause the client to retry, in which > case it will get the same failure because this check seems > to be looking for a server-side coding mistake. > > > > > Do you have some performance results for v2? > > > > Not yet, I have it running now so hopefully I'll have something > > ready > > by tomorrow morning. > > Great, thanks! > > > > Anna > > > > > > > > > > - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp- > > > > >rq_vec, *maxcount); > > > > - if (read->rd_vlen < 0) > > > > - return nfserr_resource; > > > > + maxcount = min_t(unsigned long, read->rd_length, > > > > + (xdr->buf->buflen - xdr->buf->len)); > > > > > > > > - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, > > > > read->rd_offset, > > > > - resp->rqstp->rq_vec, read->rd_vlen, > > > > maxcount, eof); > > > > + if (file->f_op->splice_read && splice_ok) > > > > + nfserr = nfsd4_encode_splice_read(resp, read, > > > > file, maxcount); > > > > + else > > > > + nfserr = nfsd4_encode_readv(resp, read, file, > > > > maxcount); > > > > if (nfserr) > > > > return nfserr; > > > > - xdr_truncate_encode(xdr, starting_len + 16 + > > > > xdr_align_size(*maxcount)); > > > > > > > > - tmp = htonl(NFS4_CONTENT_DATA); > > > > - write_bytes_to_xdr_buf(xdr->buf, starting_len, > > > > &tmp, 4); > > > > - tmp64 = cpu_to_be64(read->rd_offset); > > > > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, > > > > &tmp64, 8); > > > > - tmp = htonl(*maxcount); > > > > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, > > > > &tmp, 4); > > > > - > > > > - tmp = xdr_zero; > > > > - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + > > > > *maxcount, &tmp, > > > > - xdr_pad_size(*maxcount)); > > > > - return nfs_ok; > > > > -} > > > > - > > > > -static __be32 > > > > -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, > > > > - struct nfsd4_read *read, > > > > - unsigned long *maxcount, u32 *eof) > > > > -{ > > > > - struct file *file = read->rd_nf->nf_file; > > > > - loff_t data_pos = vfs_llseek(file, read->rd_offset, > > > > SEEK_DATA); > > > > - loff_t f_size = i_size_read(file_inode(file)); > > > > - unsigned long count; > > > > - __be32 *p; > > > > - > > > > - if (data_pos == -ENXIO) > > > > - data_pos = f_size; > > > > - else if (data_pos <= read->rd_offset || (data_pos < > > > > f_size && data_pos % PAGE_SIZE)) > > > > - return nfsd4_encode_read_plus_data(resp, read, > > > > maxcount, eof, &f_size); > > > > - count = data_pos - read->rd_offset; > > > > - > > > > - /* Content type, offset, byte count */ > > > > - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); > > > > - if (!p) > > > > - return nfserr_resource; > > > > - > > > > - *p++ = htonl(NFS4_CONTENT_HOLE); > > > > + *p++ = cpu_to_be32(NFS4_CONTENT_DATA); > > > > p = xdr_encode_hyper(p, read->rd_offset); > > > > - p = xdr_encode_hyper(p, count); > > > > + *p = cpu_to_be32(read->rd_length); > > > > > > > > - *eof = (read->rd_offset + count) >= f_size; > > > > - *maxcount = min_t(unsigned long, count, *maxcount); > > > > return nfs_ok; > > > > } > > > > > > > > @@ -4811,69 +4769,36 @@ static __be32 > > > > nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 > > > > nfserr, > > > > struct nfsd4_read *read) > > > > { > > > > - unsigned long maxcount, count; > > > > + struct file *file = read->rd_nf->nf_file; > > > > struct xdr_stream *xdr = resp->xdr; > > > > - struct file *file; > > > > int starting_len = xdr->buf->len; > > > > - int last_segment = xdr->buf->len; > > > > - int segments = 0; > > > > - __be32 *p, tmp; > > > > - bool is_data; > > > > - loff_t pos; > > > > - u32 eof; > > > > + u32 segments = 0; > > > > + __be32 *p; > > > > > > > > if (nfserr) > > > > return nfserr; > > > > - file = read->rd_nf->nf_file; > > > > > > > > /* eof flag, segment count */ > > > > p = xdr_reserve_space(xdr, 4 + 4); > > > > if (!p) > > > > - return nfserr_resource; > > > > + return nfserr_io; > > > > xdr_commit_encode(xdr); > > > > > > > > - maxcount = min_t(unsigned long, read->rd_length, > > > > - (xdr->buf->buflen - xdr->buf->len)); > > > > - count = maxcount; > > > > - > > > > - eof = read->rd_offset >= i_size_read(file_inode(file)); > > > > - if (eof) > > > > + read->rd_eof = read->rd_offset >= > > > > i_size_read(file_inode(file)); > > > > + if (read->rd_eof) > > > > goto out; > > > > > > > > - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); > > > > - is_data = pos > read->rd_offset; > > > > - > > > > - while (count > 0 && !eof) { > > > > - maxcount = count; > > > > - if (is_data) > > > > - nfserr = > > > > nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, > > > > - segments == 0 ? > > > > &pos : NULL); > > > > - else > > > > - nfserr = > > > > nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); > > > > - if (nfserr) > > > > - goto out; > > > > - count -= maxcount; > > > > - read->rd_offset += maxcount; > > > > - is_data = !is_data; > > > > - last_segment = xdr->buf->len; > > > > - segments++; > > > > - } > > > > - > > > > -out: > > > > - if (nfserr && segments == 0) > > > > + nfserr = nfsd4_encode_read_plus_data(resp, read); > > > > + if (nfserr) { > > > > xdr_truncate_encode(xdr, starting_len); > > > > - else { > > > > - if (nfserr) { > > > > - xdr_truncate_encode(xdr, last_segment); > > > > - nfserr = nfs_ok; > > > > - eof = 0; > > > > - } > > > > - tmp = htonl(eof); > > > > - write_bytes_to_xdr_buf(xdr->buf, > > > > starting_len, &tmp, 4); > > > > - tmp = htonl(segments); > > > > - write_bytes_to_xdr_buf(xdr->buf, starting_len + > > > > 4, &tmp, 4); > > > > + return nfserr; > > > > } > > > > > > > > + segments++; > > > > + > > > > +out: > > > > + p = xdr_encode_bool(p, read->rd_eof); > > > > + *p = cpu_to_be32(segments); > > > > return nfserr; > > > > } > > > > > > > > -- > > > > 2.37.2 > > > > > > > > > > -- > > > Chuck Lever > > -- > Chuck Lever > > >
> On Sep 7, 2022, at 6:33 PM, Trond Myklebust <trondmy@hammerspace.com> wrote: > > On Wed, 2022-09-07 at 20:51 +0000, Chuck Lever III wrote: >> >> >>> On Sep 7, 2022, at 4:37 PM, Anna Schumaker <anna@kernel.org> wrote: >>> >>> On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III >>> <chuck.lever@oracle.com> wrote: >>>> >>>> Be sure to Cc: Jeff on these. Thanks! >>>> >>>> >>>>> On Sep 7, 2022, at 3:52 PM, Anna Schumaker <anna@kernel.org> >>>>> wrote: >>>>> >>>>> From: Anna Schumaker <Anna.Schumaker@Netapp.com> >>>>> >>>>> Chuck had suggested reverting READ_PLUS so it returns a single >>>>> DATA >>>>> segment covering the requested read range. This prepares the >>>>> server for >>>>> a future "sparse read" function so support can easily be added >>>>> without >>>>> needing to rip out the old READ_PLUS code at the same time. >>>>> >>>>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> >>>>> --- >>>>> fs/nfsd/nfs4xdr.c | 139 +++++++++++---------------------------- >>>>> ------- >>>>> 1 file changed, 32 insertions(+), 107 deletions(-) >>>>> >>>>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c >>>>> index 1e9690a061ec..bcc8c385faf2 100644 >>>>> --- a/fs/nfsd/nfs4xdr.c >>>>> +++ b/fs/nfsd/nfs4xdr.c >>>>> @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct >>>>> nfsd4_compoundres *resp, __be32 nfserr, >>>>> >>>>> static __be32 >>>>> nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, >>>>> - struct nfsd4_read *read, >>>>> - unsigned long *maxcount, u32 *eof, >>>>> - loff_t *pos) >>>>> + struct nfsd4_read *read) >>>>> { >>>>> - struct xdr_stream *xdr = resp->xdr; >>>>> + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp- >>>>>> rq_flags); >>>>> struct file *file = read->rd_nf->nf_file; >>>>> - int starting_len = xdr->buf->len; >>>>> - loff_t hole_pos; >>>>> - __be32 nfserr; >>>>> - __be32 *p, tmp; >>>>> - __be64 tmp64; >>>>> - >>>>> - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, >>>>> SEEK_HOLE); >>>>> - if (hole_pos > read->rd_offset) >>>>> - *maxcount = min_t(unsigned long, *maxcount, >>>>> hole_pos - read->rd_offset); >>>>> - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf- >>>>>> buflen - xdr->buf->len)); >>>>> + struct xdr_stream *xdr = resp->xdr; >>>>> + unsigned long maxcount; >>>>> + __be32 nfserr, *p; >>>>> >>>>> /* Content type, offset, byte count */ >>>>> p = xdr_reserve_space(xdr, 4 + 8 + 4); >>>>> if (!p) >>>>> - return nfserr_resource; >>>>> + return nfserr_io; >>>> >>>> Wouldn't nfserr_rep_too_big be a more appropriate status for >>>> running >>>> off the end of the send buffer? I'm not 100% sure, but I would >>>> expect >>>> that exhausting send buffer space would imply the reply has grown >>>> too >>>> large. >>> >>> I can switch it to that, no problem. >> >> I would like to hear opinions from protocol experts before we go >> with that choice. > > I'd agree that NFS4ERR_REP_TOO_BIG is correct if you're not able to > even return a short read. However if you can return a short read, then > that's better than returning an error. Many thanks for reviewing! In general, I might want to convert all NFSv4 encoders to return either RESOURCE or REP_TOO_BIG when xdr_reserve_space() fails. I can post a patch or two that makes that attempt so the special cases can be sorted on the mailing list. > It looks to me as if this function can bit hit in both cases, so > perhaps some care is in order. Intriguing idea. For READ, if xdr_reserve_space() fails, that's all she wrote. I think all these calls happen before the data payload is encoded. For READ_PLUS, if xdr_reserve_space() fails, it might be possible to use xdr_truncate_encode() or simply start over with a limit on the number of segments to be encoded. We're not really there yet, as currently we want to return just a single segment at this point. I feel the question is whether it's practical or a frequent enough occurrence to bother with the special cases. Encoding a READ_PLUS response is already challenging. >>>>> + if (resp->xdr->buf->page_len && splice_ok) { >>>>> + WARN_ON_ONCE(splice_ok); >>>>> + return nfserr_io; >>>>> + } >>>> >>>> I wish I understood why this test was needed. It seems to have >>>> been >>>> copied and pasted from historic code into nfsd4_encode_read(), >>>> and >>>> there have been recent mechanical changes to it, but there's no >>>> comment explaining it there... >>> >>> Yeah, I saw this was in the read code and assumed it was an >>> important >>> check so I added it here too. >>>> >>>> In any event, this seems to be checking for a server software >>>> bug, >>>> so maybe this should return nfserr_serverfault. Oddly that status >>>> code isn't defined yet. >>> >>> Do you want me to add that code and return it in this patch? >> >> Sure. Make that a predecessor patch and fix up the code in >> nfsd4_encode_read() before using it for READ_PLUS in this patch. >> >> Suggested patch description: >> >> RESOURCE is the wrong status code here because >> >> a) This encoder is shared between NFSv4.0 and NFSv4.1+, and >> NFSv4.1+ doesn't support RESOURCE, and > > Is it? I thought it was READ_PLUS only. That's NFSv4.2 or higher. For the initial patch that adds nfserr_serverfault, I was speaking only about nfsd4_encode_read(), which is shared amongst all minor versions. That's where the page_len && splice_ok test originally came from. Sorry that wasn't clear. > If the encoder is to be shared with both READ and READ_PLUS, then > perhaps it might be wiser to choose a name other than > nfsd4_encode_read_plus_data()? Although the encoded streams are very similar, there are still separate encoders for a READ_PLUS data segment and a full READ response. The common pieces for both of those operations are nfsd4_encode_readv() and nfsd4_encode_splice_read(). IIUC nfsd4_encode_read_plus_data() has to deal with encoding the the NFS4_CONTENT_DATA enumerator -- it handles a single READ_PLUS segment. nfsd4_encode_read() encodes one complete READ response. I'm comfortable with the current function names. >> b) That status code might cause the client to retry, in which >> case it will get the same failure because this check seems >> to be looking for a server-side coding mistake. >> >> >>>> Do you have some performance results for v2? >>> >>> Not yet, I have it running now so hopefully I'll have something >>> ready >>> by tomorrow morning. >> >> Great, thanks! >> >> >>> Anna >>>> >>>> >>>>> - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp- >>>>>> rq_vec, *maxcount); >>>>> - if (read->rd_vlen < 0) >>>>> - return nfserr_resource; >>>>> + maxcount = min_t(unsigned long, read->rd_length, >>>>> + (xdr->buf->buflen - xdr->buf->len)); >>>>> >>>>> - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, >>>>> read->rd_offset, >>>>> - resp->rqstp->rq_vec, read->rd_vlen, >>>>> maxcount, eof); >>>>> + if (file->f_op->splice_read && splice_ok) >>>>> + nfserr = nfsd4_encode_splice_read(resp, read, >>>>> file, maxcount); >>>>> + else >>>>> + nfserr = nfsd4_encode_readv(resp, read, file, >>>>> maxcount); >>>>> if (nfserr) >>>>> return nfserr; >>>>> - xdr_truncate_encode(xdr, starting_len + 16 + >>>>> xdr_align_size(*maxcount)); >>>>> >>>>> - tmp = htonl(NFS4_CONTENT_DATA); >>>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len, >>>>> &tmp, 4); >>>>> - tmp64 = cpu_to_be64(read->rd_offset); >>>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, >>>>> &tmp64, 8); >>>>> - tmp = htonl(*maxcount); >>>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, >>>>> &tmp, 4); >>>>> - >>>>> - tmp = xdr_zero; >>>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + >>>>> *maxcount, &tmp, >>>>> - xdr_pad_size(*maxcount)); >>>>> - return nfs_ok; >>>>> -} >>>>> - >>>>> -static __be32 >>>>> -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, >>>>> - struct nfsd4_read *read, >>>>> - unsigned long *maxcount, u32 *eof) >>>>> -{ >>>>> - struct file *file = read->rd_nf->nf_file; >>>>> - loff_t data_pos = vfs_llseek(file, read->rd_offset, >>>>> SEEK_DATA); >>>>> - loff_t f_size = i_size_read(file_inode(file)); >>>>> - unsigned long count; >>>>> - __be32 *p; >>>>> - >>>>> - if (data_pos == -ENXIO) >>>>> - data_pos = f_size; >>>>> - else if (data_pos <= read->rd_offset || (data_pos < >>>>> f_size && data_pos % PAGE_SIZE)) >>>>> - return nfsd4_encode_read_plus_data(resp, read, >>>>> maxcount, eof, &f_size); >>>>> - count = data_pos - read->rd_offset; >>>>> - >>>>> - /* Content type, offset, byte count */ >>>>> - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); >>>>> - if (!p) >>>>> - return nfserr_resource; >>>>> - >>>>> - *p++ = htonl(NFS4_CONTENT_HOLE); >>>>> + *p++ = cpu_to_be32(NFS4_CONTENT_DATA); >>>>> p = xdr_encode_hyper(p, read->rd_offset); >>>>> - p = xdr_encode_hyper(p, count); >>>>> + *p = cpu_to_be32(read->rd_length); >>>>> >>>>> - *eof = (read->rd_offset + count) >= f_size; >>>>> - *maxcount = min_t(unsigned long, count, *maxcount); >>>>> return nfs_ok; >>>>> } >>>>> >>>>> @@ -4811,69 +4769,36 @@ static __be32 >>>>> nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 >>>>> nfserr, >>>>> struct nfsd4_read *read) >>>>> { >>>>> - unsigned long maxcount, count; >>>>> + struct file *file = read->rd_nf->nf_file; >>>>> struct xdr_stream *xdr = resp->xdr; >>>>> - struct file *file; >>>>> int starting_len = xdr->buf->len; >>>>> - int last_segment = xdr->buf->len; >>>>> - int segments = 0; >>>>> - __be32 *p, tmp; >>>>> - bool is_data; >>>>> - loff_t pos; >>>>> - u32 eof; >>>>> + u32 segments = 0; >>>>> + __be32 *p; >>>>> >>>>> if (nfserr) >>>>> return nfserr; >>>>> - file = read->rd_nf->nf_file; >>>>> >>>>> /* eof flag, segment count */ >>>>> p = xdr_reserve_space(xdr, 4 + 4); >>>>> if (!p) >>>>> - return nfserr_resource; >>>>> + return nfserr_io; >>>>> xdr_commit_encode(xdr); >>>>> >>>>> - maxcount = min_t(unsigned long, read->rd_length, >>>>> - (xdr->buf->buflen - xdr->buf->len)); >>>>> - count = maxcount; >>>>> - >>>>> - eof = read->rd_offset >= i_size_read(file_inode(file)); >>>>> - if (eof) >>>>> + read->rd_eof = read->rd_offset >= >>>>> i_size_read(file_inode(file)); >>>>> + if (read->rd_eof) >>>>> goto out; >>>>> >>>>> - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); >>>>> - is_data = pos > read->rd_offset; >>>>> - >>>>> - while (count > 0 && !eof) { >>>>> - maxcount = count; >>>>> - if (is_data) >>>>> - nfserr = >>>>> nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, >>>>> - segments == 0 ? >>>>> &pos : NULL); >>>>> - else >>>>> - nfserr = >>>>> nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); >>>>> - if (nfserr) >>>>> - goto out; >>>>> - count -= maxcount; >>>>> - read->rd_offset += maxcount; >>>>> - is_data = !is_data; >>>>> - last_segment = xdr->buf->len; >>>>> - segments++; >>>>> - } >>>>> - >>>>> -out: >>>>> - if (nfserr && segments == 0) >>>>> + nfserr = nfsd4_encode_read_plus_data(resp, read); >>>>> + if (nfserr) { >>>>> xdr_truncate_encode(xdr, starting_len); >>>>> - else { >>>>> - if (nfserr) { >>>>> - xdr_truncate_encode(xdr, last_segment); >>>>> - nfserr = nfs_ok; >>>>> - eof = 0; >>>>> - } >>>>> - tmp = htonl(eof); >>>>> - write_bytes_to_xdr_buf(xdr->buf, >>>>> starting_len, &tmp, 4); >>>>> - tmp = htonl(segments); >>>>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + >>>>> 4, &tmp, 4); >>>>> + return nfserr; >>>>> } >>>>> >>>>> + segments++; >>>>> + >>>>> +out: >>>>> + p = xdr_encode_bool(p, read->rd_eof); >>>>> + *p = cpu_to_be32(segments); >>>>> return nfserr; >>>>> } >>>>> >>>>> -- >>>>> 2.37.2 >>>>> >>>> >>>> -- >>>> Chuck Lever >> >> -- >> Chuck Lever >> >> >> > > -- > Trond Myklebust > Linux NFS client maintainer, Hammerspace > trond.myklebust@hammerspace.com -- Chuck Lever
On Thu, 2022-09-08 at 15:36 +0000, Chuck Lever III wrote: > > > > On Sep 7, 2022, at 6:33 PM, Trond Myklebust > > <trondmy@hammerspace.com> wrote: > > > > On Wed, 2022-09-07 at 20:51 +0000, Chuck Lever III wrote: > > > > > > > > > > On Sep 7, 2022, at 4:37 PM, Anna Schumaker <anna@kernel.org> > > > > wrote: > > > > > > > > On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III > > > > <chuck.lever@oracle.com> wrote: > > > > > > > > > > Be sure to Cc: Jeff on these. Thanks! > > > > > > > > > > > > > > > > On Sep 7, 2022, at 3:52 PM, Anna Schumaker > > > > > > <anna@kernel.org> > > > > > > wrote: > > > > > > > > > > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com> > > > > > > > > > > > > Chuck had suggested reverting READ_PLUS so it returns a > > > > > > single > > > > > > DATA > > > > > > segment covering the requested read range. This prepares > > > > > > the > > > > > > server for > > > > > > a future "sparse read" function so support can easily be > > > > > > added > > > > > > without > > > > > > needing to rip out the old READ_PLUS code at the same time. > > > > > > > > > > > > Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> > > > > > > --- > > > > > > fs/nfsd/nfs4xdr.c | 139 +++++++++++------------------------ > > > > > > ---- > > > > > > ------- > > > > > > 1 file changed, 32 insertions(+), 107 deletions(-) > > > > > > > > > > > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c > > > > > > index 1e9690a061ec..bcc8c385faf2 100644 > > > > > > --- a/fs/nfsd/nfs4xdr.c > > > > > > +++ b/fs/nfsd/nfs4xdr.c > > > > > > @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct > > > > > > nfsd4_compoundres *resp, __be32 nfserr, > > > > > > > > > > > > static __be32 > > > > > > nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, > > > > > > - struct nfsd4_read *read, > > > > > > - unsigned long *maxcount, u32 > > > > > > *eof, > > > > > > - loff_t *pos) > > > > > > + struct nfsd4_read *read) > > > > > > { > > > > > > - struct xdr_stream *xdr = resp->xdr; > > > > > > + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp- > > > > > > > rq_flags); > > > > > > struct file *file = read->rd_nf->nf_file; > > > > > > - int starting_len = xdr->buf->len; > > > > > > - loff_t hole_pos; > > > > > > - __be32 nfserr; > > > > > > - __be32 *p, tmp; > > > > > > - __be64 tmp64; > > > > > > - > > > > > > - hole_pos = pos ? *pos : vfs_llseek(file, read- > > > > > > >rd_offset, > > > > > > SEEK_HOLE); > > > > > > - if (hole_pos > read->rd_offset) > > > > > > - *maxcount = min_t(unsigned long, *maxcount, > > > > > > hole_pos - read->rd_offset); > > > > > > - *maxcount = min_t(unsigned long, *maxcount, (xdr- > > > > > > >buf- > > > > > > > buflen - xdr->buf->len)); > > > > > > + struct xdr_stream *xdr = resp->xdr; > > > > > > + unsigned long maxcount; > > > > > > + __be32 nfserr, *p; > > > > > > > > > > > > /* Content type, offset, byte count */ > > > > > > p = xdr_reserve_space(xdr, 4 + 8 + 4); > > > > > > if (!p) > > > > > > - return nfserr_resource; > > > > > > + return nfserr_io; > > > > > > > > > > Wouldn't nfserr_rep_too_big be a more appropriate status for > > > > > running > > > > > off the end of the send buffer? I'm not 100% sure, but I > > > > > would > > > > > expect > > > > > that exhausting send buffer space would imply the reply has > > > > > grown > > > > > too > > > > > large. > > > > > > > > I can switch it to that, no problem. > > > > > > I would like to hear opinions from protocol experts before we go > > > with that choice. > > > > I'd agree that NFS4ERR_REP_TOO_BIG is correct if you're not able to > > even return a short read. However if you can return a short read, > > then > > that's better than returning an error. > > Many thanks for reviewing! > > In general, I might want to convert all NFSv4 encoders to return > either RESOURCE or REP_TOO_BIG when xdr_reserve_space() fails. I > can post a patch or two that makes that attempt so the special > cases can be sorted on the mailing list. > > > > It looks to me as if this function can bit hit in both cases, so > > perhaps some care is in order. > > Intriguing idea. > > For READ, if xdr_reserve_space() fails, that's all she wrote. > I think all these calls happen before the data payload is encoded. > > For READ_PLUS, if xdr_reserve_space() fails, it might be possible > to use xdr_truncate_encode() or simply start over with a limit on > the number of segments to be encoded. We're not really there yet, > as currently we want to return just a single segment at this > point. > > I feel the question is whether it's practical or a frequent > enough occurrence to bother with the special cases. Encoding a > READ_PLUS response is already challenging. What I'm saying is that you are more likely to hit this issue when you have a reply with something like "<data><hole><data>"... If the second "<data>" chunk hits the above xdr_reserve_space() limit (i.e. the first call to nfsd4_encode_read_plus_data() succeeds as does the call to nfsd4_encode_read_plus_hole()), then you just want to return a short read of the form "<data><hole>" instead of returning an error on the whole READ_PLUS operation.
> On Sep 7, 2022, at 4:37 PM, Anna Schumaker <anna@kernel.org> wrote: > > On Wed, Sep 7, 2022 at 4:29 PM Chuck Lever III <chuck.lever@oracle.com> wrote: >> >> Be sure to Cc: Jeff on these. Thanks! >> >> >>> On Sep 7, 2022, at 3:52 PM, Anna Schumaker <anna@kernel.org> wrote: >>> >>> From: Anna Schumaker <Anna.Schumaker@Netapp.com> >>> >>> Chuck had suggested reverting READ_PLUS so it returns a single DATA >>> segment covering the requested read range. This prepares the server for >>> a future "sparse read" function so support can easily be added without >>> needing to rip out the old READ_PLUS code at the same time. >>> >>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> >>> --- >>> fs/nfsd/nfs4xdr.c | 139 +++++++++++----------------------------------- >>> 1 file changed, 32 insertions(+), 107 deletions(-) >>> >>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c >>> index 1e9690a061ec..bcc8c385faf2 100644 >>> --- a/fs/nfsd/nfs4xdr.c >>> +++ b/fs/nfsd/nfs4xdr.c >>> @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, >>> >>> static __be32 >>> nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, >>> - struct nfsd4_read *read, >>> - unsigned long *maxcount, u32 *eof, >>> - loff_t *pos) >>> + struct nfsd4_read *read) >>> { >>> - struct xdr_stream *xdr = resp->xdr; >>> + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); >>> struct file *file = read->rd_nf->nf_file; >>> - int starting_len = xdr->buf->len; >>> - loff_t hole_pos; >>> - __be32 nfserr; >>> - __be32 *p, tmp; >>> - __be64 tmp64; >>> - >>> - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE); >>> - if (hole_pos > read->rd_offset) >>> - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset); >>> - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len)); >>> + struct xdr_stream *xdr = resp->xdr; >>> + unsigned long maxcount; >>> + __be32 nfserr, *p; >>> >>> /* Content type, offset, byte count */ >>> p = xdr_reserve_space(xdr, 4 + 8 + 4); >>> if (!p) >>> - return nfserr_resource; >>> + return nfserr_io; >> >> Wouldn't nfserr_rep_too_big be a more appropriate status for running >> off the end of the send buffer? I'm not 100% sure, but I would expect >> that exhausting send buffer space would imply the reply has grown too >> large. > > I can switch it to that, no problem. OK, never mind. I see that nfsd4_encode_compound() handles the status code conversion for every encoder, and deals with reply caching too: 5349 if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) { 5350 struct nfsd4_slot *slot = resp->cstate.slot; 5351 5352 if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) 5353 op->status = nfserr_rep_too_big_to_cache; 5354 else 5355 op->status = nfserr_rep_too_big; 5356 } So returning nfserr_resource from the READ_PLUS encoder when xdr_reserve_space() fails is copacetic and preferred. Then, once READ_PLUS can return multiple segments again, it should deal with send buffer space exhaustion by truncating the reply at the last properly encoded segment, as Trond suggested. >>> + if (resp->xdr->buf->page_len && splice_ok) { >>> + WARN_ON_ONCE(splice_ok); >>> + return nfserr_io; >>> + } >> >> I wish I understood why this test was needed. It seems to have been >> copied and pasted from historic code into nfsd4_encode_read(), and >> there have been recent mechanical changes to it, but there's no >> comment explaining it there... > > Yeah, I saw this was in the read code and assumed it was an important > check so I added it here too. >> >> In any event, this seems to be checking for a server software bug, >> so maybe this should return nfserr_serverfault. Oddly that status >> code isn't defined yet. > > Do you want me to add that code and return it in this patch? I would still like to return serverfault if the splice check fails. >> Do you have some performance results for v2? > > Not yet, I have it running now so hopefully I'll have something ready > by tomorrow morning. > > Anna >> >> >>> - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount); >>> - if (read->rd_vlen < 0) >>> - return nfserr_resource; >>> + maxcount = min_t(unsigned long, read->rd_length, >>> + (xdr->buf->buflen - xdr->buf->len)); >>> >>> - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, >>> - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof); >>> + if (file->f_op->splice_read && splice_ok) >>> + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); >>> + else >>> + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); >>> if (nfserr) >>> return nfserr; >>> - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount)); >>> >>> - tmp = htonl(NFS4_CONTENT_DATA); >>> - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); >>> - tmp64 = cpu_to_be64(read->rd_offset); >>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8); >>> - tmp = htonl(*maxcount); >>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4); >>> - >>> - tmp = xdr_zero; >>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp, >>> - xdr_pad_size(*maxcount)); >>> - return nfs_ok; >>> -} >>> - >>> -static __be32 >>> -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, >>> - struct nfsd4_read *read, >>> - unsigned long *maxcount, u32 *eof) >>> -{ >>> - struct file *file = read->rd_nf->nf_file; >>> - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA); >>> - loff_t f_size = i_size_read(file_inode(file)); >>> - unsigned long count; >>> - __be32 *p; >>> - >>> - if (data_pos == -ENXIO) >>> - data_pos = f_size; >>> - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE)) >>> - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size); >>> - count = data_pos - read->rd_offset; >>> - >>> - /* Content type, offset, byte count */ >>> - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); >>> - if (!p) >>> - return nfserr_resource; >>> - >>> - *p++ = htonl(NFS4_CONTENT_HOLE); >>> + *p++ = cpu_to_be32(NFS4_CONTENT_DATA); >>> p = xdr_encode_hyper(p, read->rd_offset); >>> - p = xdr_encode_hyper(p, count); >>> + *p = cpu_to_be32(read->rd_length); >>> >>> - *eof = (read->rd_offset + count) >= f_size; >>> - *maxcount = min_t(unsigned long, count, *maxcount); >>> return nfs_ok; >>> } >>> >>> @@ -4811,69 +4769,36 @@ static __be32 >>> nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, >>> struct nfsd4_read *read) >>> { >>> - unsigned long maxcount, count; >>> + struct file *file = read->rd_nf->nf_file; >>> struct xdr_stream *xdr = resp->xdr; >>> - struct file *file; >>> int starting_len = xdr->buf->len; >>> - int last_segment = xdr->buf->len; >>> - int segments = 0; >>> - __be32 *p, tmp; >>> - bool is_data; >>> - loff_t pos; >>> - u32 eof; >>> + u32 segments = 0; >>> + __be32 *p; >>> >>> if (nfserr) >>> return nfserr; >>> - file = read->rd_nf->nf_file; >>> >>> /* eof flag, segment count */ >>> p = xdr_reserve_space(xdr, 4 + 4); >>> if (!p) >>> - return nfserr_resource; >>> + return nfserr_io; >>> xdr_commit_encode(xdr); >>> >>> - maxcount = min_t(unsigned long, read->rd_length, >>> - (xdr->buf->buflen - xdr->buf->len)); >>> - count = maxcount; >>> - >>> - eof = read->rd_offset >= i_size_read(file_inode(file)); >>> - if (eof) >>> + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file)); >>> + if (read->rd_eof) >>> goto out; >>> >>> - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); >>> - is_data = pos > read->rd_offset; >>> - >>> - while (count > 0 && !eof) { >>> - maxcount = count; >>> - if (is_data) >>> - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, >>> - segments == 0 ? &pos : NULL); >>> - else >>> - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); >>> - if (nfserr) >>> - goto out; >>> - count -= maxcount; >>> - read->rd_offset += maxcount; >>> - is_data = !is_data; >>> - last_segment = xdr->buf->len; >>> - segments++; >>> - } >>> - >>> -out: >>> - if (nfserr && segments == 0) >>> + nfserr = nfsd4_encode_read_plus_data(resp, read); >>> + if (nfserr) { >>> xdr_truncate_encode(xdr, starting_len); >>> - else { >>> - if (nfserr) { >>> - xdr_truncate_encode(xdr, last_segment); >>> - nfserr = nfs_ok; >>> - eof = 0; >>> - } >>> - tmp = htonl(eof); >>> - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); >>> - tmp = htonl(segments); >>> - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); >>> + return nfserr; >>> } >>> >>> + segments++; >>> + >>> +out: >>> + p = xdr_encode_bool(p, read->rd_eof); >>> + *p = cpu_to_be32(segments); >>> return nfserr; >>> } >>> >>> -- >>> 2.37.2 >>> >> >> -- >> Chuck Lever -- Chuck Lever
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1e9690a061ec..bcc8c385faf2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4731,79 +4731,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, - struct nfsd4_read *read, - unsigned long *maxcount, u32 *eof, - loff_t *pos) + struct nfsd4_read *read) { - struct xdr_stream *xdr = resp->xdr; + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); struct file *file = read->rd_nf->nf_file; - int starting_len = xdr->buf->len; - loff_t hole_pos; - __be32 nfserr; - __be32 *p, tmp; - __be64 tmp64; - - hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE); - if (hole_pos > read->rd_offset) - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset); - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len)); + struct xdr_stream *xdr = resp->xdr; + unsigned long maxcount; + __be32 nfserr, *p; /* Content type, offset, byte count */ p = xdr_reserve_space(xdr, 4 + 8 + 4); if (!p) - return nfserr_resource; + return nfserr_io; + if (resp->xdr->buf->page_len && splice_ok) { + WARN_ON_ONCE(splice_ok); + return nfserr_io; + } - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount); - if (read->rd_vlen < 0) - return nfserr_resource; + maxcount = min_t(unsigned long, read->rd_length, + (xdr->buf->buflen - xdr->buf->len)); - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof); + if (file->f_op->splice_read && splice_ok) + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); + else + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); if (nfserr) return nfserr; - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount)); - tmp = htonl(NFS4_CONTENT_DATA); - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); - tmp64 = cpu_to_be64(read->rd_offset); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8); - tmp = htonl(*maxcount); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4); - - tmp = xdr_zero; - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp, - xdr_pad_size(*maxcount)); - return nfs_ok; -} - -static __be32 -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, - struct nfsd4_read *read, - unsigned long *maxcount, u32 *eof) -{ - struct file *file = read->rd_nf->nf_file; - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA); - loff_t f_size = i_size_read(file_inode(file)); - unsigned long count; - __be32 *p; - - if (data_pos == -ENXIO) - data_pos = f_size; - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE)) - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size); - count = data_pos - read->rd_offset; - - /* Content type, offset, byte count */ - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); - if (!p) - return nfserr_resource; - - *p++ = htonl(NFS4_CONTENT_HOLE); + *p++ = cpu_to_be32(NFS4_CONTENT_DATA); p = xdr_encode_hyper(p, read->rd_offset); - p = xdr_encode_hyper(p, count); + *p = cpu_to_be32(read->rd_length); - *eof = (read->rd_offset + count) >= f_size; - *maxcount = min_t(unsigned long, count, *maxcount); return nfs_ok; } @@ -4811,69 +4769,36 @@ static __be32 nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_read *read) { - unsigned long maxcount, count; + struct file *file = read->rd_nf->nf_file; struct xdr_stream *xdr = resp->xdr; - struct file *file; int starting_len = xdr->buf->len; - int last_segment = xdr->buf->len; - int segments = 0; - __be32 *p, tmp; - bool is_data; - loff_t pos; - u32 eof; + u32 segments = 0; + __be32 *p; if (nfserr) return nfserr; - file = read->rd_nf->nf_file; /* eof flag, segment count */ p = xdr_reserve_space(xdr, 4 + 4); if (!p) - return nfserr_resource; + return nfserr_io; xdr_commit_encode(xdr); - maxcount = min_t(unsigned long, read->rd_length, - (xdr->buf->buflen - xdr->buf->len)); - count = maxcount; - - eof = read->rd_offset >= i_size_read(file_inode(file)); - if (eof) + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file)); + if (read->rd_eof) goto out; - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); - is_data = pos > read->rd_offset; - - while (count > 0 && !eof) { - maxcount = count; - if (is_data) - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, - segments == 0 ? &pos : NULL); - else - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); - if (nfserr) - goto out; - count -= maxcount; - read->rd_offset += maxcount; - is_data = !is_data; - last_segment = xdr->buf->len; - segments++; - } - -out: - if (nfserr && segments == 0) + nfserr = nfsd4_encode_read_plus_data(resp, read); + if (nfserr) { xdr_truncate_encode(xdr, starting_len); - else { - if (nfserr) { - xdr_truncate_encode(xdr, last_segment); - nfserr = nfs_ok; - eof = 0; - } - tmp = htonl(eof); - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); - tmp = htonl(segments); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); + return nfserr; } + segments++; + +out: + p = xdr_encode_bool(p, read->rd_eof); + *p = cpu_to_be32(segments); return nfserr; }