
[v3,6/6] NFSD: Repeal and replace the READ_PLUS implementation

Message ID 20220715184433.838521-7-anna@kernel.org (mailing list archive)
State New, archived
Series NFSD: Improvements for the NFSv4.2 READ_PLUS operation

Commit Message

Anna Schumaker July 15, 2022, 6:44 p.m. UTC
From: Anna Schumaker <Anna.Schumaker@Netapp.com>

Rather than relying on the underlying filesystem to tell us where hole
and data segments are through vfs_llseek(), let's instead do the hole
compression ourselves. This has a few advantages over the old
implementation:

1) A single call to the underlying filesystem through nfsd_readv() means
   the file can't change from underneath us in the middle of encoding.
2) A single call to the underlying filesystem also means that the
   underlying filesystem only needs to synchronize cached and on-disk
   data one time instead of potentially many times, speeding up the reply.
3) Hole support for filesystems that don't support SEEK_HOLE and SEEK_DATA

I also included an optimization where we can cut down on the amount of
memory being shifted around by doing the compression as (hole, data)
pairs.
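
To illustrate the idea, here is a rough userspace sketch of the per-page
classification and merging (chunk_is_hole() stands in for the kernel's
memchr_inv(); the real encoder in the diff below works on the xdr_buf
pages instead):

    #include <stdio.h>
    #include <string.h>
    #include <stddef.h>

    #define CHUNK 4096				/* stands in for PAGE_SIZE */

    /* A chunk that contains only zero bytes is a candidate HOLE. */
    static int chunk_is_hole(const unsigned char *p, size_t len)
    {
    	size_t i;

    	for (i = 0; i < len; i++)
    		if (p[i] != 0)
    			return 0;
    	return 1;
    }

    /* Walk the read buffer chunk by chunk, merging neighbours of the same type. */
    static void emit_segments(const unsigned char *buf, size_t len)
    {
    	size_t off = 0, seg_start = 0;
    	int seg_hole = -1;

    	while (off < len) {
    		size_t n = (len - off < CHUNK) ? len - off : CHUNK;
    		int hole = chunk_is_hole(buf + off, n);

    		if (hole != seg_hole) {
    			if (seg_hole >= 0)
    				printf("%s offset=%zu length=%zu\n",
    				       seg_hole ? "HOLE" : "DATA",
    				       seg_start, off - seg_start);
    			seg_hole = hole;
    			seg_start = off;
    		}
    		off += n;
    	}
    	if (seg_hole >= 0)
    		printf("%s offset=%zu length=%zu\n",
    		       seg_hole ? "HOLE" : "DATA", seg_start, len - seg_start);
    }

    int main(void)
    {
    	static unsigned char buf[4 * CHUNK];	/* zero-initialized */

    	memset(buf, 'A', CHUNK);		/* page 0: data */
    	memset(buf + 3 * CHUNK, 'B', CHUNK);	/* pages 1-2: hole, page 3: data */
    	emit_segments(buf, sizeof(buf));	/* prints DATA, HOLE, DATA */
    	return 0;
    }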

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfsd/nfs4xdr.c | 219 +++++++++++++++++++++++++---------------------
 1 file changed, 119 insertions(+), 100 deletions(-)

Comments

Chuck Lever III July 15, 2022, 7:08 p.m. UTC | #1
> On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
> 
> From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> 
> Rather than relying on the underlying filesystem to tell us where hole
> and data segments are through vfs_llseek(), let's instead do the hole
> compression ourselves. This has a few advantages over the old
> implementation:
> 
> 1) A single call to the underlying filesystem through nfsd_readv() means
>   the file can't change from underneath us in the middle of encoding.
> 2) A single call to the underlying filesystem also means that the
>   underlying filesystem only needs to synchronize cached and on-disk
>   data one time instead of potentially many times, speeding up the reply.
> 3) Hole support for filesystems that don't support SEEK_HOLE and SEEK_DATA

Thanks for addressing my cosmetic comments! Looks good.

I'm still not clear why NFSD needs to support filesystems that
do not support SEEK_HOLE/DATA. I'm guessing this is just a side
benefit of the memchr_inv approach below, not really a goal of
this overhaul?

One more comment below.


> I also included an optimization where we can cut down on the amount of
> memory being shifted around by doing the compression as (hole, data)
> pairs.
> 
> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> ---
> fs/nfsd/nfs4xdr.c | 219 +++++++++++++++++++++++++---------------------
> 1 file changed, 119 insertions(+), 100 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 61b2aae81abb..df8289fce4ef 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4731,81 +4731,138 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
> 	return nfserr;
> }
> 
> +struct read_plus_segment {
> +	enum data_content4	rp_type;
> +	u64			rp_offset;
> +	u64			rp_length;
> +	unsigned int		rp_page_pos;
> +};
> +
> static __be32
> -nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
> -			    struct nfsd4_read *read,
> -			    unsigned long *maxcount, u32 *eof,
> -			    loff_t *pos)
> +nfsd4_read_plus_readv(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
> +		      unsigned long *maxcount, u32 *eof)
> {
> 	struct xdr_stream *xdr = resp->xdr;
> -	struct file *file = read->rd_nf->nf_file;
> -	int starting_len = xdr->buf->len;
> -	loff_t hole_pos;
> -	__be32 nfserr;
> -	__be32 *p, tmp;
> -	__be64 tmp64;
> -
> -	hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> -	if (hole_pos > read->rd_offset)
> -		*maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
> -	*maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
> -
> -	/* Content type, offset, byte count */
> -	p = xdr_reserve_space(xdr, 4 + 8 + 4);
> -	if (!p)
> -		return nfserr_resource;
> +	unsigned int starting_len = xdr->buf->len;
> +	__be32 nfserr, zero = xdr_zero;
> +	unsigned int pad;
> 
> +	/*
> +	 * Reserve the maximum amount of space needed to craft a READ_PLUS
> +	 * reply. The call to xdr_reserve_space_vec() switches us to the
> +	 * xdr->pages, which we then read file data into before analyzing
> +	 * the individual segments.
> +	 */
> 	read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
> 	if (read->rd_vlen < 0)
> 		return nfserr_resource;
> 
> -	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
> -			    resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
> +	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, read->rd_nf->nf_file,
> +			    read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
> +			    maxcount, eof);
> 	if (nfserr)
> 		return nfserr;
> -	xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
> +	xdr_truncate_encode(xdr, starting_len + xdr_align_size(*maxcount));
> 
> -	tmp = htonl(NFS4_CONTENT_DATA);
> -	write_bytes_to_xdr_buf(xdr->buf, starting_len,      &tmp,   4);
> -	tmp64 = cpu_to_be64(read->rd_offset);
> -	write_bytes_to_xdr_buf(xdr->buf, starting_len + 4,  &tmp64, 8);
> -	tmp = htonl(*maxcount);
> -	write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp,   4);
> -
> -	tmp = xdr_zero;
> -	write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
> -			       xdr_pad_size(*maxcount));
> +	pad = xdr_pad_size(*maxcount);
> +	write_bytes_to_xdr_buf(xdr->buf, starting_len + *maxcount, &zero, pad);
> 	return nfs_ok;
> }
> 
> +/**
> + * nfsd4_encode_read_plus_segment - Encode a single READ_PLUS segment
> + * @xdr: pointer to an xdr_stream
> + * @segment: pointer to a single segment
> + * @bufpos: xdr_stream offset to place the segment
> + * @segments: pointer to the total number of segments seen
> + *
> + * Performs surgery on the xdr_stream to compress out HOLE segments and
> + * to place DATA segments in the proper place.
> + */
> +static void
> +nfsd4_encode_read_plus_segment(struct xdr_stream *xdr,
> +			       struct read_plus_segment *segment,
> +			       unsigned int *bufpos, unsigned int *segments)
> +{
> +	struct xdr_buf *buf = xdr->buf;
> +
> +	xdr_encode_word(buf, *bufpos, segment->rp_type);
> +	xdr_encode_double(buf, *bufpos + XDR_UNIT, segment->rp_offset);
> +	*bufpos += 3 * XDR_UNIT;
> +
> +	if (segment->rp_type == NFS4_CONTENT_HOLE) {
> +		xdr_encode_double(buf, *bufpos, segment->rp_length);
> +		*bufpos += 2 * XDR_UNIT;
> +	} else {
> +		size_t align = xdr_align_size(segment->rp_length);
> +		xdr_encode_word(buf, *bufpos, segment->rp_length);
> +		if (*segments == 0)
> +			xdr_buf_trim_head(buf, XDR_UNIT);
> +
> +		xdr_stream_move_subsegment(xdr,
> +				buf->head[0].iov_len + segment->rp_page_pos,
> +				*bufpos + XDR_UNIT, align);
> +		*bufpos += XDR_UNIT + align;
> +	}
> +
> +	*segments += 1;
> +}
> +
> static __be32
> -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
> -			    struct nfsd4_read *read,
> -			    unsigned long *maxcount, u32 *eof)
> +nfsd4_encode_read_plus_segments(struct nfsd4_compoundres *resp,
> +				struct nfsd4_read *read,
> +				unsigned int *segments, u32 *eof)
> {
> -	struct file *file = read->rd_nf->nf_file;
> -	loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
> -	loff_t f_size = i_size_read(file_inode(file));
> -	unsigned long count;
> -	__be32 *p;
> +	struct xdr_stream *xdr = resp->xdr;
> +	unsigned int bufpos = xdr->buf->len;
> +	u64 offset = read->rd_offset;
> +	struct read_plus_segment segment;
> +	enum data_content4 pagetype;
> +	unsigned long maxcount;
> +	unsigned int pagenum = 0;
> +	unsigned int pagelen;
> +	char *vpage, *p;
> +	__be32 nfserr;
> 
> -	if (data_pos == -ENXIO)
> -		data_pos = f_size;
> -	else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
> -		return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
> -	count = data_pos - read->rd_offset;
> -
> -	/* Content type, offset, byte count */
> -	p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
> -	if (!p)
> +	/* enough space for a HOLE segment before we switch to the pages */
> +	if (!xdr_reserve_space(xdr, 5 * XDR_UNIT))
> 		return nfserr_resource;
> +	xdr_commit_encode(xdr);
> 
> -	*p++ = htonl(NFS4_CONTENT_HOLE);
> -	p = xdr_encode_hyper(p, read->rd_offset);
> -	p = xdr_encode_hyper(p, count);
> +	maxcount = min_t(unsigned long, read->rd_length,
> +			 (xdr->buf->buflen - xdr->buf->len));
> 
> -	*eof = (read->rd_offset + count) >= f_size;
> -	*maxcount = min_t(unsigned long, count, *maxcount);
> +	nfserr = nfsd4_read_plus_readv(resp, read, &maxcount, eof);
> +	if (nfserr)
> +		return nfserr;
> +
> +	while (maxcount > 0) {
> +		vpage = xdr_buf_nth_page_address(xdr->buf, pagenum, &pagelen);
> +		pagelen = min_t(unsigned int, pagelen, maxcount);
> +		if (!vpage || pagelen == 0)
> +			break;
> +		p = memchr_inv(vpage, 0, pagelen);

I'm still not happy about touching every byte in each READ_PLUS
payload. I think even though the rest of this work is merge-ready,
this is a brute-force mechanism that's OK for a proof of concept
but not appropriate for production-ready code.

I've cc'd linux-fsdevel to see if we can get some more ideas going
and move this forward.

Another thought I had was to support returning only one or two
segments per reply. One CONTENT segment, one HOLE segment, or one
of each. Would that be enough to prevent the issues around file
updates racing with the construction of the reply?

Just trying to think outside the box. I think efficiency is the
point of READ_PLUS; touching every byte in the payload misses
that goal IMO.


> +		pagetype = (p == NULL) ? NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA;
> +
> +		if (pagetype != segment.rp_type || pagenum == 0) {
> +			if (likely(pagenum > 0)) {
> +				nfsd4_encode_read_plus_segment(xdr, &segment,
> +							      &bufpos, segments);
> +				offset += segment.rp_length;
> +			}
> +			segment.rp_type = pagetype;
> +			segment.rp_offset = offset;
> +			segment.rp_length = pagelen;
> +			segment.rp_page_pos = pagenum * PAGE_SIZE;
> +		} else
> +			segment.rp_length += pagelen;
> +
> +		maxcount -= pagelen;
> +		pagenum++;
> +	}
> +
> +	nfsd4_encode_read_plus_segment(xdr, &segment, &bufpos, segments);
> +	xdr_truncate_encode(xdr, bufpos);
> 	return nfs_ok;
> }
> 
> @@ -4813,69 +4870,31 @@ static __be32
> nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
> 		       struct nfsd4_read *read)
> {
> -	unsigned long maxcount, count;
> 	struct xdr_stream *xdr = resp->xdr;
> -	struct file *file;
> 	int starting_len = xdr->buf->len;
> -	int last_segment = xdr->buf->len;
> -	int segments = 0;
> -	__be32 *p, tmp;
> -	bool is_data;
> -	loff_t pos;
> +	unsigned int segments = 0;
> 	u32 eof;
> 
> 	if (nfserr)
> 		return nfserr;
> -	file = read->rd_nf->nf_file;
> 
> 	/* eof flag, segment count */
> -	p = xdr_reserve_space(xdr, 4 + 4);
> -	if (!p)
> +	if (!xdr_reserve_space(xdr, 2 * XDR_UNIT))
> 		return nfserr_resource;
> 	xdr_commit_encode(xdr);
> 
> -	maxcount = min_t(unsigned long, read->rd_length,
> -			 (xdr->buf->buflen - xdr->buf->len));
> -	count    = maxcount;
> -
> -	eof = read->rd_offset >= i_size_read(file_inode(file));
> +	eof = read->rd_offset >= i_size_read(file_inode(read->rd_nf->nf_file));
> 	if (eof)
> 		goto out;
> 
> -	pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> -	is_data = pos > read->rd_offset;
> -
> -	while (count > 0 && !eof) {
> -		maxcount = count;
> -		if (is_data)
> -			nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
> -						segments == 0 ? &pos : NULL);
> -		else
> -			nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
> -		if (nfserr)
> -			goto out;
> -		count -= maxcount;
> -		read->rd_offset += maxcount;
> -		is_data = !is_data;
> -		last_segment = xdr->buf->len;
> -		segments++;
> -	}
> -
> +	nfserr = nfsd4_encode_read_plus_segments(resp, read, &segments, &eof);
> out:
> -	if (nfserr && segments == 0)
> +	if (nfserr)
> 		xdr_truncate_encode(xdr, starting_len);
> 	else {
> -		if (nfserr) {
> -			xdr_truncate_encode(xdr, last_segment);
> -			nfserr = nfs_ok;
> -			eof = 0;
> -		}
> -		tmp = htonl(eof);
> -		write_bytes_to_xdr_buf(xdr->buf, starting_len,     &tmp, 4);
> -		tmp = htonl(segments);
> -		write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
> +		xdr_encode_word(xdr->buf, starting_len, eof);
> +		xdr_encode_word(xdr->buf, starting_len + XDR_UNIT, segments);
> 	}
> -
> 	return nfserr;
> }
> 
> -- 
> 2.37.1
> 

--
Chuck Lever
Dave Chinner July 18, 2022, 1:15 a.m. UTC | #2
On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
> > On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
> > 
> > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > 
> > Rather than relying on the underlying filesystem to tell us where hole
> > and data segments are through vfs_llseek(), let's instead do the hole
> > compression ourselves. This has a few advantages over the old
> > implementation:
> > 
> > 1) A single call to the underlying filesystem through nfsd_readv() means
> >   the file can't change from underneath us in the middle of encoding.

Hi Anna,

I'm assuming you mean the vfs_llseek(SEEK_HOLE) call at the start
of nfsd4_encode_read_plus_data() that is used to trim the data that
has already been read out of the file?

What's the problem with racing with a hole punch here? All it does
is shorten the read data returned to match the new hole, so all it's
doing is making the returned data "more correct".

OTOH, if something allocates over a hole that the read filled with
zeros, what's the problem with occasionally returning zeros as data?
Regardless, if this has raced with a write to the file that filled
that hole, we're already returning stale data/hole information to
the client regardless of whether we trim it or not....

i.e. I can't see a correctness or data integrity problem here that
doesn't already exist, and I have my doubts that hole
punching/filling racing with reads happens often enough to create a
performance or bandwidth problem OTW. Hence I've really got no idea
what the problem that needs to be solved here is.

Can you explain what the symptoms of the problem a user would see
that this change solves?

> > 2) A single call to the underlying filesystem also means that the
> >   underlying filesystem only needs to synchronize cached and on-disk
> >   data one time instead of potentially many times, speeding up the reply.

SEEK_HOLE/DATA doesn't require cached data to be sync'd to disk to
be coherent - that's only a problem FIEMAP has (and syncing cached
data doesn't fix the TOCTOU coherency issue!).  i.e. SEEK_HOLE/DATA
will check the page cache for data if appropriate (e.g. unwritten
disk extents may have data in memory over the top of them) instead
of syncing data to disk.
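
(For anyone who wants to see that behaviour, a quick userspace check -- the
file name is made up, and the result depends on the filesystem actually
implementing SEEK_HOLE/SEEK_DATA:)

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
    	char byte = 'x';
    	int fd = open("seektest.bin", O_CREAT | O_TRUNC | O_RDWR, 0644);

    	ftruncate(fd, 1 << 20);			/* 1MB sparse file */
    	pwrite(fd, &byte, 1, 512 * 1024);	/* dirty one page, no fsync */

    	/*
    	 * Nothing has been written back to disk yet, but SEEK_DATA is
    	 * still expected to land on the cached page at 512k rather than
    	 * reporting the whole file as one big hole.
    	 */
    	printf("first data at %lld\n", (long long)lseek(fd, 0, SEEK_DATA));

    	close(fd);
    	return 0;
    }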

> > 3) Hole support for filesystems that don't support SEEK_HOLE and SEEK_DATA
> 
> Thanks for addressing my cosmetic comments! Looks good.
> 
> I'm still not clear why NFSD needs to support filesystems that
> do not support SEEK_HOLE/DATA. I'm guessing this is just a side
> benefit of the memchr_inv approach below, not really a goal of
> this overhaul?

Don't other mechanisms in NFSv4 require SEEK_HOLE/SEEK_DATA support?
i.e. supporting a useful implementation of NFS4_CONTENT_DATA/
NFS4_CONTENT_HOLE across the wire?  Why doesn't the server-side
implementation use memchr_inv() rather than allowing filesystems to
optimise away the data read requirement for the operation?

Or don't we care because what is returned to the client is *always*
going to have races with whatever the client decides to do with
the information? If that's true, then why do we care if
nfsd4_encode_read_plus_data() may return some zeros instead of a
shorter read and a longer hole?

> One more comment below.
> 
> 
> > I also included an optimization where we can cut down on the amount of
> > memory being shifted around by doing the compression as (hole, data)
> > pairs.
> > 
> > Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > ---
> > fs/nfsd/nfs4xdr.c | 219 +++++++++++++++++++++++++---------------------
> > 1 file changed, 119 insertions(+), 100 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > index 61b2aae81abb..df8289fce4ef 100644
> > --- a/fs/nfsd/nfs4xdr.c
> > +++ b/fs/nfsd/nfs4xdr.c
> > @@ -4731,81 +4731,138 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
> > 	return nfserr;
> > }
> > 
> > +struct read_plus_segment {
> > +	enum data_content4	rp_type;
> > +	u64			rp_offset;
> > +	u64			rp_length;
> > +	unsigned int		rp_page_pos;
> > +};
> > +
> > static __be32
> > -nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
> > -			    struct nfsd4_read *read,
> > -			    unsigned long *maxcount, u32 *eof,
> > -			    loff_t *pos)
> > +nfsd4_read_plus_readv(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
> > +		      unsigned long *maxcount, u32 *eof)
> > {
> > 	struct xdr_stream *xdr = resp->xdr;
> > -	struct file *file = read->rd_nf->nf_file;
> > -	int starting_len = xdr->buf->len;
> > -	loff_t hole_pos;
> > -	__be32 nfserr;
> > -	__be32 *p, tmp;
> > -	__be64 tmp64;
> > -
> > -	hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> > -	if (hole_pos > read->rd_offset)
> > -		*maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
> > -	*maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
> > -
> > -	/* Content type, offset, byte count */
> > -	p = xdr_reserve_space(xdr, 4 + 8 + 4);
> > -	if (!p)
> > -		return nfserr_resource;
> > +	unsigned int starting_len = xdr->buf->len;
> > +	__be32 nfserr, zero = xdr_zero;
> > +	unsigned int pad;
> > 
> > +	/*
> > +	 * Reserve the maximum amount of space needed to craft a READ_PLUS
> > +	 * reply. The call to xdr_reserve_space_vec() switches us to the
> > +	 * xdr->pages, which we then read file data into before analyzing
> > +	 * the individual segments.
> > +	 */
> > 	read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
> > 	if (read->rd_vlen < 0)
> > 		return nfserr_resource;
> > 
> > -	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
> > -			    resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
> > +	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, read->rd_nf->nf_file,
> > +			    read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
> > +			    maxcount, eof);
> > 	if (nfserr)
> > 		return nfserr;
> > -	xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
> > +	xdr_truncate_encode(xdr, starting_len + xdr_align_size(*maxcount));
> > 
> > -	tmp = htonl(NFS4_CONTENT_DATA);
> > -	write_bytes_to_xdr_buf(xdr->buf, starting_len,      &tmp,   4);
> > -	tmp64 = cpu_to_be64(read->rd_offset);
> > -	write_bytes_to_xdr_buf(xdr->buf, starting_len + 4,  &tmp64, 8);
> > -	tmp = htonl(*maxcount);
> > -	write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp,   4);
> > -
> > -	tmp = xdr_zero;
> > -	write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
> > -			       xdr_pad_size(*maxcount));
> > +	pad = xdr_pad_size(*maxcount);
> > +	write_bytes_to_xdr_buf(xdr->buf, starting_len + *maxcount, &zero, pad);
> > 	return nfs_ok;
> > }
> > 
> > +/**
> > + * nfsd4_encode_read_plus_segment - Encode a single READ_PLUS segment
> > + * @xdr: pointer to an xdr_stream
> > + * @segment: pointer to a single segment
> > + * @bufpos: xdr_stream offset to place the segment
> > + * @segments: pointer to the total number of segments seen
> > + *
> > + * Performs surgery on the xdr_stream to compress out HOLE segments and
> > + * to place DATA segments in the proper place.
> > + */
> > +static void
> > +nfsd4_encode_read_plus_segment(struct xdr_stream *xdr,
> > +			       struct read_plus_segment *segment,
> > +			       unsigned int *bufpos, unsigned int *segments)
> > +{
> > +	struct xdr_buf *buf = xdr->buf;
> > +
> > +	xdr_encode_word(buf, *bufpos, segment->rp_type);
> > +	xdr_encode_double(buf, *bufpos + XDR_UNIT, segment->rp_offset);
> > +	*bufpos += 3 * XDR_UNIT;
> > +
> > +	if (segment->rp_type == NFS4_CONTENT_HOLE) {
> > +		xdr_encode_double(buf, *bufpos, segment->rp_length);
> > +		*bufpos += 2 * XDR_UNIT;
> > +	} else {
> > +		size_t align = xdr_align_size(segment->rp_length);
> > +		xdr_encode_word(buf, *bufpos, segment->rp_length);
> > +		if (*segments == 0)
> > +			xdr_buf_trim_head(buf, XDR_UNIT);
> > +
> > +		xdr_stream_move_subsegment(xdr,
> > +				buf->head[0].iov_len + segment->rp_page_pos,
> > +				*bufpos + XDR_UNIT, align);
> > +		*bufpos += XDR_UNIT + align;
> > +	}
> > +
> > +	*segments += 1;
> > +}
> > +
> > static __be32
> > -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
> > -			    struct nfsd4_read *read,
> > -			    unsigned long *maxcount, u32 *eof)
> > +nfsd4_encode_read_plus_segments(struct nfsd4_compoundres *resp,
> > +				struct nfsd4_read *read,
> > +				unsigned int *segments, u32 *eof)
> > {
> > -	struct file *file = read->rd_nf->nf_file;
> > -	loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
> > -	loff_t f_size = i_size_read(file_inode(file));
> > -	unsigned long count;
> > -	__be32 *p;
> > +	struct xdr_stream *xdr = resp->xdr;
> > +	unsigned int bufpos = xdr->buf->len;
> > +	u64 offset = read->rd_offset;
> > +	struct read_plus_segment segment;
> > +	enum data_content4 pagetype;
> > +	unsigned long maxcount;
> > +	unsigned int pagenum = 0;
> > +	unsigned int pagelen;
> > +	char *vpage, *p;
> > +	__be32 nfserr;
> > 
> > -	if (data_pos == -ENXIO)
> > -		data_pos = f_size;
> > -	else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
> > -		return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
> > -	count = data_pos - read->rd_offset;
> > -
> > -	/* Content type, offset, byte count */
> > -	p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
> > -	if (!p)
> > +	/* enough space for a HOLE segment before we switch to the pages */
> > +	if (!xdr_reserve_space(xdr, 5 * XDR_UNIT))
> > 		return nfserr_resource;
> > +	xdr_commit_encode(xdr);
> > 
> > -	*p++ = htonl(NFS4_CONTENT_HOLE);
> > -	p = xdr_encode_hyper(p, read->rd_offset);
> > -	p = xdr_encode_hyper(p, count);
> > +	maxcount = min_t(unsigned long, read->rd_length,
> > +			 (xdr->buf->buflen - xdr->buf->len));
> > 
> > -	*eof = (read->rd_offset + count) >= f_size;
> > -	*maxcount = min_t(unsigned long, count, *maxcount);
> > +	nfserr = nfsd4_read_plus_readv(resp, read, &maxcount, eof);
> > +	if (nfserr)
> > +		return nfserr;
> > +
> > +	while (maxcount > 0) {
> > +		vpage = xdr_buf_nth_page_address(xdr->buf, pagenum, &pagelen);
> > +		pagelen = min_t(unsigned int, pagelen, maxcount);
> > +		if (!vpage || pagelen == 0)
> > +			break;
> > +		p = memchr_inv(vpage, 0, pagelen);
> 
> I'm still not happy about touching every byte in each READ_PLUS
> payload. I think even though the rest of this work is merge-ready,
> this is a brute-force mechanism that's OK for a proof of concept
> but not appropriate for production-ready code.

Seems like a step backwards as it defeats the benefit zero-copy read
IO paths on the server side....

> I've cc'd linux-fsdevel to see if we can get some more ideas going
> and move this forward.
> 
> Another thought I had was to support returning only one or two
> segments per reply. One CONTENT segment, one HOLE segment, or one
> of each. Would that be enough to prevent the issues around file
> updates racing with the construction of the reply?

Before I can make any sort of useful suggestion, I need to have it
explained to me why we care if the underlying file mapping has
changed between the read of the data and the SEEK_HOLE trim check,
because it's not at all clear to me what problem this change is
actually solving.

Cheers,

Dave.
Chuck Lever III July 19, 2022, 5:21 p.m. UTC | #3
> On Jul 17, 2022, at 9:15 PM, Dave Chinner <david@fromorbit.com> wrote:
> 
> On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
>>> On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
>>> 
>>> +nfsd4_encode_read_plus_segments(struct nfsd4_compoundres *resp,
>>> +				struct nfsd4_read *read,
>>> +				unsigned int *segments, u32 *eof)
>>> {
>>> -	struct file *file = read->rd_nf->nf_file;
>>> -	loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
>>> -	loff_t f_size = i_size_read(file_inode(file));
>>> -	unsigned long count;
>>> -	__be32 *p;
>>> +	struct xdr_stream *xdr = resp->xdr;
>>> +	unsigned int bufpos = xdr->buf->len;
>>> +	u64 offset = read->rd_offset;
>>> +	struct read_plus_segment segment;
>>> +	enum data_content4 pagetype;
>>> +	unsigned long maxcount;
>>> +	unsigned int pagenum = 0;
>>> +	unsigned int pagelen;
>>> +	char *vpage, *p;
>>> +	__be32 nfserr;
>>> 
>>> -	if (data_pos == -ENXIO)
>>> -		data_pos = f_size;
>>> -	else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
>>> -		return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
>>> -	count = data_pos - read->rd_offset;
>>> -
>>> -	/* Content type, offset, byte count */
>>> -	p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
>>> -	if (!p)
>>> +	/* enough space for a HOLE segment before we switch to the pages */
>>> +	if (!xdr_reserve_space(xdr, 5 * XDR_UNIT))
>>> 		return nfserr_resource;
>>> +	xdr_commit_encode(xdr);
>>> 
>>> -	*p++ = htonl(NFS4_CONTENT_HOLE);
>>> -	p = xdr_encode_hyper(p, read->rd_offset);
>>> -	p = xdr_encode_hyper(p, count);
>>> +	maxcount = min_t(unsigned long, read->rd_length,
>>> +			 (xdr->buf->buflen - xdr->buf->len));
>>> 
>>> -	*eof = (read->rd_offset + count) >= f_size;
>>> -	*maxcount = min_t(unsigned long, count, *maxcount);
>>> +	nfserr = nfsd4_read_plus_readv(resp, read, &maxcount, eof);
>>> +	if (nfserr)
>>> +		return nfserr;
>>> +
>>> +	while (maxcount > 0) {
>>> +		vpage = xdr_buf_nth_page_address(xdr->buf, pagenum, &pagelen);
>>> +		pagelen = min_t(unsigned int, pagelen, maxcount);
>>> +		if (!vpage || pagelen == 0)
>>> +			break;
>>> +		p = memchr_inv(vpage, 0, pagelen);
>> 
>> I'm still not happy about touching every byte in each READ_PLUS
>> payload. I think even though the rest of this work is merge-ready,
>> this is a brute-force mechanism that's OK for a proof of concept
>> but not appropriate for production-ready code.
> 
> Seems like a step backwards as it defeats the benefit zero-copy read
> IO paths on the server side....

Tom Haynes' vision for READ_PLUS was to eventually replace the
legacy READ operation. That means READ_PLUS(CONTENT_DATA) needs
to be as fast and efficient as plain READ. (It would be great
to use splice reads for CONTENT_DATA if we can!)

But I also thought the purpose of READ_PLUS was to help clients
preserve unallocated extents in files during copy operations.
An unallocated extent is not the same as an allocated extent
that has zeroes written into it. IIUC this new logic does not
distinguish between those two cases at all. (And please correct
me if this is really not the goal of READ_PLUS).

I would like to retain precise detection of unallocated extents
in files. Maybe SEEK_HOLE/SEEK_DATA is not how to do that, but
my feeling is that checking for zero bytes is definitely not
the way to do it.
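
(To make the distinction concrete, a small sketch -- file names are made up,
and the exact offsets reported depend on the filesystem:)

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    static void first_hole(const char *tag, int fd)
    {
    	printf("%s: first hole at %lld\n", tag,
    	       (long long)lseek(fd, 0, SEEK_HOLE));
    }

    int main(void)
    {
    	static char zeroes[65536];	/* zero-initialized */
    	int fd1, fd2;

    	/* 64k of explicitly written zero bytes: an allocated extent. */
    	fd1 = open("zeroes.bin", O_CREAT | O_TRUNC | O_RDWR, 0644);
    	pwrite(fd1, zeroes, sizeof(zeroes), 0);

    	/* 64k sparse file: an unallocated extent with the same apparent content. */
    	fd2 = open("sparse.bin", O_CREAT | O_TRUNC | O_RDWR, 0644);
    	ftruncate(fd2, sizeof(zeroes));

    	/*
    	 * A byte scan (memchr_inv-style) sees both files as all zeroes.
    	 * SEEK_HOLE typically reports offset 0 for the sparse file but only
    	 * the implicit hole at EOF (65536) for the written-zeroes file.
    	 */
    	first_hole("written zeroes", fd1);
    	first_hole("sparse file   ", fd2);

    	close(fd1);
    	close(fd2);
    	return 0;
    }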


>> I've cc'd linux-fsdevel to see if we can get some more ideas going
>> and move this forward.
>> 
>> Another thought I had was to support returning only one or two
>> segments per reply. One CONTENT segment, one HOLE segment, or one
>> of each. Would that be enough to prevent the issues around file
>> updates racing with the construction of the reply?
> 
> Before I can make any sort of useful suggestion, I need to have it
> explained to me why we care if the underlying file mapping has
> changed between the read of the data and the SEEK_HOLE trim check,
> because it's not at all clear to me what problem this change is
> actually solving.

The cover letter for this series calls out generic/091 and
generic/263 -- it mentioned both are failing with NFSv4.2. I've
tried to reproduce failures here, but both passed.

Anna, can you provide a root-cause analysis of what is failing
in your testing? Maybe a reproducer for us to kick around?
I'm guessing you might be encountering races because your
usual test environment is virtual, so we need to understand
how timing effects the results.


--
Chuck Lever
Anna Schumaker July 19, 2022, 8:24 p.m. UTC | #4
On Tue, Jul 19, 2022 at 1:21 PM Chuck Lever III <chuck.lever@oracle.com> wrote:
>
>
>
> > On Jul 17, 2022, at 9:15 PM, Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
> >>> On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
> >>>
> >>> +nfsd4_encode_read_plus_segments(struct nfsd4_compoundres *resp,
> >>> +                           struct nfsd4_read *read,
> >>> +                           unsigned int *segments, u32 *eof)
> >>> {
> >>> -   struct file *file = read->rd_nf->nf_file;
> >>> -   loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
> >>> -   loff_t f_size = i_size_read(file_inode(file));
> >>> -   unsigned long count;
> >>> -   __be32 *p;
> >>> +   struct xdr_stream *xdr = resp->xdr;
> >>> +   unsigned int bufpos = xdr->buf->len;
> >>> +   u64 offset = read->rd_offset;
> >>> +   struct read_plus_segment segment;
> >>> +   enum data_content4 pagetype;
> >>> +   unsigned long maxcount;
> >>> +   unsigned int pagenum = 0;
> >>> +   unsigned int pagelen;
> >>> +   char *vpage, *p;
> >>> +   __be32 nfserr;
> >>>
> >>> -   if (data_pos == -ENXIO)
> >>> -           data_pos = f_size;
> >>> -   else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
> >>> -           return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
> >>> -   count = data_pos - read->rd_offset;
> >>> -
> >>> -   /* Content type, offset, byte count */
> >>> -   p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
> >>> -   if (!p)
> >>> +   /* enough space for a HOLE segment before we switch to the pages */
> >>> +   if (!xdr_reserve_space(xdr, 5 * XDR_UNIT))
> >>>             return nfserr_resource;
> >>> +   xdr_commit_encode(xdr);
> >>>
> >>> -   *p++ = htonl(NFS4_CONTENT_HOLE);
> >>> -   p = xdr_encode_hyper(p, read->rd_offset);
> >>> -   p = xdr_encode_hyper(p, count);
> >>> +   maxcount = min_t(unsigned long, read->rd_length,
> >>> +                    (xdr->buf->buflen - xdr->buf->len));
> >>>
> >>> -   *eof = (read->rd_offset + count) >= f_size;
> >>> -   *maxcount = min_t(unsigned long, count, *maxcount);
> >>> +   nfserr = nfsd4_read_plus_readv(resp, read, &maxcount, eof);
> >>> +   if (nfserr)
> >>> +           return nfserr;
> >>> +
> >>> +   while (maxcount > 0) {
> >>> +           vpage = xdr_buf_nth_page_address(xdr->buf, pagenum, &pagelen);
> >>> +           pagelen = min_t(unsigned int, pagelen, maxcount);
> >>> +           if (!vpage || pagelen == 0)
> >>> +                   break;
> >>> +           p = memchr_inv(vpage, 0, pagelen);
> >>
> >> I'm still not happy about touching every byte in each READ_PLUS
> >> payload. I think even though the rest of this work is merge-ready,
> >> this is a brute-force mechanism that's OK for a proof of concept
> >> but not appropriate for production-ready code.
> >
> > Seems like a step backwards as it defeats the benefit zero-copy read
> > IO paths on the server side....
>
> Tom Haynes' vision for READ_PLUS was to eventually replace the
> legacy READ operation. That means READ_PLUS(CONTENT_DATA) needs
> to be as fast and efficient as plain READ. (It would be great
> to use splice reads for CONTENT_DATA if we can!)

I remember Bruce thinking we could only use splice reads for the very
first segment if it's data, but that was a few years ago so I don't
know if anything has changed that would allow spliced reads for all
data.

>
> But I also thought the purpose of READ_PLUS was to help clients
> preserve unallocated extents in files during copy operations.
> An unallocated extent is not the same as an allocated extent
> that has zeroes written into it. IIUC this new logic does not
> distinguish between those two cases at all. (And please correct
> me if this is really not the goal of READ_PLUS).

I wasn't aware of this as a goal of READ_PLUS. As of right now, Linux
doesn't really have a way to punch holes into pagecache data, so we
end up needing to zero-fill on the client side during decoding.
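
(Roughly, that zero-fill on decode amounts to something like the following --
a simplified model, not the actual client xdr code:)

    #include <stdint.h>
    #include <string.h>
    #include <stddef.h>

    /*
     * The READ_PLUS call covered [read_offset, read_offset + buf_len) and the
     * server returned a HOLE segment at (seg_offset, seg_length).  With no way
     * to mark pagecache pages as "unallocated", the client just writes zeroes
     * into the part of the read buffer that the hole overlaps.
     */
    static void decode_hole_segment(unsigned char *buf, size_t buf_len,
    				uint64_t read_offset,
    				uint64_t seg_offset, uint64_t seg_length)
    {
    	uint64_t start, end;

    	if (seg_offset + seg_length <= read_offset)
    		return;				/* hole ends before the buffer */
    	start = seg_offset > read_offset ? seg_offset - read_offset : 0;
    	if (start >= buf_len)
    		return;				/* hole starts after the buffer */
    	end = seg_offset + seg_length - read_offset;
    	if (end > buf_len)
    		end = buf_len;
    	memset(buf + start, 0, end - start);
    }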

>
> I would like to retain precise detection of unallocated extents
> in files. Maybe SEEK_HOLE/SEEK_DATA is not how to do that, but
> my feeling is that checking for zero bytes is definitely not
> the way to do it.

Ok.
>
>
> >> I've cc'd linux-fsdevel to see if we can get some more ideas going
> >> and move this forward.
> >>
> >> Another thought I had was to support returning only one or two
> >> segments per reply. One CONTENT segment, one HOLE segment, or one
> >> of each. Would that be enough to prevent the issues around file
> >> updates racing with the construction of the reply?
> >
> > Before I can make any sort of useful suggestion, I need to have it
> > explained to me why we care if the underlying file mapping has
> > changed between the read of the data and the SEEK_HOLE trim check,
> > because it's not at all clear to me what problem this change is
> > actually solving.
>
> The cover letter for this series calls out generic/091 and
> generic/263 -- it mentioned both are failing with NFSv4.2. I've
> tried to reproduce failures here, but both passed.

Did you check that CONFIG_NFS_V4_2_READ_PLUS=y on the client? We had
it disabled due to these tests and the very, very long time servers
exporting btrfs take to read already-cached data (see btrfs sections
in the wiki page linked in the cover letter).

There is also this bugzilla documenting the problem:
https://bugzilla.kernel.org/show_bug.cgi?id=215673

>
> Anna, can you provide a root-cause analysis of what is failing
> in your testing? Maybe a reproducer for us to kick around?

The only reproducers I have are the xfstests mentioned. They fail
pretty reliably on my setup (linux nfsd exporting xfs).

Anna

> I'm guessing you might be encountering races because your
> usual test environment is virtual, so we need to understand
> how timing effects the results.
>
>
> --
> Chuck Lever
>
>
>
Anna Schumaker July 19, 2022, 8:46 p.m. UTC | #5
On Sun, Jul 17, 2022 at 9:16 PM Dave Chinner <david@fromorbit.com> wrote:
>
> On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
> > > On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
> > >
> > > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > >
> > > Rather than relying on the underlying filesystem to tell us where hole
> > > and data segments are through vfs_llseek(), let's instead do the hole
> > > compression ourselves. This has a few advantages over the old
> > > implementation:
> > >
> > > 1) A single call to the underlying filesystem through nfsd_readv() means
> > >   the file can't change from underneath us in the middle of encoding.
>
> Hi Anna,
>
> I'm assuming you mean the vfs_llseek(SEEK_HOLE) call at the start
> of nfsd4_encode_read_plus_data() that is used to trim the data that
> has already been read out of the file?

There is also the vfs_llseek(SEEK_DATA) call at the start of
nfsd4_encode_read_plus_hole(). They are used to determine the length
of the current hole or data segment.

>
> What's the problem with racing with a hole punch here? All it does
> is shorten the read data returned to match the new hole, so all it's
> doing is making the returned data "more correct".

The problem is we call vfs_llseek() potentially many times when
encoding a single reply to READ_PLUS. nfsd4_encode_read_plus() has a
loop where we alternate between hole and data segments until we've
encoded the requested number of bytes. My attempts at locking the file
have resulted in a deadlock since vfs_llseek() also locks the file, so
the file could change from underneath us during each iteration of the
loop.

>
> OTOH, if something allocates over a hole that the read filled with
> zeros, what's the problem with occasionally returning zeros as data?
> Regardless, if this has raced with a write to the file that filled
> that hole, we're already returning stale data/hole information to
> the client regardless of whether we trim it or not....
>
> i.e. I can't see a correctness or data integrity problem here that
> doesn't already exist, and I have my doubts that hole
> punching/filling racing with reads happens often enough to create a
> performance or bandwidth problem OTW. Hence I've really got no idea
> what the problem that needs to be solved here is.
>
> Can you explain what the symptoms of the problem a user would see
> that this change solves?

This fixes xfstests generic/091 and generic/263, along with this
reported bug: https://bugzilla.kernel.org/show_bug.cgi?id=215673
>
> > > 2) A single call to the underlying filesystem also means that the
> > >   underlying filesystem only needs to synchronize cached and on-disk
> > >   data one time instead of potentially many times, speeding up the reply.
>
> SEEK_HOLE/DATA doesn't require cached data to be sync'd to disk to
> be coherent - that's only a problem FIEMAP has (and syncing cached
> data doesn't fix the TOCTOU coherency issue!).  i.e. SEEK_HOLE/DATA
> will check the page cache for data if appropriate (e.g. unwritten
> disk extents may have data in memory over the top of them) instead
> of syncing data to disk.

For some reason, btrfs on virtual hardware has terrible performance
numbers when using vfs_llseek() with files that are already in the
server's cache. I think it had something to do with how they lock
extents, and some extra work that needs to be done if the file already
exists in the server's memory but it's been  a few years since I've
gone into their code to figure out where the slowdown is coming from.
See this section of my performance results wiki page:
https://wiki.linux-nfs.org/wiki/index.php/Read_Plus_May_2022#BTRFS_3

Anna

>
> > > 3) Hole support for filesystems that don't support SEEK_HOLE and SEEK_DATA
> >
> > Thanks for addressing my cosmetic comments! Looks good.
> >
> > I'm still not clear why NFSD needs to support filesystems that
> > do not support SEEK_HOLE/DATA. I'm guessing this is just a side
> > benefit of the memchr_inv approach below, not really a goal of
> > this overhaul?
>
> Don't other mechanisms in NFSv4 require SEEK_HOLE/SEEK_DATA support?
> i.e. supporting a useful implementation of NFS4_CONTENT_DATA/
> NFS4_CONTENT_HOLE across the wire?  Why doesn't the server-side
> implementation use memchr_inv() rather than allowing filesystems to
> optimise away the data read requirement for the operation?
>
> Or don't we care because what is returned to the client is *always*
> going to have races with whatever the client decides to do with
> the information? If that's true, then why do we care if
> nfsd4_encode_read_plus_data() may return some zeros instead of a
> shorter read and a longer hole?
>
> > One more comment below.
> >
> >
> > > I also included an optimization where we can cut down on the amount of
> > > memory being shifted around by doing the compression as (hole, data)
> > > pairs.
> > >
> > > Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > ---
> > > fs/nfsd/nfs4xdr.c | 219 +++++++++++++++++++++++++---------------------
> > > 1 file changed, 119 insertions(+), 100 deletions(-)
> > >
> > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > index 61b2aae81abb..df8289fce4ef 100644
> > > --- a/fs/nfsd/nfs4xdr.c
> > > +++ b/fs/nfsd/nfs4xdr.c
> > > @@ -4731,81 +4731,138 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
> > >     return nfserr;
> > > }
> > >
> > > +struct read_plus_segment {
> > > +   enum data_content4      rp_type;
> > > +   u64                     rp_offset;
> > > +   u64                     rp_length;
> > > +   unsigned int            rp_page_pos;
> > > +};
> > > +
> > > static __be32
> > > -nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
> > > -                       struct nfsd4_read *read,
> > > -                       unsigned long *maxcount, u32 *eof,
> > > -                       loff_t *pos)
> > > +nfsd4_read_plus_readv(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
> > > +                 unsigned long *maxcount, u32 *eof)
> > > {
> > >     struct xdr_stream *xdr = resp->xdr;
> > > -   struct file *file = read->rd_nf->nf_file;
> > > -   int starting_len = xdr->buf->len;
> > > -   loff_t hole_pos;
> > > -   __be32 nfserr;
> > > -   __be32 *p, tmp;
> > > -   __be64 tmp64;
> > > -
> > > -   hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> > > -   if (hole_pos > read->rd_offset)
> > > -           *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
> > > -   *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
> > > -
> > > -   /* Content type, offset, byte count */
> > > -   p = xdr_reserve_space(xdr, 4 + 8 + 4);
> > > -   if (!p)
> > > -           return nfserr_resource;
> > > +   unsigned int starting_len = xdr->buf->len;
> > > +   __be32 nfserr, zero = xdr_zero;
> > > +   unsigned int pad;
> > >
> > > +   /*
> > > +    * Reserve the maximum amount of space needed to craft a READ_PLUS
> > > +    * reply. The call to xdr_reserve_space_vec() switches us to the
> > > +    * xdr->pages, which we then read file data into before analyzing
> > > +    * the individual segments.
> > > +    */
> > >     read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
> > >     if (read->rd_vlen < 0)
> > >             return nfserr_resource;
> > >
> > > -   nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
> > > -                       resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
> > > +   nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, read->rd_nf->nf_file,
> > > +                       read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
> > > +                       maxcount, eof);
> > >     if (nfserr)
> > >             return nfserr;
> > > -   xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
> > > +   xdr_truncate_encode(xdr, starting_len + xdr_align_size(*maxcount));
> > >
> > > -   tmp = htonl(NFS4_CONTENT_DATA);
> > > -   write_bytes_to_xdr_buf(xdr->buf, starting_len,      &tmp,   4);
> > > -   tmp64 = cpu_to_be64(read->rd_offset);
> > > -   write_bytes_to_xdr_buf(xdr->buf, starting_len + 4,  &tmp64, 8);
> > > -   tmp = htonl(*maxcount);
> > > -   write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp,   4);
> > > -
> > > -   tmp = xdr_zero;
> > > -   write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
> > > -                          xdr_pad_size(*maxcount));
> > > +   pad = xdr_pad_size(*maxcount);
> > > +   write_bytes_to_xdr_buf(xdr->buf, starting_len + *maxcount, &zero, pad);
> > >     return nfs_ok;
> > > }
> > >
> > > +/**
> > > + * nfsd4_encode_read_plus_segment - Encode a single READ_PLUS segment
> > > + * @xdr: pointer to an xdr_stream
> > > + * @segment: pointer to a single segment
> > > + * @bufpos: xdr_stream offset to place the segment
> > > + * @segments: pointer to the total number of segments seen
> > > + *
> > > + * Performs surgery on the xdr_stream to compress out HOLE segments and
> > > + * to place DATA segments in the proper place.
> > > + */
> > > +static void
> > > +nfsd4_encode_read_plus_segment(struct xdr_stream *xdr,
> > > +                          struct read_plus_segment *segment,
> > > +                          unsigned int *bufpos, unsigned int *segments)
> > > +{
> > > +   struct xdr_buf *buf = xdr->buf;
> > > +
> > > +   xdr_encode_word(buf, *bufpos, segment->rp_type);
> > > +   xdr_encode_double(buf, *bufpos + XDR_UNIT, segment->rp_offset);
> > > +   *bufpos += 3 * XDR_UNIT;
> > > +
> > > +   if (segment->rp_type == NFS4_CONTENT_HOLE) {
> > > +           xdr_encode_double(buf, *bufpos, segment->rp_length);
> > > +           *bufpos += 2 * XDR_UNIT;
> > > +   } else {
> > > +           size_t align = xdr_align_size(segment->rp_length);
> > > +           xdr_encode_word(buf, *bufpos, segment->rp_length);
> > > +           if (*segments == 0)
> > > +                   xdr_buf_trim_head(buf, XDR_UNIT);
> > > +
> > > +           xdr_stream_move_subsegment(xdr,
> > > +                           buf->head[0].iov_len + segment->rp_page_pos,
> > > +                           *bufpos + XDR_UNIT, align);
> > > +           *bufpos += XDR_UNIT + align;
> > > +   }
> > > +
> > > +   *segments += 1;
> > > +}
> > > +
> > > static __be32
> > > -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
> > > -                       struct nfsd4_read *read,
> > > -                       unsigned long *maxcount, u32 *eof)
> > > +nfsd4_encode_read_plus_segments(struct nfsd4_compoundres *resp,
> > > +                           struct nfsd4_read *read,
> > > +                           unsigned int *segments, u32 *eof)
> > > {
> > > -   struct file *file = read->rd_nf->nf_file;
> > > -   loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
> > > -   loff_t f_size = i_size_read(file_inode(file));
> > > -   unsigned long count;
> > > -   __be32 *p;
> > > +   struct xdr_stream *xdr = resp->xdr;
> > > +   unsigned int bufpos = xdr->buf->len;
> > > +   u64 offset = read->rd_offset;
> > > +   struct read_plus_segment segment;
> > > +   enum data_content4 pagetype;
> > > +   unsigned long maxcount;
> > > +   unsigned int pagenum = 0;
> > > +   unsigned int pagelen;
> > > +   char *vpage, *p;
> > > +   __be32 nfserr;
> > >
> > > -   if (data_pos == -ENXIO)
> > > -           data_pos = f_size;
> > > -   else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
> > > -           return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
> > > -   count = data_pos - read->rd_offset;
> > > -
> > > -   /* Content type, offset, byte count */
> > > -   p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
> > > -   if (!p)
> > > +   /* enough space for a HOLE segment before we switch to the pages */
> > > +   if (!xdr_reserve_space(xdr, 5 * XDR_UNIT))
> > >             return nfserr_resource;
> > > +   xdr_commit_encode(xdr);
> > >
> > > -   *p++ = htonl(NFS4_CONTENT_HOLE);
> > > -   p = xdr_encode_hyper(p, read->rd_offset);
> > > -   p = xdr_encode_hyper(p, count);
> > > +   maxcount = min_t(unsigned long, read->rd_length,
> > > +                    (xdr->buf->buflen - xdr->buf->len));
> > >
> > > -   *eof = (read->rd_offset + count) >= f_size;
> > > -   *maxcount = min_t(unsigned long, count, *maxcount);
> > > +   nfserr = nfsd4_read_plus_readv(resp, read, &maxcount, eof);
> > > +   if (nfserr)
> > > +           return nfserr;
> > > +
> > > +   while (maxcount > 0) {
> > > +           vpage = xdr_buf_nth_page_address(xdr->buf, pagenum, &pagelen);
> > > +           pagelen = min_t(unsigned int, pagelen, maxcount);
> > > +           if (!vpage || pagelen == 0)
> > > +                   break;
> > > +           p = memchr_inv(vpage, 0, pagelen);
> >
> > I'm still not happy about touching every byte in each READ_PLUS
> > payload. I think even though the rest of this work is merge-ready,
> > this is a brute-force mechanism that's OK for a proof of concept
> > but not appropriate for production-ready code.
>
> Seems like a step backwards as it defeats the benefit zero-copy read
> IO paths on the server side....
>
> > I've cc'd linux-fsdevel to see if we can get some more ideas going
> > and move this forward.
> >
> > Another thought I had was to support returning only one or two
> > segments per reply. One CONTENT segment, one HOLE segment, or one
> > of each. Would that be enough to prevent the issues around file
> > updates racing with the construction of the reply?
>
> Before I can make any sort of useful suggestion, I need to have it
> explained to me why we care if the underlying file mapping has
> changed between the read of the data and the SEEK_HOLE trim check,
> because it's not at all clear to me what problem this change is
> actually solving.
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
Chuck Lever III July 19, 2022, 8:47 p.m. UTC | #6
> On Jul 19, 2022, at 4:24 PM, Anna Schumaker <anna@kernel.org> wrote:
> 
> On Tue, Jul 19, 2022 at 1:21 PM Chuck Lever III <chuck.lever@oracle.com> wrote:
>> 
>> 
>> 
>>> On Jul 17, 2022, at 9:15 PM, Dave Chinner <david@fromorbit.com> wrote:
>>> 
>>> On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
>>>>> On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
>>>>> 
>>>>> +nfsd4_encode_read_plus_segments(struct nfsd4_compoundres *resp,
>>>>> + struct nfsd4_read *read,
>>>>> + unsigned int *segments, u32 *eof)
>>>>> {
>>>>> - struct file *file = read->rd_nf->nf_file;
>>>>> - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
>>>>> - loff_t f_size = i_size_read(file_inode(file));
>>>>> - unsigned long count;
>>>>> - __be32 *p;
>>>>> + struct xdr_stream *xdr = resp->xdr;
>>>>> + unsigned int bufpos = xdr->buf->len;
>>>>> + u64 offset = read->rd_offset;
>>>>> + struct read_plus_segment segment;
>>>>> + enum data_content4 pagetype;
>>>>> + unsigned long maxcount;
>>>>> + unsigned int pagenum = 0;
>>>>> + unsigned int pagelen;
>>>>> + char *vpage, *p;
>>>>> + __be32 nfserr;
>>>>> 
>>>>> - if (data_pos == -ENXIO)
>>>>> - data_pos = f_size;
>>>>> - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
>>>>> - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
>>>>> - count = data_pos - read->rd_offset;
>>>>> -
>>>>> - /* Content type, offset, byte count */
>>>>> - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
>>>>> - if (!p)
>>>>> + /* enough space for a HOLE segment before we switch to the pages */
>>>>> + if (!xdr_reserve_space(xdr, 5 * XDR_UNIT))
>>>>> return nfserr_resource;
>>>>> + xdr_commit_encode(xdr);
>>>>> 
>>>>> - *p++ = htonl(NFS4_CONTENT_HOLE);
>>>>> - p = xdr_encode_hyper(p, read->rd_offset);
>>>>> - p = xdr_encode_hyper(p, count);
>>>>> + maxcount = min_t(unsigned long, read->rd_length,
>>>>> + (xdr->buf->buflen - xdr->buf->len));
>>>>> 
>>>>> - *eof = (read->rd_offset + count) >= f_size;
>>>>> - *maxcount = min_t(unsigned long, count, *maxcount);
>>>>> + nfserr = nfsd4_read_plus_readv(resp, read, &maxcount, eof);
>>>>> + if (nfserr)
>>>>> + return nfserr;
>>>>> +
>>>>> + while (maxcount > 0) {
>>>>> + vpage = xdr_buf_nth_page_address(xdr->buf, pagenum, &pagelen);
>>>>> + pagelen = min_t(unsigned int, pagelen, maxcount);
>>>>> + if (!vpage || pagelen == 0)
>>>>> + break;
>>>>> + p = memchr_inv(vpage, 0, pagelen);
>>>> 
>>>> I'm still not happy about touching every byte in each READ_PLUS
>>>> payload. I think even though the rest of this work is merge-ready,
>>>> this is a brute-force mechanism that's OK for a proof of concept
>>>> but not appropriate for production-ready code.
>>> 
>>> Seems like a step backwards as it defeats the benefit zero-copy read
>>> IO paths on the server side....
>> 
>> Tom Haynes' vision for READ_PLUS was to eventually replace the
>> legacy READ operation. That means READ_PLUS(CONTENT_DATA) needs
>> to be as fast and efficient as plain READ. (It would be great
>> to use splice reads for CONTENT_DATA if we can!)
> 
> I remember Bruce thinking we could only use splice reads for the very
> first segment if it's data, but that was a few years ago so I don't
> know if anything has changed that would allow spliced reads for all
> data.

That might be a limitation of the current NFSv4 READ implementation
rather than something we always have to live with. But yes, I think
it's true that splice read only works for the first READ in a
COMPOUND, and READ_PLUS CONTENT_DATA segments are a similar
situation.

Even so, IMO enabling splice read on the first segment would cover
important use cases, in particular the case when there are no holes.

'Twould be interesting to add a metric to see how often READ_PLUS
returns a hole. Just a thought.


>> But I also thought the purpose of READ_PLUS was to help clients
>> preserve unallocated extents in files during copy operations.
>> An unallocated extent is not the same as an allocated extent
>> that has zeroes written into it. IIUC this new logic does not
>> distinguish between those two cases at all. (And please correct
>> me if this is really not the goal of READ_PLUS).
> 
> I wasn't aware of this as a goal of READ_PLUS. As of right now, Linux
> doesn't really have a way to punch holes into pagecache data, so we
> end up needing to zero-fill on the client side during decoding.

Again, that might not always be the case? But OK, I'll table that.


>> I would like to retain precise detection of unallocated extents
>> in files. Maybe SEEK_HOLE/SEEK_DATA is not how to do that, but
>> my feeling is that checking for zero bytes is definitely not
>> the way to do it.
> 
> Ok.
>> 
>> 
>>>> I've cc'd linux-fsdevel to see if we can get some more ideas going
>>>> and move this forward.
>>>> 
>>>> Another thought I had was to support returning only one or two
>>>> segments per reply. One CONTENT segment, one HOLE segment, or one
>>>> of each. Would that be enough to prevent the issues around file
>>>> updates racing with the construction of the reply?
>>> 
>>> Before I can make any sort of useful suggestion, I need to have it
>>> explained to me why we care if the underlying file mapping has
>>> changed between the read of the data and the SEEK_HOLE trim check,
>>> because it's not at all clear to me what problem this change is
>>> actually solving.
>> 
>> The cover letter for this series calls out generic/091 and
>> generic/263 -- it mentioned both are failing with NFSv4.2. I've
>> tried to reproduce failures here, but both passed.
> 
> Did you check that CONFIG_NFS_V4_2_READ_PLUS=y on the client? We had
> it disabled due to these tests and the very, very long time servers
> exporting btrfs take to read already-cached data (see btrfs sections
> in the wiki page linked in the cover letter).

Confirmed: that variable is set in my client's booted kernel.


> There is also this bugzilla documenting the problem:
> https://bugzilla.kernel.org/show_bug.cgi?id=215673

OK, then let's add

Link: https://bugzilla.kernel.org/show_bug.cgi?id=215673

to this patch when you resubmit. (I used to use BugLink:
but I gather Linus doesn't like that tag name).


>> Anna, can you provide a root-cause analysis of what is failing
>> in your testing? Maybe a reproducer for us to kick around?
> 
> The only reproducers I have are the xfstests mentioned. They fail
> pretty reliably on my setup (linux nfsd exporting xfs).

Then, the first order of business is to understand why you
can reproduce the failure easily but I cannot. (Hrm, I think
I did try to reproduce on xfs, but I'll have to check).

I'd like to see READ_PLUS working well in NFSD, so I'll
try to be as helpful as I can. There will be a learning
curve ;-)


> Anna
> 
>> I'm guessing you might be encountering races because your
>> usual test environment is virtual, so we need to understand
>> how timing effects the results.
>> 
>> 
>> --
>> Chuck Lever

--
Chuck Lever
Matthew Wilcox July 19, 2022, 9:10 p.m. UTC | #7
On Tue, Jul 19, 2022 at 04:24:18PM -0400, Anna Schumaker wrote:
> On Tue, Jul 19, 2022 at 1:21 PM Chuck Lever III <chuck.lever@oracle.com> wrote:
> > But I also thought the purpose of READ_PLUS was to help clients
> > preserve unallocated extents in files during copy operations.
> > An unallocated extent is not the same as an allocated extent
> > that has zeroes written into it. IIUC this new logic does not
> > distinguish between those two cases at all. (And please correct
> > me if this is really not the goal of READ_PLUS).
> 
> I wasn't aware of this as a goal of READ_PLUS. As of right now, Linux
> doesn't really have a way to punch holes into pagecache data, so we
> end up needing to zero-fill on the client side during decoding.

I've proven myself unqualified to opine on how NFS should be doing
things in the past ... so let me see if I understand how NFS works
for this today.

Userspace issues a read(), the VFS allocates some pages to cache the
data and calls ->readahead() to get the filesystem to fill those pages.
NFS uses READ_PLUS to get the data and the server says "this is a hole,
no data for you", at which point NFS has to call memset() because the
page cache does not have the ability to represent holes?

If so, that pretty much matches how block filesystems work.  Except that
block filesystems know the "layout" of the file; whether they use iomap
or buffer_heads, they can know this without doing I/O (some filesystems
like ext2 delay knowing the layout of the file until an I/O happens,
but then they cache it).

So I think Linux is currently built on assuming the filesystem knows
where its holes are, rather than informing the page cache about its holes.
Could we do better?  Probably!  I'd be interested in seeing what happens
if we add support for "this page is in a hole" to the page cache.
I'd also be interested in seeing how things would change if we had
filesystems provide their extent information to the VFS and have the VFS
handle holes all by itself without troubling the filesystem.  It'd require
network filesystems to invalidate the VFS's knowledge of extents if
another client modifies the file.  I haven't thought about it deeply.
Dave Chinner July 19, 2022, 10:44 p.m. UTC | #8
On Tue, Jul 19, 2022 at 04:46:50PM -0400, Anna Schumaker wrote:
> On Sun, Jul 17, 2022 at 9:16 PM Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
> > > > On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
> > > >
> > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > >
> > > > Rather than relying on the underlying filesystem to tell us where hole
> > > > and data segments are through vfs_llseek(), let's instead do the hole
> > > > compression ourselves. This has a few advantages over the old
> > > > implementation:
> > > >
> > > > 1) A single call to the underlying filesystem through nfsd_readv() means
> > > >   the file can't change from underneath us in the middle of encoding.
> >
> > Hi Anna,
> >
> > I'm assuming you mean the vfs_llseek(SEEK_HOLE) call at the start
> > of nfsd4_encode_read_plus_data() that is used to trim the data that
> > has already been read out of the file?
> 
> There is also the vfs_llseek(SEEK_DATA) call at the start of
> nfsd4_encode_read_plus_hole(). They are used to determine the length
> of the current hole or data segment.
> 
> >
> > What's the problem with racing with a hole punch here? All it does
> > is shorten the read data returned to match the new hole, so all it's
> > doing is making the returned data "more correct".
> 
> The problem is we call vfs_llseek() potentially many times when
> encoding a single reply to READ_PLUS. nfsd4_encode_read_plus() has a
> loop where we alternate between hole and data segments until we've
> encoded the requested number of bytes. My attempts at locking the file
> have resulted in a deadlock since vfs_llseek() also locks the file, so
> the file could change from underneath us during each iteration of the
> loop.

So the problem being solved is that the current encoding is not
atomic, rather than trying to avoid any computational overhead of
multiple vfs_llseek calls (which are largely just the same extent
lookups as we do during the read call)?

The implementation just seems backwards to me - rather than reading
data and then trying to work out where the holes are, I suspect it
should be working out where the holes are and then reading the data.
This is how the IO path in filesystems works, so it would seem like a
no-brainer to try to leverage the infrastructure we already have to
do that.

The information is there and we have infrastructure that exposes it
to the IO path, it's just *underneath* the page cache and the page
cache destroys the information that it used to build the data it
returns to the NFSD.

IOWs, it seems to me that what READ_PLUS really wants is a "sparse
read operation" from the filesystem rather than the current "read
that fills holes with zeroes". i.e. a read operation that sets an
iocb flag like RWF_SPARSE_READ to tell the filesystem to trim the
read to just the ranges that contain data.

That way the read populates the page cache over a single contiguous
range of data and returns with the {offset, len} that spans the
range that is read and mapped. The caller can then read that region
out of the page cache and mark all the non-data regions as holes in
whatever manner they need to.

The iomap infrastructure that XFS and other filesystems use provides
this exact "map only what contains data" capability - an iomap tells
the page cache exactly what underlies the data range (hole, data,
unwritten extents, etc) in an efficient manner, so it wouldn't be a
huge stretch just to limit read IO ranges to those that contain only
DATA extents.

At this point READ_PLUS then just needs to iterate doing sparse
reads and recording the ranges that return data as a vector of some
kind that is then passed to the encoding function to encode it as
a sparse READ_PLUS data range....
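
To make that concrete, here's a minimal user-space sketch of the
"find the data ranges first, then read only those ranges" loop.
RWF_SPARSE_READ is only a proposal at this point, so SEEK_DATA/
SEEK_HOLE plus pread() stand in for it below - this illustrates the
shape of the iteration, not a kernel implementation:

/*
 * Sketch only: walk the data ranges of a file and read just those
 * ranges, recording {offset, len} pairs the way a READ_PLUS encoder
 * would consume them. The proposed RWF_SPARSE_READ flag doesn't
 * exist, so SEEK_DATA/SEEK_HOLE approximate it from user space.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	off_t end, data, hole;
	char *buf;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	end = lseek(fd, 0, SEEK_END);

	for (data = lseek(fd, 0, SEEK_DATA); data >= 0 && data < end;
	     data = lseek(fd, hole, SEEK_DATA)) {
		/*
		 * End of this data range; a real server would also cap
		 * the length at the READ_PLUS request count.
		 */
		hole = lseek(fd, data, SEEK_HOLE);
		buf = malloc(hole - data);
		if (!buf || pread(fd, buf, hole - data, data) < 0)
			break;
		/* hand {data, hole - data, buf} to the encoder here */
		printf("DATA segment: offset %lld len %lld\n",
		       (long long)data, (long long)(hole - data));
		free(buf);
	}
	close(fd);
	return 0;
}

The point is the ordering: the extent information is consulted before
the data is read, with a real RWF_SPARSE_READ doing both steps
atomically inside the filesystem rather than as two separate syscalls
as above.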

> > OTOH, if something allocates over a hole that the read filled with
> > zeros, what's the problem with occasionally returning zeros as data?
> > Regardless, if this has raced with a write to the file that filled
> > that hole, we're already returning stale data/hole information to
> > the client regardless of whether we trim it or not....
> >
> > i.e. I can't see a correctness or data integrity problem here that
> > doesn't already exist, and I have my doubts that hole
> > punching/filling racing with reads happens often enough to create a
> > performance or bandwidth problem OTW. Hence I've really got no idea
> > what the problem that needs to be solved here is.
> >
> > Can you explain what the symptoms of the problem a user would see
> > that this change solves?
> 
> This fixes xfstests generic/091 and generic/263, along with this
> reported bug: https://bugzilla.kernel.org/show_bug.cgi?id=215673

Oh, that bug is mixing mmap() reads and writes with direct IO reads
and writes. We don't guarantee data integrity and coherency when
applications do that, and a multi-part buffered read operation isn't
going to make that any better...

> > > > 2) A single call to the underlying filestem also means that the
> > > >   underlying filesystem only needs to synchronize cached and on-disk
> > > >   data one time instead of potentially many speeding up the reply.
> >
> > SEEK_HOLE/DATA doesn't require cached data to be sync'd to disk to
> > be coherent - that's only a problem FIEMAP has (and syncing cached
> > data doesn't fix the TOCTOU coherency issue!).  i.e. SEEK_HOLE/DATA
> > will check the page cache for data if appropriate (e.g. unwritten
> > disk extents may have data in memory over the top of them) instead
> > of syncing data to disk.
> 
> For some reason, btrfs on virtual hardware has terrible performance
> numbers when using vfs_llseek() with files that are already in the
> server's cache.

IIRC, btrfs has extent lookup scalability problems, so you're going
to see this sort of issue with SEEK_HOLE/DATA on large fragmented
cached files in btrfs, especially if things like compression are
enabled. See this recent btrfs bug report, for example:

https://lore.kernel.org/linux-fsdevel/Yr1QwVW+sHWlAqKj@atmark-techno.com/

The fiemap overhead in the second cached read is effectively what
you'll also see with SEEK_HOLE/DATA as they are similar extent
lookup operations.

> I think it had something to do with how they lock
> extents, and some extra work that needs to be done if the file already
> exists in the server's memory but it's been  a few years since I've
> gone into their code to figure out where the slowdown is coming from.
> See this section of my performance results wiki page:
> https://wiki.linux-nfs.org/wiki/index.php/Read_Plus_May_2022#BTRFS_3

Yup, that's pretty much it - a 1.2GB/s -> 100MB/s perf drop on cached
reads when READ_PLUS is enabled is in line with the above bug
report.

This really is a btrfs extent lookup issue, not a problem with
SEEK_HOLE/DATA, but I think it's a moot point because I suspect that
sparse read capability in the FS IO path would be a much better
solution to this problem than trying to use SEEK_HOLE/DATA to
reconstruct file sparseness post-read...

Cheers,

Dave.
Dave Chinner July 19, 2022, 11:18 p.m. UTC | #9
On Tue, Jul 19, 2022 at 05:21:21PM +0000, Chuck Lever III wrote:
> 
> 
> > On Jul 17, 2022, at 9:15 PM, Dave Chinner <david@fromorbit.com> wrote:
> > 
> > On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
> >>> On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
> >>> 
> >>> +nfsd4_encode_read_plus_segments(struct nfsd4_compoundres *resp,
> >>> +				struct nfsd4_read *read,
> >>> +				unsigned int *segments, u32 *eof)
> >>> {
> >>> -	struct file *file = read->rd_nf->nf_file;
> >>> -	loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
> >>> -	loff_t f_size = i_size_read(file_inode(file));
> >>> -	unsigned long count;
> >>> -	__be32 *p;
> >>> +	struct xdr_stream *xdr = resp->xdr;
> >>> +	unsigned int bufpos = xdr->buf->len;
> >>> +	u64 offset = read->rd_offset;
> >>> +	struct read_plus_segment segment;
> >>> +	enum data_content4 pagetype;
> >>> +	unsigned long maxcount;
> >>> +	unsigned int pagenum = 0;
> >>> +	unsigned int pagelen;
> >>> +	char *vpage, *p;
> >>> +	__be32 nfserr;
> >>> 
> >>> -	if (data_pos == -ENXIO)
> >>> -		data_pos = f_size;
> >>> -	else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
> >>> -		return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
> >>> -	count = data_pos - read->rd_offset;
> >>> -
> >>> -	/* Content type, offset, byte count */
> >>> -	p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
> >>> -	if (!p)
> >>> +	/* enough space for a HOLE segment before we switch to the pages */
> >>> +	if (!xdr_reserve_space(xdr, 5 * XDR_UNIT))
> >>> 		return nfserr_resource;
> >>> +	xdr_commit_encode(xdr);
> >>> 
> >>> -	*p++ = htonl(NFS4_CONTENT_HOLE);
> >>> -	p = xdr_encode_hyper(p, read->rd_offset);
> >>> -	p = xdr_encode_hyper(p, count);
> >>> +	maxcount = min_t(unsigned long, read->rd_length,
> >>> +			 (xdr->buf->buflen - xdr->buf->len));
> >>> 
> >>> -	*eof = (read->rd_offset + count) >= f_size;
> >>> -	*maxcount = min_t(unsigned long, count, *maxcount);
> >>> +	nfserr = nfsd4_read_plus_readv(resp, read, &maxcount, eof);
> >>> +	if (nfserr)
> >>> +		return nfserr;
> >>> +
> >>> +	while (maxcount > 0) {
> >>> +		vpage = xdr_buf_nth_page_address(xdr->buf, pagenum, &pagelen);
> >>> +		pagelen = min_t(unsigned int, pagelen, maxcount);
> >>> +		if (!vpage || pagelen == 0)
> >>> +			break;
> >>> +		p = memchr_inv(vpage, 0, pagelen);
> >> 
> >> I'm still not happy about touching every byte in each READ_PLUS
> >> payload. I think even though the rest of this work is merge-ready,
> >> this is a brute-force mechanism that's OK for a proof of concept
> >> but not appropriate for production-ready code.
> > 
> > Seems like a step backwards as it defeats the benefit zero-copy read
> > IO paths on the server side....
> 
> Tom Haynes' vision for READ_PLUS was to eventually replace the
> legacy READ operation. That means READ_PLUS(CONTENT_DATA) needs
> to be as fast and efficient as plain READ. (It would be great
> to use splice reads for CONTENT_DATA if we can!)
> 
> But I also thought the purpose of READ_PLUS was to help clients
> preserve unallocated extents in files during copy operations.

That's not a good idea. Userspace can't rely on the file layout
remaining unchanged while the copy is in progress. It's the
fundamental flaw in cp using FIEMAP - it will miss data that is in
memory over unwritten extents because it thinks that unwritten
extents are holes. Fundamentally, extent data is stale the moment the
filesystem inode is unlocked by the IO path, so you cannot rely on
it being accurate and correct anywhere but deep in the filesystem
implementation itself while the data is being read from or written
to the extent.

SEEK_HOLE/DATA is the best compromise we have for that, as it
considers the data ranges in the file, not the on-disk state.
However, if the extents are remote (i.e. in the case of NFS) then
there can still be massive TOCTOU race conditions in using
SEEK_HOLE/DATA at the client for determining the sparse regions of
the data - the client needs to hold an exclusive delegation on the
source file it is copying to ensure no other client changes the
layout while it is trying to perform the sparse copy....

Essentially, there is no safe way to do sparse copies short of
a filesystem having native offload support for either FICLONERANGE
(reflink) or copy_file_range() that allows the filesystem to make an
atomic duplicate of the source file....

> An unallocated extent is not the same as an allocated extent
> that has zeroes written into it.

Yes, but it is also not the same as an unwritten extent, which
is an allocated extent that has a special "unwritten" flag in it.
SEEK_HOLE/DATA will present these as holes (i.e. zero
filled data), unless there is dirty data in memory over it in which case
they are DATA.

This is the key distinction between FIEMAP and SEEK_HOLE/DATA -
FIEMAP reports purely the on-disk filesystem extent state, while
SEEK_HOLE/DATA returns information about the *contents* of the file.

IOWs, if you are thinking about READ_PLUS in terms of the underlying
file layout, then you are thinking about it incorrectly. READ_PLUS
encodes information about the data it returns, not the layout of the
file the data came from.

> IIUC this new logic does not
> distinguish between those two cases at all. (And please correct
> me if this is really not the goal of READ_PLUS).

From 15 years ago when I was last deeply involved in NFS, the whole
discussion around eventual NFSv4 support for sparse reads was about
how to achieve server and network bandwidth reduction for really
large sparse files. e.g. sparse matrix files used in HPC might be
TBs in size, but are almost entirely zeros. Shipping TBs of zeros to
each of the thousands of clients in the HPC cluster is not in any
way efficient...

And, quite frankly, when you have large sparse read-only source
files like this that might be accessed by thousands of clients, the
last thing you want the server to be doing is populating the page
cache with TBs of zeros just so it can read the zeroes to punch
them out of the encoded response to every read operation.

> I would like to retain precise detection of unallocated extents
> in files. Maybe SEEK_HOLE/SEEK_DATA is not how to do that, but
> my feeling is that checking for zero bytes is definitely not
> the way to do it.

SEEK_HOLE/DATA is the only way to do this "safely" in a generic
manner. But as I mentioned in my response to Anna, it seems like
entirely the wrong way to go about implementing efficient sparse
reads that reflect the state of the data as accurately as using
memchr_inv() after the fact to punch out zeroed holes in the data...

Cheers,

Dave.
Chuck Lever III July 20, 2022, 1:26 a.m. UTC | #10
> On Jul 19, 2022, at 6:44 PM, Dave Chinner <david@fromorbit.com> wrote:
> 
> On Tue, Jul 19, 2022 at 04:46:50PM -0400, Anna Schumaker wrote:
>> On Sun, Jul 17, 2022 at 9:16 PM Dave Chinner <david@fromorbit.com> wrote:
>>> 
>>> On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
>>>>> On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
>>>>> 
>>>>> From: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>>>> 
>>>>> Rather than relying on the underlying filesystem to tell us where hole
>>>>> and data segments are through vfs_llseek(), let's instead do the hole
>>>>> compression ourselves. This has a few advantages over the old
>>>>> implementation:
>>>>> 
>>>>> 1) A single call to the underlying filesystem through nfsd_readv() means
>>>>>  the file can't change from underneath us in the middle of encoding.
>>> 
>>> Hi Anna,
>>> 
>>> I'm assuming you mean the vfs_llseek(SEEK_HOLE) call at the start
>>> of nfsd4_encode_read_plus_data() that is used to trim the data that
>>> has already been read out of the file?
>> 
>> There is also the vfs_llseek(SEEK_DATA) call at the start of
>> nfsd4_encode_read_plus_hole(). They are used to determine the length
>> of the current hole or data segment.
>> 
>>> 
>>> What's the problem with racing with a hole punch here? All it does
>>> is shorten the read data returned to match the new hole, so all it's
>>> doing is making the returned data "more correct".
>> 
>> The problem is we call vfs_llseek() potentially many times when
>> encoding a single reply to READ_PLUS. nfsd4_encode_read_plus() has a
>> loop where we alternate between hole and data segments until we've
>> encoded the requested number of bytes. My attempts at locking the file
>> have resulted in a deadlock since vfs_llseek() also locks the file, so
>> the file could change from underneath us during each iteration of the
>> loop.
> 
> So the problem being solved is that the current encoding is not
> atomic, rather than trying to avoid any computational overhead of
> multiple vfs_llseek calls (which are largely just the same extent
> lookups as we do during the read call)?

Reviewing [1] and [2] I don't find any remarks about atomicity
guarantees. If a client needs an uncontended view of a file's
data, it's expected to fence other accessors via an OPEN(deny)
or LOCK operation, or serialize the requests itself.


> The implementation just seems backwards to me - rather than reading
> data and then trying to work out where the holes are, I suspect it
> should be working out where the holes are and then reading the data.
> This is how the IO path in filesystems work, so it would seem like a
> no-brainer to try to leverage the infrastructure we already have to
> do that.
> 
> The information is there and we have infrastructure that exposes it
> to the IO path, it's just *underneath* the page cache and the page
> cache destroys the information that it used to build the data it
> returns to the NFSD.
> 
> IOWs, it seems to me that what READ_PLUS really wants is a "sparse
> read operation" from the filesystem rather than the current "read
> that fills holes with zeroes". i.e. a read operation that sets an
> iocb flag like RWF_SPARSE_READ to tell the filesystem to trim the
> read to just the ranges that contain data.
> 
> That way the read populates the page cache over a single contiguous
> range of data and returns with the {offset, len} that spans the
> range that is read and mapped. The caller can then read that region
> out of the page cache and mark all the non-data regions as holes in
> whatever manner they need to.
> 
> The iomap infrastructure that XFS and other filesystems use provide
> this exact "map only what contains data" capability - an iomap tells
> the page cache exactly what underlies the data range (hole, data,
> unwritten extents, etc) in an efficient manner, so it wouldn't be a
> huge stretch just to limit read IO ranges to those that contain only
> DATA extents.
> 
> At this point READ_PLUS then just needs to iterate doing sparse
> reads and recording the ranges that return data as vector of some
> kind that is then passes to the encoding function to encode it as
> a sparse READ_PLUS data range....

The iomap approach seems sensible to me and covers the two basic
usage scenarios:

- Large sparse files, where we want to conserve both network
  bandwidth and client (and intermediate) cache occupancy.
  These are best served by exposing data and holes.

- Everyday files that are relatively small and generally will
contain few, if any, holes. These are best served by using
  a splice read (where possible) -- that is, READ_PLUS in this
  case should work exactly like READ.

My impression of client implementations is that, a priori,
a client does not know whether a file contains holes or not,
but will probably always use READ_PLUS and let the server
make the choice for it.

Now how does the server make that choice? Is there an attribute
bit that indicates when a file should be treated as sparse? Can
we assume that immutable files (or compressed files) should
always be treated as sparse? Alternately, the server might use
the file's data : hole ratio.


--
Chuck Lever

[1] https://datatracker.ietf.org/doc/html/rfc7862#section-6

[2] https://datatracker.ietf.org/doc/html/rfc5661#section-18.22
Dave Chinner July 20, 2022, 2:36 a.m. UTC | #11
On Wed, Jul 20, 2022 at 01:26:13AM +0000, Chuck Lever III wrote:
> 
> 
> > On Jul 19, 2022, at 6:44 PM, Dave Chinner <david@fromorbit.com> wrote:
> > 
> > On Tue, Jul 19, 2022 at 04:46:50PM -0400, Anna Schumaker wrote:
> >> On Sun, Jul 17, 2022 at 9:16 PM Dave Chinner <david@fromorbit.com> wrote:
> >>> 
> >>> On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
> >>>>> On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
> >>>>> 
> >>>>> From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> >>>>> 
> >>>>> Rather than relying on the underlying filesystem to tell us where hole
> >>>>> and data segments are through vfs_llseek(), let's instead do the hole
> >>>>> compression ourselves. This has a few advantages over the old
> >>>>> implementation:
> >>>>> 
> >>>>> 1) A single call to the underlying filesystem through nfsd_readv() means
> >>>>>  the file can't change from underneath us in the middle of encoding.
> >>> 
> >>> Hi Anna,
> >>> 
> >>> I'm assuming you mean the vfs_llseek(SEEK_HOLE) call at the start
> >>> of nfsd4_encode_read_plus_data() that is used to trim the data that
> >>> has already been read out of the file?
> >> 
> >> There is also the vfs_llseek(SEEK_DATA) call at the start of
> >> nfsd4_encode_read_plus_hole(). They are used to determine the length
> >> of the current hole or data segment.
> >> 
> >>> 
> >>> What's the problem with racing with a hole punch here? All it does
> >>> is shorten the read data returned to match the new hole, so all it's
> >>> doing is making the returned data "more correct".
> >> 
> >> The problem is we call vfs_llseek() potentially many times when
> >> encoding a single reply to READ_PLUS. nfsd4_encode_read_plus() has a
> >> loop where we alternate between hole and data segments until we've
> >> encoded the requested number of bytes. My attempts at locking the file
> >> have resulted in a deadlock since vfs_llseek() also locks the file, so
> >> the file could change from underneath us during each iteration of the
> >> loop.
> > 
> > So the problem being solved is that the current encoding is not
> > atomic, rather than trying to avoid any computational overhead of
> > multiple vfs_llseek calls (which are largely just the same extent
> > lookups as we do during the read call)?
> 
> Reviewing [1] and [2] I don't find any remarks about atomicity
> guarantees. If a client needs an uncontended view of a file's
> data, it's expected to fence other accessors via a OPEN(deny)
> or LOCK operation, or serialize the requests itself.

You've got the wrong "atomicity" scope :)

What I was talking about is the internal server side data operation
atomicity. That is, what is returned from the read to the READ_PLUS
code is not atomic w.r.t. the vfs_llseek() calls that are then used to
determine where the holes in the data returned by the read are.

Hence after the read has returned data to READ_PLUS, something else
can modify the data in the file (e.g. filling a hole or punching a
new one out) and then the ranges vfs_llseek() returns to READ_PLUS
do not match the data that it has in its local buffer.

i.e. to do what the READ_PLUS operation is doing now, it would
somehow need to ensure no modifications can be made between the read
starting and the last vfs_llseek() call completing. IOWs, they need
to be performed as an atomic operation, not as a set of
independently locked (or unlocked!) operations as they are now.

> > The implementation just seems backwards to me - rather than reading
> > data and then trying to work out where the holes are, I suspect it
> > should be working out where the holes are and then reading the data.
> > This is how the IO path in filesystems work, so it would seem like a
> > no-brainer to try to leverage the infrastructure we already have to
> > do that.
> > 
> > The information is there and we have infrastructure that exposes it
> > to the IO path, it's just *underneath* the page cache and the page
> > cache destroys the information that it used to build the data it
> > returns to the NFSD.
> > 
> > IOWs, it seems to me that what READ_PLUS really wants is a "sparse
> > read operation" from the filesystem rather than the current "read
> > that fills holes with zeroes". i.e. a read operation that sets an
> > iocb flag like RWF_SPARSE_READ to tell the filesystem to trim the
> > read to just the ranges that contain data.
> > 
> > That way the read populates the page cache over a single contiguous
> > range of data and returns with the {offset, len} that spans the
> > range that is read and mapped. The caller can then read that region
> > out of the page cache and mark all the non-data regions as holes in
> > whatever manner they need to.
> > 
> > The iomap infrastructure that XFS and other filesystems use provide
> > this exact "map only what contains data" capability - an iomap tells
> > the page cache exactly what underlies the data range (hole, data,
> > unwritten extents, etc) in an efficient manner, so it wouldn't be a
> > huge stretch just to limit read IO ranges to those that contain only
> > DATA extents.
> > 
> > At this point READ_PLUS then just needs to iterate doing sparse
> > reads and recording the ranges that return data as vector of some
> > kind that is then passes to the encoding function to encode it as
> > a sparse READ_PLUS data range....
> 
> The iomap approach

Not actually what I proposed - I'm suggesting a new kiocb flag that
changes what the read passed to the filesystem does. My comments
about iomap are that this infrastructure already provides the extent
range query mechanisms that allow us to efficiently perform such
"restricted range" IO operations.

> seems sensible to me and covers the two basic
> usage scenarios:
> 
> - Large sparse files, where we want to conserve both network
>   bandwidth and client (and intermediate) cache occupancy.
>   These are best served by exposing data and holes.

*nod*

> - Everyday files that are relatively small and generally will
>   continue few, if any, holes. These are best served by using
>   a splice read (where possible) -- that is, READ_PLUS in this
>   case should work exactly like READ.

*nod*

> My impression of client implementations is that, a priori,
> a client does not know whether a file contains holes or not,
> but will probably always use READ_PLUS and let the server
> make the choice for it.

*nod*

> Now how does the server make that choice? Is there an attribute
> bit that indicates when a file should be treated as sparse? Can
> we assume that immutable files (or compressed files) should
> always be treated as sparse? Alternately, the server might use
> the file's data : hole ratio.

None of the above. The NFS server has no business knowing intimate
details about how the filesystem has laid out the file. All it cares
about is ranges containing data and ranges that have no data (holes).

If the filesystem can support a sparse read, it returns sparse
ranges containing data to the NFS server. If the filesystem can't
support it, or its internal file layout doesn't allow for efficient
hole/data discrimination, then it can just return the entire read
range.

Alternatively, in this latter case, the filesystem could call a
generic "sparse read" implementation that runs memchr_inv() to find
the first data range to return. Then the NFS server doesn't have to
code things differently, filesystems don't need to advertise
support for sparse reads, etc., because every filesystem could
support sparse reads.

The only difference is that some filesystems will be much more
efficient and faster at it than others. We already see that sort
of thing with btrfs and seek hole/data on large cached files so I
don't see "filesystems perform differently" as a problem here...

Cheers,

Dave.
Chuck Lever III July 20, 2022, 4:18 a.m. UTC | #12
> On Jul 19, 2022, at 10:36 PM, Dave Chinner <david@fromorbit.com> wrote:
> 
> On Wed, Jul 20, 2022 at 01:26:13AM +0000, Chuck Lever III wrote:
>> 
>> 
>>> On Jul 19, 2022, at 6:44 PM, Dave Chinner <david@fromorbit.com> wrote:
>>> 
>>> On Tue, Jul 19, 2022 at 04:46:50PM -0400, Anna Schumaker wrote:
>>>> On Sun, Jul 17, 2022 at 9:16 PM Dave Chinner <david@fromorbit.com> wrote:
>>>>> 
>>>>> On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
>>>>>>> On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
>>>>>>> 
>>>>>>> From: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>>>>>> 
>>>>>>> Rather than relying on the underlying filesystem to tell us where hole
>>>>>>> and data segments are through vfs_llseek(), let's instead do the hole
>>>>>>> compression ourselves. This has a few advantages over the old
>>>>>>> implementation:
>>>>>>> 
>>>>>>> 1) A single call to the underlying filesystem through nfsd_readv() means
>>>>>>> the file can't change from underneath us in the middle of encoding.
>>>>> 
>>>>> Hi Anna,
>>>>> 
>>>>> I'm assuming you mean the vfs_llseek(SEEK_HOLE) call at the start
>>>>> of nfsd4_encode_read_plus_data() that is used to trim the data that
>>>>> has already been read out of the file?
>>>> 
>>>> There is also the vfs_llseek(SEEK_DATA) call at the start of
>>>> nfsd4_encode_read_plus_hole(). They are used to determine the length
>>>> of the current hole or data segment.
>>>> 
>>>>> 
>>>>> What's the problem with racing with a hole punch here? All it does
>>>>> is shorten the read data returned to match the new hole, so all it's
>>>>> doing is making the returned data "more correct".
>>>> 
>>>> The problem is we call vfs_llseek() potentially many times when
>>>> encoding a single reply to READ_PLUS. nfsd4_encode_read_plus() has a
>>>> loop where we alternate between hole and data segments until we've
>>>> encoded the requested number of bytes. My attempts at locking the file
>>>> have resulted in a deadlock since vfs_llseek() also locks the file, so
>>>> the file could change from underneath us during each iteration of the
>>>> loop.
>>> 
>>> So the problem being solved is that the current encoding is not
>>> atomic, rather than trying to avoid any computational overhead of
>>> multiple vfs_llseek calls (which are largely just the same extent
>>> lookups as we do during the read call)?
>> 
>> Reviewing [1] and [2] I don't find any remarks about atomicity
>> guarantees. If a client needs an uncontended view of a file's
>> data, it's expected to fence other accessors via a OPEN(deny)
>> or LOCK operation, or serialize the requests itself.
> 
> You've got the wrong "atomicity" scope :)
> 
> What I was talking about is the internal server side data operation
> atomicity. that is, what is returned from the read to the READ_PLUS
> code is not atomic w.r.t. the vfs_llseek() that are then used to
> determine where there holes in the data returned by the read are.
> 
> Hence after the read has returned data to READ_PLUS, something else
> can modify the data in the file (e.g. filling a hole or punching a
> new one out) and then the ranges vfs_llseek() returns to READ_PLUS
> does not match the data that is has in it's local buffer.

Architecturally, with the NFS protocol, the client and the application
running there are responsible for stopping "something else [from]
modifying the data in the file." NFS operations in and of themselves
are not usually atomic in that respect.

A READ operation has the same issue as READ_PLUS: a hole punch can
remove data from a file while the server is constructing and
encoding a READ reply, unless the application on the client has
taken the trouble to block foreign file modifications.


> i.e. to do what the READ_PLUS operation is doing now, it would
> somehow need to ensure no modifications can be made between the read
> starting and the last vfs_llseek() call completing. IOWs, they need
> to be performed as an atomic operation, not as a set of
> independently locked (or unlocked!) operations as they are now.

There is nothing making that guarantee on the server, and as I
said, there is nothing I've found in the specs that requires that
level of atomicity from a single READ or READ_PLUS operation.

Maybe I don't understand what you mean by "what the READ_PLUS
operation is doing now"?


>>> The implementation just seems backwards to me - rather than reading
>>> data and then trying to work out where the holes are, I suspect it
>>> should be working out where the holes are and then reading the data.
>>> This is how the IO path in filesystems work, so it would seem like a
>>> no-brainer to try to leverage the infrastructure we already have to
>>> do that.
>>> 
>>> The information is there and we have infrastructure that exposes it
>>> to the IO path, it's just *underneath* the page cache and the page
>>> cache destroys the information that it used to build the data it
>>> returns to the NFSD.
>>> 
>>> IOWs, it seems to me that what READ_PLUS really wants is a "sparse
>>> read operation" from the filesystem rather than the current "read
>>> that fills holes with zeroes". i.e. a read operation that sets an
>>> iocb flag like RWF_SPARSE_READ to tell the filesystem to trim the
>>> read to just the ranges that contain data.
>>> 
>>> That way the read populates the page cache over a single contiguous
>>> range of data and returns with the {offset, len} that spans the
>>> range that is read and mapped. The caller can then read that region
>>> out of the page cache and mark all the non-data regions as holes in
>>> whatever manner they need to.
>>> 
>>> The iomap infrastructure that XFS and other filesystems use provide
>>> this exact "map only what contains data" capability - an iomap tells
>>> the page cache exactly what underlies the data range (hole, data,
>>> unwritten extents, etc) in an efficient manner, so it wouldn't be a
>>> huge stretch just to limit read IO ranges to those that contain only
>>> DATA extents.
>>> 
>>> At this point READ_PLUS then just needs to iterate doing sparse
>>> reads and recording the ranges that return data as vector of some
>>> kind that is then passes to the encoding function to encode it as
>>> a sparse READ_PLUS data range....
>> 
>> The iomap approach
> 
> Not actually what I proposed - I'm suggesting a new kiocb flag that
> changes what the read passed to the filesystem does. My comments
> about iomap are that this infrastructure already provides the extent
> range query mechanisms that allow us to efficiently perform such
> "restricted range" IO operations.
> 
>> seems sensible to me and covers the two basic
>> usage scenarios:
>> 
>> - Large sparse files, where we want to conserve both network
>>  bandwidth and client (and intermediate) cache occupancy.
>>  These are best served by exposing data and holes.
> 
> *nod*
> 
>> - Everyday files that are relatively small and generally will
>>  continue few, if any, holes. These are best served by using
>>  a splice read (where possible) -- that is, READ_PLUS in this
>>  case should work exactly like READ.
> 
> *nod*
> 
>> My impression of client implementations is that, a priori,
>> a client does not know whether a file contains holes or not,
>> but will probably always use READ_PLUS and let the server
>> make the choice for it.
> 
> *nod*
> 
>> Now how does the server make that choice? Is there an attribute
>> bit that indicates when a file should be treated as sparse? Can
>> we assume that immutable files (or compressed files) should
>> always be treated as sparse? Alternately, the server might use
>> the file's data : hole ratio.
> 
> None of the above. The NFS server has no business knowing intimate
> details about how the filesystem has laid out the file. All it cares
> about ranges containing data and ranges that have no data (holes).

That would be the case if we want nothing more than impermeable
software layering. That's nice for software developers, but
maybe not of much benefit to average users.

I see no efficiency benefit, for example, if a 10MB object file
has 512 bytes of zeroes at a convenient offset and the server
returns that as DATA/HOLE/DATA. The amount of extra work it has
to do to make that happen results in the same latencies as
transmitting 512 extra bytes on GbE. It might be even worse on
faster network fabrics.

On fast networks, the less the server's host CPU has to be
involved in doing the READ, the better it scales. It's
better to set up the I/O and step out of the way; use zero
touch as much as possible.

Likewise on the client: it might receive a CONTENT_HOLE, but
then its CPU has to zero out that range, with all the memory
and cache activity that entails. For small holes, that's going
to be a lot of memset(0). If the server returns holes only
when they are large, then the client can use more efficient
techniques (like marking page cache pages or using ZERO_PAGE).

On networks with large bandwidth-latency products, however,
it makes sense to trade as much server and client CPU and
memory for transferred bytes as you can.


The mechanism that handles sparse files needs to be distinct
from the policy of when to return more than a single
CONTENT_DATA segment, since one of our goals is to ensure
that READ_PLUS performs and scales as well as READ on common
workloads (ie, not HPC / large sparse file workloads).


> If the filesystem can support a sparse read, it returns sparse
> ranges containing data to the NFS server. If the filesystem can't
> support it, or it's internal file layout doesn't allow for efficient
> hole/data discrimination, then it can just return the entire read
> range.
> 
> Alternatively, in this latter case, the filesystem could call a
> generic "sparse read" implementation that runs memchr_inv() to find
> the first data range to return. Then the NFS server doesn't have to
> code things differently, filesystems don't need to advertise
> support for sparse reads, etc because every filesystem could
> support sparse reads.
> 
> The only difference is that some filesystems will be much more
> efficient and faster at it than others. We already see that sort
> of thing with btrfs and seek hole/data on large cached files so I
> don't see "filesystems perform differently" as a problem here...

The problem with that approach is that it will result in
performance regressions on NFSv4.2 with exports that reside
on underperforming filesystem types. We need READ_PLUS to
perform as well as READ so there is no regression between
NFSv4.2 without and with READ_PLUS, and no regression when
migrating from NFSv4.1 to NFSv4.2 with READ_PLUS enabled.


--
Chuck Lever
Jeff Layton July 20, 2022, 12:55 p.m. UTC | #13
On Wed, 2022-07-20 at 12:36 +1000, Dave Chinner wrote:
> On Wed, Jul 20, 2022 at 01:26:13AM +0000, Chuck Lever III wrote:
> > 
> > 
> > > On Jul 19, 2022, at 6:44 PM, Dave Chinner <david@fromorbit.com> wrote:
> > > 
> > > On Tue, Jul 19, 2022 at 04:46:50PM -0400, Anna Schumaker wrote:
> > > > On Sun, Jul 17, 2022 at 9:16 PM Dave Chinner <david@fromorbit.com> wrote:
> > > > > 
> > > > > On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
> > > > > > > On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
> > > > > > > 
> > > > > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > > > > > 
> > > > > > > Rather than relying on the underlying filesystem to tell us where hole
> > > > > > > and data segments are through vfs_llseek(), let's instead do the hole
> > > > > > > compression ourselves. This has a few advantages over the old
> > > > > > > implementation:
> > > > > > > 
> > > > > > > 1) A single call to the underlying filesystem through nfsd_readv() means
> > > > > > >  the file can't change from underneath us in the middle of encoding.
> > > > > 
> > > > > Hi Anna,
> > > > > 
> > > > > I'm assuming you mean the vfs_llseek(SEEK_HOLE) call at the start
> > > > > of nfsd4_encode_read_plus_data() that is used to trim the data that
> > > > > has already been read out of the file?
> > > > 
> > > > There is also the vfs_llseek(SEEK_DATA) call at the start of
> > > > nfsd4_encode_read_plus_hole(). They are used to determine the length
> > > > of the current hole or data segment.
> > > > 
> > > > > 
> > > > > What's the problem with racing with a hole punch here? All it does
> > > > > is shorten the read data returned to match the new hole, so all it's
> > > > > doing is making the returned data "more correct".
> > > > 
> > > > The problem is we call vfs_llseek() potentially many times when
> > > > encoding a single reply to READ_PLUS. nfsd4_encode_read_plus() has a
> > > > loop where we alternate between hole and data segments until we've
> > > > encoded the requested number of bytes. My attempts at locking the file
> > > > have resulted in a deadlock since vfs_llseek() also locks the file, so
> > > > the file could change from underneath us during each iteration of the
> > > > loop.
> > > 
> > > So the problem being solved is that the current encoding is not
> > > atomic, rather than trying to avoid any computational overhead of
> > > multiple vfs_llseek calls (which are largely just the same extent
> > > lookups as we do during the read call)?
> > 
> > Reviewing [1] and [2] I don't find any remarks about atomicity
> > guarantees. If a client needs an uncontended view of a file's
> > data, it's expected to fence other accessors via a OPEN(deny)
> > or LOCK operation, or serialize the requests itself.
> 
> You've got the wrong "atomicity" scope :)
> 
> What I was talking about is the internal server side data operation
> atomicity. that is, what is returned from the read to the READ_PLUS
> code is not atomic w.r.t. the vfs_llseek() that are then used to
> determine where there holes in the data returned by the read are.
> 
> Hence after the read has returned data to READ_PLUS, something else
> can modify the data in the file (e.g. filling a hole or punching a
> new one out) and then the ranges vfs_llseek() returns to READ_PLUS
> does not match the data that is has in it's local buffer.
> 
> i.e. to do what the READ_PLUS operation is doing now, it would
> somehow need to ensure no modifications can be made between the read
> starting and the last vfs_llseek() call completing. IOWs, they need
> to be performed as an atomic operation, not as a set of
> independently locked (or unlocked!) operations as they are now.
> 
> > > The implementation just seems backwards to me - rather than reading
> > > data and then trying to work out where the holes are, I suspect it
> > > should be working out where the holes are and then reading the data.
> > > This is how the IO path in filesystems work, so it would seem like a
> > > no-brainer to try to leverage the infrastructure we already have to
> > > do that.
> > > 
> > > The information is there and we have infrastructure that exposes it
> > > to the IO path, it's just *underneath* the page cache and the page
> > > cache destroys the information that it used to build the data it
> > > returns to the NFSD.
> > > 
> > > IOWs, it seems to me that what READ_PLUS really wants is a "sparse
> > > read operation" from the filesystem rather than the current "read
> > > that fills holes with zeroes". i.e. a read operation that sets an
> > > iocb flag like RWF_SPARSE_READ to tell the filesystem to trim the
> > > read to just the ranges that contain data.
> > > 
> > > That way the read populates the page cache over a single contiguous
> > > range of data and returns with the {offset, len} that spans the
> > > range that is read and mapped. The caller can then read that region
> > > out of the page cache and mark all the non-data regions as holes in
> > > whatever manner they need to.
> > > 
> > > The iomap infrastructure that XFS and other filesystems use provide
> > > this exact "map only what contains data" capability - an iomap tells
> > > the page cache exactly what underlies the data range (hole, data,
> > > unwritten extents, etc) in an efficient manner, so it wouldn't be a
> > > huge stretch just to limit read IO ranges to those that contain only
> > > DATA extents.
> > > 
> > > At this point READ_PLUS then just needs to iterate doing sparse
> > > reads and recording the ranges that return data as vector of some
> > > kind that is then passes to the encoding function to encode it as
> > > a sparse READ_PLUS data range....
> > 
> > The iomap approach
> 
> Not actually what I proposed - I'm suggesting a new kiocb flag that
> changes what the read passed to the filesystem does. My comments
> about iomap are that this infrastructure already provides the extent
> range query mechanisms that allow us to efficiently perform such
> "restricted range" IO operations.
> 
> > seems sensible to me and covers the two basic
> > usage scenarios:
> > 
> > - Large sparse files, where we want to conserve both network
> >   bandwidth and client (and intermediate) cache occupancy.
> >   These are best served by exposing data and holes.
> 
> *nod*
> 
> > - Everyday files that are relatively small and generally will
> >   continue few, if any, holes. These are best served by using
> >   a splice read (where possible) -- that is, READ_PLUS in this
> >   case should work exactly like READ.
> 
> *nod*
> 
> > My impression of client implementations is that, a priori,
> > a client does not know whether a file contains holes or not,
> > but will probably always use READ_PLUS and let the server
> > make the choice for it.
> 
> *nod*
> 
> > Now how does the server make that choice? Is there an attribute
> > bit that indicates when a file should be treated as sparse? Can
> > we assume that immutable files (or compressed files) should
> > always be treated as sparse? Alternately, the server might use
> > the file's data : hole ratio.
> 
> None of the above. The NFS server has no business knowing intimate
> details about how the filesystem has laid out the file. All it cares
> about ranges containing data and ranges that have no data (holes).
> 
> If the filesystem can support a sparse read, it returns sparse
> ranges containing data to the NFS server. If the filesystem can't
> support it, or it's internal file layout doesn't allow for efficient
> hole/data discrimination, then it can just return the entire read
> range.
> 
> Alternatively, in this latter case, the filesystem could call a
> generic "sparse read" implementation that runs memchr_inv() to find
> the first data range to return. Then the NFS server doesn't have to
> code things differently, filesystems don't need to advertise
> support for sparse reads, etc because every filesystem could
> support sparse reads.
> 
> The only difference is that some filesystems will be much more
> efficient and faster at it than others. We already see that sort
> of thing with btrfs and seek hole/data on large cached files so I
> don't see "filesystems perform differently" as a problem here...
> 

^^^
This seems like more trouble than it's worth, and would probably result
in worse performance. The generic implementation should just return a
single non-sparse extent in the sparse read reply if it doesn't know how
to fill out a sparse read properly. IOW, we shouldn't try to find holes,
unless the underlying filesystem can do that itself via iomap sparse
read or some similar mechanism.
Josef Bacik July 21, 2022, 8:47 p.m. UTC | #14
On Tue, Jul 19, 2022 at 04:46:50PM -0400, Anna Schumaker wrote:
> On Sun, Jul 17, 2022 at 9:16 PM Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
> > > > On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
> > > >
> > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > >
> > > > Rather than relying on the underlying filesystem to tell us where hole
> > > > and data segments are through vfs_llseek(), let's instead do the hole
> > > > compression ourselves. This has a few advantages over the old
> > > > implementation:
> > > >
> > > > 1) A single call to the underlying filesystem through nfsd_readv() means
> > > >   the file can't change from underneath us in the middle of encoding.
> >
> > Hi Anna,
> >
> > I'm assuming you mean the vfs_llseek(SEEK_HOLE) call at the start
> > of nfsd4_encode_read_plus_data() that is used to trim the data that
> > has already been read out of the file?
> 
> There is also the vfs_llseek(SEEK_DATA) call at the start of
> nfsd4_encode_read_plus_hole(). They are used to determine the length
> of the current hole or data segment.
> 
> >
> > What's the problem with racing with a hole punch here? All it does
> > is shorten the read data returned to match the new hole, so all it's
> > doing is making the returned data "more correct".
> 
> The problem is we call vfs_llseek() potentially many times when
> encoding a single reply to READ_PLUS. nfsd4_encode_read_plus() has a
> loop where we alternate between hole and data segments until we've
> encoded the requested number of bytes. My attempts at locking the file
> have resulted in a deadlock since vfs_llseek() also locks the file, so
> the file could change from underneath us during each iteration of the
> loop.
> 
> >
> > OTOH, if something allocates over a hole that the read filled with
> > zeros, what's the problem with occasionally returning zeros as data?
> > Regardless, if this has raced with a write to the file that filled
> > that hole, we're already returning stale data/hole information to
> > the client regardless of whether we trim it or not....
> >
> > i.e. I can't see a correctness or data integrity problem here that
> > doesn't already exist, and I have my doubts that hole
> > punching/filling racing with reads happens often enough to create a
> > performance or bandwidth problem OTW. Hence I've really got no idea
> > what the problem that needs to be solved here is.
> >
> > Can you explain what the symptoms of the problem a user would see
> > that this change solves?
> 
> This fixes xfstests generic/091 and generic/263, along with this
> reported bug: https://bugzilla.kernel.org/show_bug.cgi?id=215673
> >
> > > > 2) A single call to the underlying filestem also means that the
> > > >   underlying filesystem only needs to synchronize cached and on-disk
> > > >   data one time instead of potentially many speeding up the reply.
> >
> > SEEK_HOLE/DATA doesn't require cached data to be sync'd to disk to
> > be coherent - that's only a problem FIEMAP has (and syncing cached
> > data doesn't fix the TOCTOU coherency issue!).  i.e. SEEK_HOLE/DATA
> > will check the page cache for data if appropriate (e.g. unwritten
> > disk extents may have data in memory over the top of them) instead
> > of syncing data to disk.
> 
> For some reason, btrfs on virtual hardware has terrible performance
> numbers when using vfs_llseek() with files that are already in the
> server's cache. I think it had something to do with how they lock
> extents, and some extra work that needs to be done if the file already
> exists in the server's memory but it's been  a few years since I've
> gone into their code to figure out where the slowdown is coming from.
> See this section of my performance results wiki page:
> https://wiki.linux-nfs.org/wiki/index.php/Read_Plus_May_2022#BTRFS_3
>

I just did this locally, once in my test VMs and once on my continuous
performance testing hardware and I'm not able to reproduce the numbers, so I
think I'm doing something wrong.

My test is stupid: I just dd a 5GiB file, come behind it and punch holes every
other 4k.  Then I umount and remount, SEEK_DATA+SEEK_HOLE through the whole
file, and then do it again so I have both uncached and cached runs.  The numbers
I'm seeing are equivalent to ext4/xfs.  Granted, on my VM I had to redo the test
because I had lockdep and other debugging on, which makes us look stupid because
of the extent locking stuff, but with it off everything appears normal.

Does this more or less mirror your testing?  Looking at our code we're not doing
anything inherently stupid, so I'm not entirely sure what could be the problem.
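
For reference, here's a rough user-space stand-in for that
SEEK_DATA/SEEK_HOLE walk - a guess at an equivalent test, not the
exact script used here - run once after a fresh mount and once again
for the cached case:

/*
 * Time a full SEEK_DATA/SEEK_HOLE walk of a file and count the data
 * segments found. Uses only lseek(), so it measures the extent
 * lookup cost rather than any read path overhead.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct timespec t0, t1;
	off_t pos = 0, end;
	long segments = 0;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	end = lseek(fd, 0, SEEK_END);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	while (pos < end) {
		pos = lseek(fd, pos, SEEK_DATA);
		if (pos < 0)
			break;			/* no more data */
		pos = lseek(fd, pos, SEEK_HOLE);
		segments++;
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%ld data segments in %.3f seconds\n", segments,
	       (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
	close(fd);
	return 0;
}
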
Thanks,

Josef
Dave Chinner July 21, 2022, 11:12 p.m. UTC | #15
On Wed, Jul 20, 2022 at 08:55:23AM -0400, Jeff Layton wrote:
> On Wed, 2022-07-20 at 12:36 +1000, Dave Chinner wrote:
> > > Now how does the server make that choice? Is there an attribute
> > > bit that indicates when a file should be treated as sparse? Can
> > > we assume that immutable files (or compressed files) should
> > > always be treated as sparse? Alternately, the server might use
> > > the file's data : hole ratio.
> > 
> > None of the above. The NFS server has no business knowing intimate
> > details about how the filesystem has laid out the file. All it cares
> > about ranges containing data and ranges that have no data (holes).
> > 
> > If the filesystem can support a sparse read, it returns sparse
> > ranges containing data to the NFS server. If the filesystem can't
> > support it, or it's internal file layout doesn't allow for efficient
> > hole/data discrimination, then it can just return the entire read
> > range.
> > 
> > Alternatively, in this latter case, the filesystem could call a
> > generic "sparse read" implementation that runs memchr_inv() to find
> > the first data range to return. Then the NFS server doesn't have to
> > code things differently, filesystems don't need to advertise
> > support for sparse reads, etc because every filesystem could
> > support sparse reads.
> > 
> > The only difference is that some filesystems will be much more
> > efficient and faster at it than others. We already see that sort
> > of thing with btrfs and seek hole/data on large cached files so I
> > don't see "filesystems perform differently" as a problem here...
> > 
> 
> ^^^
> This seems like more trouble than it's worth, and would probably result
> in worse performance. The generic implementation should just return a
> single non-sparse extent in the sparse read reply if it doesn't know how
> to fill out a sparse read properly. IOW, we shouldn't try to find holes,
> unless the underlying filesystem can do that itself via iomap sparse
> read or some similar mechanism.

Yup, that's what I'd suggest initially, then a separate
investigation can be done to determine if manual hole detection is
worth the effort for those filesystems that cannot support sparse
reads effectively.

Cheers,

Dave.
Dave Chinner July 22, 2022, 12:44 a.m. UTC | #16
On Wed, Jul 20, 2022 at 04:18:36AM +0000, Chuck Lever III wrote:
> > On Jul 19, 2022, at 10:36 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Wed, Jul 20, 2022 at 01:26:13AM +0000, Chuck Lever III wrote:
> >>> On Jul 19, 2022, at 6:44 PM, Dave Chinner <david@fromorbit.com> wrote:
> >>>>> What's the problem with racing with a hole punch here? All it does
> >>>>> is shorten the read data returned to match the new hole, so all it's
> >>>>> doing is making the returned data "more correct".
> >>>> 
> >>>> The problem is we call vfs_llseek() potentially many times when
> >>>> encoding a single reply to READ_PLUS. nfsd4_encode_read_plus() has a
> >>>> loop where we alternate between hole and data segments until we've
> >>>> encoded the requested number of bytes. My attempts at locking the file
> >>>> have resulted in a deadlock since vfs_llseek() also locks the file, so
> >>>> the file could change from underneath us during each iteration of the
> >>>> loop.
> >>> 
> >>> So the problem being solved is that the current encoding is not
> >>> atomic, rather than trying to avoid any computational overhead of
> >>> multiple vfs_llseek calls (which are largely just the same extent
> >>> lookups as we do during the read call)?
> >> 
> >> Reviewing [1] and [2] I don't find any remarks about atomicity
> >> guarantees. If a client needs an uncontended view of a file's
> >> data, it's expected to fence other accessors via a OPEN(deny)
> >> or LOCK operation, or serialize the requests itself.
> > 
> > You've got the wrong "atomicity" scope :)
> > 
> > What I was talking about is the internal server side data operation
> > atomicity. that is, what is returned from the read to the READ_PLUS
> > code is not atomic w.r.t. the vfs_llseek() that are then used to
> > determine where there holes in the data returned by the read are.
> > 
> > Hence after the read has returned data to READ_PLUS, something else
> > can modify the data in the file (e.g. filling a hole or punching a
> > new one out) and then the ranges vfs_llseek() returns to READ_PLUS
> > does not match the data that is has in it's local buffer.
> 
> Architecturally, with the NFS protocol, the client and the application
> running there are responsible for stopping "something else [from]
> modifying the data in the file." NFS operations in and of themselves
> are not usually atomic in that respect.

Sure, but that's not the atomicity scope I'm talking about. I'm
looking purely at what the server is doing and how atomicity within
the underlying filesystem - not the NFS server - changes the result.

> A READ operation has the same issue as READ_PLUS: a hole punch can
> remove data from a file while the server is constructing and
> encoding a READ reply, unless the application on the client has
> taken the trouble to block foreign file modifications.

Yes, at the NFS server that can happen, but it *can't happen in the
underlying filesystem*.

That is, while a read IO is in progress, the hole punch will not
start because we are not allowed to punch out an extent from under
an IO that is currently in progress. If we allow that, then the
extent could be reallocated to a different file and new data written
into it before the actual read IO was submitted.

i.e. the filesystem has atomicity guarantees about data IO vs direct
extent manipulations.

Hence in the case of "read entire range then use memchr_inv() to
detect holes", the read IO has been executed without racing
with some other extent manipulation occurring. Hence the read IO is
effectively atomic, even if it needed to do multiple physical IOs
because the read was sparse.

In the case of "read data followed by seek hole/data", the
underlying extent map can be changed by racing modifications
(writes, hole punch, fallocate, truncate, etc) because once the read
is complete there is nothing stopping other operations being
performed.

Hence READ_PLUS via seek_hole/data is a non-atomic operation w.r.t.
the data read from the underlying filesystem, whereas read w/
memchr_inv is atomic w.r.t. the data read...
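
As a rough sketch of what that "read then scan" approach looks like
(note_segment() and the fixed scan granularity are invented for the
example, not proposed code):

	/* Classify an already-read buffer into data and hole ranges.
	 * memchr_inv() returns NULL when the scanned range is all zeroes.
	 */
	static void scan_buffer(const char *buf, size_t len, size_t blksize)
	{
		size_t pos = 0;

		while (pos < len) {
			size_t chunk = min(blksize, len - pos);

			if (memchr_inv(buf + pos, 0, chunk))
				note_segment(NFS4_CONTENT_DATA, pos, chunk);
			else
				note_segment(NFS4_CONTENT_HOLE, pos, chunk);
			pos += chunk;
		}
	}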

> > i.e. to do what the READ_PLUS operation is doing now, it would
> > somehow need to ensure no modifications can be made between the read
> > starting and the last vfs_llseek() call completing. IOWs, they need
> > to be performed as an atomic operation, not as a set of
> > independently locked (or unlocked!) operations as they are now.
> 
> There is nothing making that guarantee on the server, and as I
> said, there is nothing I've found in the specs that require that
> level of atomicity from a single READ or READ_PLUS operation.

That's my point - the data corruption bug that got fixed by Anna's
patch changes the -data access atomicity- of the server READ_PLUS
encoding operation. i.e. it has nothing to do with the NFS
specification for the operation but rather results from how the
server implementation is retrieving data and metadata
from the underlying filesystem independently and then combining them
as if they were obtained atomically.

IOWs, the information returned by SEEK_HOLE/SEEK_DATA after a data
read does not reflect the state at the time the data was read - the
data is effectively stale at this point. seek hole/data information
is supposed to be used *before* the data is read to find the
locations of the data. That way, if there is a TOCTOU race, the read
still returns valid, correct data that matched the underlying
file layout at the time of the read.
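
Roughly, that ordering would look like this (nothing more than a
sketch; read_range() is a stand-in for the eventual read helper):

	static ssize_t read_next_data_segment(struct file *file, loff_t offset,
					      loff_t end)
	{
		loff_t data_start, data_end;

		/* map the layout first ... */
		data_start = vfs_llseek(file, offset, SEEK_DATA);
		if (data_start < 0 || data_start >= end)
			return 0;	/* remainder is hole (or an error) */

		data_end = vfs_llseek(file, data_start, SEEK_HOLE);
		if (data_end < 0)
			return data_end;

		/* ... then read only the range that was just mapped */
		return read_range(file, data_start,
				  min(data_end, end) - data_start);
	}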

> >>> IOWs, it seems to me that what READ_PLUS really wants is a "sparse
> >>> read operation" from the filesystem rather than the current "read
> >>> that fills holes with zeroes". i.e. a read operation that sets an
> >>> iocb flag like RWF_SPARSE_READ to tell the filesystem to trim the
> >>> read to just the ranges that contain data.
.....
> >> Now how does the server make that choice? Is there an attribute
> >> bit that indicates when a file should be treated as sparse? Can
> >> we assume that immutable files (or compressed files) should
> >> always be treated as sparse? Alternately, the server might use
> >> the file's data : hole ratio.
> > 
> > None of the above. The NFS server has no business knowing intimate
> > details about how the filesystem has laid out the file. All it cares
> > about ranges containing data and ranges that have no data (holes).
> 
> That would be the case if we want nothing more than impermeable
> software layering. That's nice for software developers, but
> maybe not of much benefit to average users.
> 
> I see no efficiency benefit, for example, if a 10MB object file
> has 512 bytes of zeroes at a convenient offset and the server
> returns that as DATA/HOLE/DATA. The amount of extra work it has
> to do to make that happen results in the same latencies as
> transmitting 512 extra bytes on GbE. It might be even worse on
> faster network fabrics.

Welcome to the world of sparse file IO optimisation - NFS may be new
to this, but local filesystems have been optimising this sort of
thing for decades. Don't think we don't know about it - seek time
between 2 discontiguous extents is *much worse* than the few extra
microseconds that DMAing a few extra sectors worth of data in a
single IO. Even on SSDs, there's a noticeable latency difference
between a single 16KB IO and two separate 4kB IOs to get the same
data.

As it is, we worry about filesystem block granularity, not sectors.
A filesystem would only report a 512 byte range as a hole if it had
a 512 byte block size. Almost no-one uses such small block sizes -
4kB is the default for ext4 and XFS - so the minimum hole size is
generally 4KB.

Indeed, in this case XFS will not actually have a hole in the file
layout - the underlying extent would have been allocated as a single
contiguous unwritten extent, then when the data gets written it will
convert the two ranges to written, resulting in a physically
contiguous "data-unwritten-data" set of extents. However,
SEEK_HOLE/DATA will see that unwritten extent as a hole, so you'll
get that reported via seek hole/data or sparse reads regardless of
whether it is optimal for READ_PLUS encoding.

However, the physical layout is optimal for XFS - if the hole gets
filled by a future write, it ends up converting the file layout to a
single contiguous data extent that can be read with a single IO
rather than three physically discontiguous data extents that require
3 physical IOs to read.

ext4 optimises this in a different way - it will allocate small
holes similar to the way XFS does, but it will also fill them with
real zeros rather than leaving them unwritten. As a result, ext4
will have a single physically contiguous extent on disk too, but it
will -always- report as data even though the data written by the
application is sparse and contains a run of zeroes in it.

IOWs, ext4 and XFS will behave differently for the simple case you
gave because they've optimised small holes in sparse file data
differently. Both have their pros and cons, but it also means that
the READ_PLUS response for the same file data written the same way
can be different because the underlying filesystem on the server is
different.

IOWs, the NFS server cannot rely on a specific behaviour w.r.t.
holes and data from the underlying filesystem. Sparseness of a file
is determined by how the underlying filesystem wants to optimise the
physical layout of the data and the IO patterns that data access results
in. The NFS server really cannot influence that, or change the
"sparseness" it reports to the client except by examining the
data it reads from the filesystem....

> On fast networks, the less the server's host CPU has to be
> involved in doing the READ, the better it scales. It's
> better to set up the I/O and step out of the way; use zero
> touch as much as possible.

However, as you demonstrate for the NFS client below, something has
to fill the holes. i.e. if we aren't doing sparse reads on the NFS
server, then the filesystem underneath the NFS server has to spend
the CPU to instantiate pages over the hole in the file in the page
cache, then zero them before it can pass them to the "zero touch"
NFS send path to be encoded into the READ response.

IOWs, sending holes as zeroed data from the server is most
definitely not zero touch even if NFS doesn't touch the data.

Hence the "sparse read" from the underlying filesystem, which avoids
the need to populate and zero pages in the server page cache
altogether, as well as enabling NFS to use its zero-touch
encode/send path for the data that is returned. And with a sparse
read operation, the encoding of hole ranges in READ_PLUS has almost
zero CPU overhead because the calculation is dead simple (i.e. hole
start = end of last data, hole len = start of next data - end of
last data).
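
i.e. something no more complicated than this (encode_hole() is just a
placeholder name for the sketch):

	/* The hole is simply the gap between consecutive data ranges. */
	static void emit_hole_between(loff_t prev_data_end, loff_t next_data_start)
	{
		if (next_data_start > prev_data_end)
			encode_hole(prev_data_end,
				    next_data_start - prev_data_end);
	}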

IOWs, the lowest server CPU and memory READ processing overhead
comes from using READ_PLUS with sparse reads from the filesystem and
zero-touch READ_PLUS data range encoding.

> Likewise on the client: it might receive a CONTENT_HOLE, but
> then its CPU has to zero out that range, with all the memory
> and cache activity that entails. For small holes, that's going
> to be a lot of memset(0). If the server returns holes only
> when they are large, then the client can use more efficient
> techniques (like marking page cache pages or using ZERO_PAGE).

Yes, if we have sparse files we have to take that page cache
instantiation penalty *somewhere* in the IO stack between the client
and the server.

Indeed, we actually want this overhead in the NFS client, because we
have many more NFS clients than we do NFS servers and hence on
aggregate there is way more CPU and memory to dedicate to
instantiating zeroed data on the client side than on the server
side.

IOWs, if we ship zeroes via RDMA to the NFS client so the NFS client
doesn't need to instantiate holes in the page cache, then we're
actually forcing the server to perform hole instantiation. Forcing
the server to instantiate all the holes for all the files that all
the clients it is serving is prohibitive from a server CPU and
memory POV, even if you have both the server network bandwidth
and the cross-sectional network bandwidth available to ship all
those zeros to all the clients.

> On networks with large bandwidth-latency products, however,
> it makes sense to trade as much server and client CPU and
> memory for transferred bytes as you can.

Server, yes, client not so much. If you focus purely on single
client<->server throughput, the server is not going to scale when
hundreds or thousands of clients all demand data at the same time.

But, regardless of this, the fact is that the NFS server is a
consumer of the sparseness data stored in the underlying filesystem.
Unless it wants to touch every byte that is read, it can only export
the HOLE/DATA information that the filesystem can provide it with.

What we want is a method that allows the filesystem to provide that
information to the NFS server READ_PLUS operation as efficiently as
possible. AFAICT, that's a sparse read operation....

> The mechanism that handles sparse files needs to be distinct
> from the policy of when to return more than a single
> CONTENT_DATA segment, since one of our goals is to ensure
> that READ_PLUS performs and scales as well as READ on common
> workloads (ie, not HPC / large sparse file workloads).

Yes, so make the server operation as efficient as possible whilst
still being correct (i.e. sparse reads), and push the CPU and
memory requirements for instantiating and storing zeroes out
to the NFS client.  If the NFS client page cache instantiation can't
keep up with the server sending it encoded ranges of zeros, then
this isn't a server side issue; the client side sparse file support
needs fixing/optimising.

> > If the filesystem can support a sparse read, it returns sparse
> > ranges containing data to the NFS server. If the filesystem can't
> > support it, or it's internal file layout doesn't allow for efficient
> > hole/data discrimination, then it can just return the entire read
> > range.
> > 
> > Alternatively, in this latter case, the filesystem could call a
> > generic "sparse read" implementation that runs memchr_inv() to find
> > the first data range to return. Then the NFS server doesn't have to
> > code things differently, filesystems don't need to advertise
> > support for sparse reads, etc because every filesystem could
> > support sparse reads.
> > 
> > The only difference is that some filesystems will be much more
> > efficient and faster at it than others. We already see that sort
> > of thing with btrfs and seek hole/data on large cached files so I
> > don't see "filesystems perform differently" as a problem here...
> 
> The problem with that approach is that will result in
> performance regressions on NFSv4.2 with exports that reside
> on underperforming filesystem types. We need READ_PLUS to
> perform as well as READ so there is no regression between
> NFSv4.2 without and with READ_PLUS, and no regression when
> migrating from NFSv4.1 to NFSv4.2 with READ_PLUS enabled.

Sure, but fear of regressions is not a valid reason for preventing
change from being made.

Cheers,

Dave.
Anna Schumaker July 22, 2022, 12:45 p.m. UTC | #17
On Thu, Jul 21, 2022 at 4:48 PM Josef Bacik <josef@toxicpanda.com> wrote:
>
> On Tue, Jul 19, 2022 at 04:46:50PM -0400, Anna Schumaker wrote:
> > On Sun, Jul 17, 2022 at 9:16 PM Dave Chinner <david@fromorbit.com> wrote:
> > >
> > > On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
> > > > > On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
> > > > >
> > > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > > >
> > > > > Rather than relying on the underlying filesystem to tell us where hole
> > > > > and data segments are through vfs_llseek(), let's instead do the hole
> > > > > compression ourselves. This has a few advantages over the old
> > > > > implementation:
> > > > >
> > > > > 1) A single call to the underlying filesystem through nfsd_readv() means
> > > > >   the file can't change from underneath us in the middle of encoding.
> > >
> > > Hi Anna,
> > >
> > > I'm assuming you mean the vfs_llseek(SEEK_HOLE) call at the start
> > > of nfsd4_encode_read_plus_data() that is used to trim the data that
> > > has already been read out of the file?
> >
> > There is also the vfs_llseek(SEEK_DATA) call at the start of
> > nfsd4_encode_read_plus_hole(). They are used to determine the length
> > of the current hole or data segment.
> >
> > >
> > > What's the problem with racing with a hole punch here? All it does
> > > is shorten the read data returned to match the new hole, so all it's
> > > doing is making the returned data "more correct".
> >
> > The problem is we call vfs_llseek() potentially many times when
> > encoding a single reply to READ_PLUS. nfsd4_encode_read_plus() has a
> > loop where we alternate between hole and data segments until we've
> > encoded the requested number of bytes. My attempts at locking the file
> > have resulted in a deadlock since vfs_llseek() also locks the file, so
> > the file could change from underneath us during each iteration of the
> > loop.
> >
> > >
> > > OTOH, if something allocates over a hole that the read filled with
> > > zeros, what's the problem with occasionally returning zeros as data?
> > > Regardless, if this has raced with a write to the file that filled
> > > that hole, we're already returning stale data/hole information to
> > > the client regardless of whether we trim it or not....
> > >
> > > i.e. I can't see a correctness or data integrity problem here that
> > > doesn't already exist, and I have my doubts that hole
> > > punching/filling racing with reads happens often enough to create a
> > > performance or bandwidth problem OTW. Hence I've really got no idea
> > > what the problem that needs to be solved here is.
> > >
> > > Can you explain what the symptoms of the problem a user would see
> > > that this change solves?
> >
> > This fixes xfstests generic/091 and generic/263, along with this
> > reported bug: https://bugzilla.kernel.org/show_bug.cgi?id=215673
> > >
> > > > > 2) A single call to the underlying filestem also means that the
> > > > >   underlying filesystem only needs to synchronize cached and on-disk
> > > > >   data one time instead of potentially many speeding up the reply.
> > >
> > > SEEK_HOLE/DATA doesn't require cached data to be sync'd to disk to
> > > be coherent - that's only a problem FIEMAP has (and syncing cached
> > > data doesn't fix the TOCTOU coherency issue!).  i.e. SEEK_HOLE/DATA
> > > will check the page cache for data if appropriate (e.g. unwritten
> > > disk extents may have data in memory over the top of them) instead
> > > of syncing data to disk.
> >
> > For some reason, btrfs on virtual hardware has terrible performance
> > numbers when using vfs_llseek() with files that are already in the
> > server's cache. I think it had something to do with how they lock
> > extents, and some extra work that needs to be done if the file already
> > exists in the server's memory but it's been  a few years since I've
> > gone into their code to figure out where the slowdown is coming from.
> > See this section of my performance results wiki page:
> > https://wiki.linux-nfs.org/wiki/index.php/Read_Plus_May_2022#BTRFS_3
> >
>
> I just did this locally, once in my test vm's and once on my continuous
> performance testing hardware and I'm not able to reproduce the numbers, so I
> think I'm doing something wrong.
>
> My test is stupid, I just dd a 5gib file, come behind it and punch holes every
> other 4k.  Then I umount and remount, SEEK_DATA+SEEK_HOLE through the whole
> file, and then do it again so I have uncached and cached.  The numbers I'm
> seeing are equivalent to ext4/xfs.  Granted on my VM I had to redo the test
> because I had lockdep and other debugging on which makes us look stupid because
> of the extent locking stuff, but with it off everything appears normal.
>
> Does this more or less mirror your testing?  Looking at our code we're not doing
> anything inherently stupid, so I'm not entirely sure what could be the problem.
> Thanks,

I think that's pretty close to what the server is doing with the
current code, except the NFS server would also do a read for every
data segment it found. I've been using `vmtouch` to make sure the file
doesn't get evicted from the server's page cache for my cached data.

Anna

>
> Josef
Josef Bacik July 22, 2022, 1:32 p.m. UTC | #18
On Fri, Jul 22, 2022 at 08:45:13AM -0400, Anna Schumaker wrote:
> On Thu, Jul 21, 2022 at 4:48 PM Josef Bacik <josef@toxicpanda.com> wrote:
> >
> > On Tue, Jul 19, 2022 at 04:46:50PM -0400, Anna Schumaker wrote:
> > > On Sun, Jul 17, 2022 at 9:16 PM Dave Chinner <david@fromorbit.com> wrote:
> > > >
> > > > On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
> > > > > > On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
> > > > > >
> > > > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > > > >
> > > > > > Rather than relying on the underlying filesystem to tell us where hole
> > > > > > and data segments are through vfs_llseek(), let's instead do the hole
> > > > > > compression ourselves. This has a few advantages over the old
> > > > > > implementation:
> > > > > >
> > > > > > 1) A single call to the underlying filesystem through nfsd_readv() means
> > > > > >   the file can't change from underneath us in the middle of encoding.
> > > >
> > > > Hi Anna,
> > > >
> > > > I'm assuming you mean the vfs_llseek(SEEK_HOLE) call at the start
> > > > of nfsd4_encode_read_plus_data() that is used to trim the data that
> > > > has already been read out of the file?
> > >
> > > There is also the vfs_llseek(SEEK_DATA) call at the start of
> > > nfsd4_encode_read_plus_hole(). They are used to determine the length
> > > of the current hole or data segment.
> > >
> > > >
> > > > What's the problem with racing with a hole punch here? All it does
> > > > is shorten the read data returned to match the new hole, so all it's
> > > > doing is making the returned data "more correct".
> > >
> > > The problem is we call vfs_llseek() potentially many times when
> > > encoding a single reply to READ_PLUS. nfsd4_encode_read_plus() has a
> > > loop where we alternate between hole and data segments until we've
> > > encoded the requested number of bytes. My attempts at locking the file
> > > have resulted in a deadlock since vfs_llseek() also locks the file, so
> > > the file could change from underneath us during each iteration of the
> > > loop.
> > >
> > > >
> > > > OTOH, if something allocates over a hole that the read filled with
> > > > zeros, what's the problem with occasionally returning zeros as data?
> > > > Regardless, if this has raced with a write to the file that filled
> > > > that hole, we're already returning stale data/hole information to
> > > > the client regardless of whether we trim it or not....
> > > >
> > > > i.e. I can't see a correctness or data integrity problem here that
> > > > doesn't already exist, and I have my doubts that hole
> > > > punching/filling racing with reads happens often enough to create a
> > > > performance or bandwidth problem OTW. Hence I've really got no idea
> > > > what the problem that needs to be solved here is.
> > > >
> > > > Can you explain what the symptoms of the problem a user would see
> > > > that this change solves?
> > >
> > > This fixes xfstests generic/091 and generic/263, along with this
> > > reported bug: https://bugzilla.kernel.org/show_bug.cgi?id=215673
> > > >
> > > > > > 2) A single call to the underlying filestem also means that the
> > > > > >   underlying filesystem only needs to synchronize cached and on-disk
> > > > > >   data one time instead of potentially many speeding up the reply.
> > > >
> > > > SEEK_HOLE/DATA doesn't require cached data to be sync'd to disk to
> > > > be coherent - that's only a problem FIEMAP has (and syncing cached
> > > > data doesn't fix the TOCTOU coherency issue!).  i.e. SEEK_HOLE/DATA
> > > > will check the page cache for data if appropriate (e.g. unwritten
> > > > disk extents may have data in memory over the top of them) instead
> > > > of syncing data to disk.
> > >
> > > For some reason, btrfs on virtual hardware has terrible performance
> > > numbers when using vfs_llseek() with files that are already in the
> > > server's cache. I think it had something to do with how they lock
> > > extents, and some extra work that needs to be done if the file already
> > > exists in the server's memory but it's been  a few years since I've
> > > gone into their code to figure out where the slowdown is coming from.
> > > See this section of my performance results wiki page:
> > > https://wiki.linux-nfs.org/wiki/index.php/Read_Plus_May_2022#BTRFS_3
> > >
> >
> > I just did this locally, once in my test vm's and once on my continuous
> > performance testing hardware and I'm not able to reproduce the numbers, so I
> > think I'm doing something wrong.
> >
> > My test is stupid, I just dd a 5gib file, come behind it and punch holes every
> > other 4k.  Then I umount and remount, SEEK_DATA+SEEK_HOLE through the whole
> > file, and then do it again so I have uncached and cached.  The numbers I'm
> > seeing are equivalent to ext4/xfs.  Granted on my VM I had to redo the test
> > because I had lockdep and other debugging on which makes us look stupid because
> > of the extent locking stuff, but with it off everything appears normal.
> >
> > Does this more or less mirror your testing?  Looking at our code we're not doing
> > anything inherently stupid, so I'm not entirely sure what could be the problem.
> > Thanks,
> 
> I think that's pretty close to what the server is doing with the
> current code, except the NFS server would also do a read for every
> data segment it found. I've been using `vmtouch` to make sure the file
> doesn't get evicted from the server's page cache for my cached data.
>

I messed with it some more and I can't get it to be slow.  If you trip over
something like this again just give a shout and I'd be happy to dig in.  Thanks,

Josef
Anna Schumaker July 22, 2022, 1:43 p.m. UTC | #19
On Fri, Jul 22, 2022 at 9:32 AM Josef Bacik <josef@toxicpanda.com> wrote:
>
> On Fri, Jul 22, 2022 at 08:45:13AM -0400, Anna Schumaker wrote:
> > On Thu, Jul 21, 2022 at 4:48 PM Josef Bacik <josef@toxicpanda.com> wrote:
> > >
> > > On Tue, Jul 19, 2022 at 04:46:50PM -0400, Anna Schumaker wrote:
> > > > On Sun, Jul 17, 2022 at 9:16 PM Dave Chinner <david@fromorbit.com> wrote:
> > > > >
> > > > > On Fri, Jul 15, 2022 at 07:08:13PM +0000, Chuck Lever III wrote:
> > > > > > > On Jul 15, 2022, at 2:44 PM, Anna Schumaker <anna@kernel.org> wrote:
> > > > > > >
> > > > > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > > > > >
> > > > > > > Rather than relying on the underlying filesystem to tell us where hole
> > > > > > > and data segments are through vfs_llseek(), let's instead do the hole
> > > > > > > compression ourselves. This has a few advantages over the old
> > > > > > > implementation:
> > > > > > >
> > > > > > > 1) A single call to the underlying filesystem through nfsd_readv() means
> > > > > > >   the file can't change from underneath us in the middle of encoding.
> > > > >
> > > > > Hi Anna,
> > > > >
> > > > > I'm assuming you mean the vfs_llseek(SEEK_HOLE) call at the start
> > > > > of nfsd4_encode_read_plus_data() that is used to trim the data that
> > > > > has already been read out of the file?
> > > >
> > > > There is also the vfs_llseek(SEEK_DATA) call at the start of
> > > > nfsd4_encode_read_plus_hole(). They are used to determine the length
> > > > of the current hole or data segment.
> > > >
> > > > >
> > > > > What's the problem with racing with a hole punch here? All it does
> > > > > is shorten the read data returned to match the new hole, so all it's
> > > > > doing is making the returned data "more correct".
> > > >
> > > > The problem is we call vfs_llseek() potentially many times when
> > > > encoding a single reply to READ_PLUS. nfsd4_encode_read_plus() has a
> > > > loop where we alternate between hole and data segments until we've
> > > > encoded the requested number of bytes. My attempts at locking the file
> > > > have resulted in a deadlock since vfs_llseek() also locks the file, so
> > > > the file could change from underneath us during each iteration of the
> > > > loop.
> > > >
> > > > >
> > > > > OTOH, if something allocates over a hole that the read filled with
> > > > > zeros, what's the problem with occasionally returning zeros as data?
> > > > > Regardless, if this has raced with a write to the file that filled
> > > > > that hole, we're already returning stale data/hole information to
> > > > > the client regardless of whether we trim it or not....
> > > > >
> > > > > i.e. I can't see a correctness or data integrity problem here that
> > > > > doesn't already exist, and I have my doubts that hole
> > > > > punching/filling racing with reads happens often enough to create a
> > > > > performance or bandwidth problem OTW. Hence I've really got no idea
> > > > > what the problem that needs to be solved here is.
> > > > >
> > > > > Can you explain what the symptoms of the problem a user would see
> > > > > that this change solves?
> > > >
> > > > This fixes xfstests generic/091 and generic/263, along with this
> > > > reported bug: https://bugzilla.kernel.org/show_bug.cgi?id=215673
> > > > >
> > > > > > > 2) A single call to the underlying filestem also means that the
> > > > > > >   underlying filesystem only needs to synchronize cached and on-disk
> > > > > > >   data one time instead of potentially many speeding up the reply.
> > > > >
> > > > > SEEK_HOLE/DATA doesn't require cached data to be sync'd to disk to
> > > > > be coherent - that's only a problem FIEMAP has (and syncing cached
> > > > > data doesn't fix the TOCTOU coherency issue!).  i.e. SEEK_HOLE/DATA
> > > > > will check the page cache for data if appropriate (e.g. unwritten
> > > > > disk extents may have data in memory over the top of them) instead
> > > > > of syncing data to disk.
> > > >
> > > > For some reason, btrfs on virtual hardware has terrible performance
> > > > numbers when using vfs_llseek() with files that are already in the
> > > > server's cache. I think it had something to do with how they lock
> > > > extents, and some extra work that needs to be done if the file already
> > > > exists in the server's memory but it's been  a few years since I've
> > > > gone into their code to figure out where the slowdown is coming from.
> > > > See this section of my performance results wiki page:
> > > > https://wiki.linux-nfs.org/wiki/index.php/Read_Plus_May_2022#BTRFS_3
> > > >
> > >
> > > I just did this locally, once in my test vm's and once on my continuous
> > > performance testing hardware and I'm not able to reproduce the numbers, so I
> > > think I'm doing something wrong.
> > >
> > > My test is stupid, I just dd a 5gib file, come behind it and punch holes every
> > > other 4k.  Then I umount and remount, SEEK_DATA+SEEK_HOLE through the whole
> > > file, and then do it again so I have uncached and cached.  The numbers I'm
> > > seeing are equivalent to ext4/xfs.  Granted on my VM I had to redo the test
> > > because I had lockdep and other debugging on which makes us look stupid because
> > > of the extent locking stuff, but with it off everything appears normal.
> > >
> > > Does this more or less mirror your testing?  Looking at our code we're not doing
> > > anything inherently stupid, so I'm not entirely sure what could be the problem.
> > > Thanks,
> >
> > I think that's pretty close to what the server is doing with the
> > current code, except the NFS server would also do a read for every
> > data segment it found. I've been using `vmtouch` to make sure the file
> > doesn't get evicted from the server's page cache for my cached data.
> >
>
> I messed with it some more and I can't get it to be slow.  If you trip over
> something like this again just give a shout and I'd be happy to dig in.  Thanks,

Interesting. The only other suggestion I can think of is to try it
over NFS; there could be some interaction with how the server is
opening the files.

Anna

>
> Josef
Chuck Lever III July 22, 2022, 3:09 p.m. UTC | #20
> On Jul 21, 2022, at 8:44 PM, Dave Chinner <david@fromorbit.com> wrote:
> 
> On Wed, Jul 20, 2022 at 04:18:36AM +0000, Chuck Lever III wrote:
>>> On Jul 19, 2022, at 10:36 PM, Dave Chinner <david@fromorbit.com> wrote:
>>> On Wed, Jul 20, 2022 at 01:26:13AM +0000, Chuck Lever III wrote:
>>>>> On Jul 19, 2022, at 6:44 PM, Dave Chinner <david@fromorbit.com> wrote:
>>>>> IOWs, it seems to me that what READ_PLUS really wants is a "sparse
>>>>> read operation" from the filesystem rather than the current "read
>>>>> that fills holes with zeroes". i.e. a read operation that sets an
>>>>> iocb flag like RWF_SPARSE_READ to tell the filesystem to trim the
>>>>> read to just the ranges that contain data.
> .....
>>>> Now how does the server make that choice? Is there an attribute
>>>> bit that indicates when a file should be treated as sparse? Can
>>>> we assume that immutable files (or compressed files) should
>>>> always be treated as sparse? Alternately, the server might use
>>>> the file's data : hole ratio.
>>> 
>>> None of the above. The NFS server has no business knowing intimate
>>> details about how the filesystem has laid out the file. All it cares
>>> about ranges containing data and ranges that have no data (holes).
>> 
>> That would be the case if we want nothing more than impermeable
>> software layering. That's nice for software developers, but
>> maybe not of much benefit to average users.
>> 
>> I see no efficiency benefit, for example, if a 10MB object file
>> has 512 bytes of zeroes at a convenient offset and the server
>> returns that as DATA/HOLE/DATA. The amount of extra work it has
>> to do to make that happen results in the same latencies as
>> transmitting 512 extra bytes on GbE. It might be even worse on
>> faster network fabrics.
> 
> Welcome to the world of sparse file IO optimisation - NFS may be new
> to this, but local filesystems have been optimising this sort of
> thing for decades. Don't think we don't know about it - seek time
> between 2 discontiguous extents is *much worse* than the few extra
> microseconds that DMAing a few extra sectors worth of data in a
> single IO. Even on SSDs, there's a noticable latency difference
> between a single 16KB IO and two separate 4kB IOs to get the same
> data.
> 
> As it is, we worry about filesystem block granularity, not sectors.
> A filesystem would only report a 512 byte range as a hole if it had
> a 512 byte block size. Almost no-one uses such small block sizes -
> 4kB is the default for ext4 and XFS - so the minimum hole size is
> generally 4KB.
> 
> Indeed, in this case XFS will not actually have a hole in the file
> layout - the underlying extent would have been allocated as a single
> contiguous unwritten extent, then when the data gets written it will
> convert the two ranges to written, resulting in a physically
> contiguous "data-unwritten-data" set of extents. However,
> SEEK_HOLE/DATA will see that unwritten extent as a hole, so you'll
> get that reported via seek hole/data or sparse reads regardless of
> whether it is optimal for READ_PLUS encoding.
> 
> However, the physical layout is optimal for XFS - if the hole gets
> filled by a future write, it ends up converting the file layout to a
> single contiguous data extent that can be read with a single IO
> rather than three physically discontigous data extents that require
> 3 physical IOs to read.
> 
> ext4 optimises this in a different way - it will allocate small
> holes similar to the way XFS does, but it will also fill the with
> real zeros rather than leaving them unwritten. As a result, ext4
> will have a single physically contiguous extent on disk too, but it
> will -always- report as data even though the data written by the
> application is sparse and contains a run of zeroes in it.
> 
> IOWs, ext4 and XFS will behave differently for the simple case you
> gave because they've optimised small holes in sparse file data
> differently. Both have their pros and cons, but it also means that
> the READ_PLUS response for the same file data written the same way
> can be different because the underlying filesystem on the server is
> different.
> 
> IOWs, the NFS server cannot rely on a specific behaviour w.r.t.
> holes and data from the underlying filesystem. Sparseness of a file
> is determined by how the underlying filesystem wants to optimise the
> physical layout or the data and IO patterns that data access results
> in. The NFS server really cannot influence that, or change the
> "sparseness" it reports to the client except by examining the
> data it reads from the filesystem....

What I'm proposing is that the NFS server needs to pick and
choose whether to internally use sparse reads or classic
reads of the underlying file when satisfying a READ_PLUS
request.

I am not proposing that the server should try to change the
extent layout used by the filesystem.


>> On fast networks, the less the server's host CPU has to be
>> involved in doing the READ, the better it scales. It's
>> better to set up the I/O and step out of the way; use zero
>> touch as much as possible.
> 
> However, as you demonstrate for the NFS client below, something has
> to fill the holes. i.e. if we aren't doing sparse reads on the NFS
> server, then the filesystem underneath the NFS server has to spend
> the CPU to instantiate pages over the hole in the file in the page
> cache, then zero them before it can pass them to the "zero touch"
> NFS send path to be encoded into the READ reponse.
> 
> IOWs, sending holes as zeroed data from the server is most
> definitely not zero touch even if NFS doesn't touch the data.
> 
> Hence the "sparse read" from the underlying filesystem, which avoids
> the need to populate and zero pages in the server page cache
> altogether, as well as enabling NFS to use it's zero-touch
> encode/send path for the data that is returned. And with a sparse
> read operation, the encoding of hole ranges in READ_PLUS has almost
> zero CPU overhead because the calculation is dead simple (i.e. hole
> start = end of last data, hole len = start of next data - end of
> last data).
> 
> IOWs, the lowest server CPU and memory READ processing overhead
> comes from using READ_PLUS with sparse reads from the filesystem and
> zero-touch READ_PLUS data range encoding.

Agree on zero-touch: I would like to enable the use of
splice read for CONTENT_DATA wherever it can be supported.

Sparse reads for large sparse files will avoid hole
instantiation, and that's a good thing as well.

For small to average size files, I'm not as clear. If the
server does not instantiate a hole, then the clients end
up filling that hole themselves whenever re-reading the
file. That seems like an unreasonable expense for them.

The cost of hole instantiation is low and amortized over
the life of the file, and after instantiated, zero touch
can be used to satisfy subsequent READ_PLUS requests. On
balance that seems like a better approach.


>> Likewise on the client: it might receive a CONTENT_HOLE, but
>> then its CPU has to zero out that range, with all the memory
>> and cache activity that entails. For small holes, that's going
>> to be a lot of memset(0). If the server returns holes only
>> when they are large, then the client can use more efficient
>> techniques (like marking page cache pages or using ZERO_PAGE).
> 
> Yes, if we have sparse files we have to take that page cache
> instantiation penalty *somewhere* in the IO stack between the client
> and the server.
> 
> Indeed, we actually want this overhead in the NFS client, because we
> have many more NFS clients than we do NFS servers and hence on
> aggregate there is way more CPU and memory to dedicate to
> instantiating zeroed data on the client side than on the server
> side.
> 
> IOWs, if we ship zeroes via RDMA to the NFS client so the NFS client
> doesn't need to instantiate holes in the page cache, then we're
> actually forcing the server to perform hole instantiation. Forcing
> the server to instantiate all the holes for all the files that all
> the clients it is serving is prohibitive from a server CPU and
> memory POV,

I would hope not. Server-side hole instantiation is what we
already have today with classic NFS READ, right?

Anyway, the value proposition of RDMA storage protocols is
that the network fabric handles data transfer on behalf of
both the client and server host CPU -- that way, more CPU
capacity is available on the client for applications running
there.

NFS servers are typically overprovisioned with CPU because
serving files is not CPU intensive work. If anything, I would
be concerned that hole instantiation would increase the amount
of I/O (as a form of write amplification). But again, NFS
server hardware is usually designed for I/O capacity, and
IIUC the server already does hole instantiation now.

But, it's academic. READ_PLUS is not as efficient on NFS/RDMA
as READ because with NFS/RDMA the client has to register
memory for the server to write the payload in. However, the
client cannot know in advance what the payload looks like
in terms of DATA and HOLEs. So its only choice is to set up
a large buffer and parse through it when it gets the reply.
That buffer can be filled via an RDMA Write, of course, but
the new expense for the client is parsing the returned
segment list. Even if it's just one segment, the client will
have to copy the payload into place.

For a classic READ over NFS/RDMA, that buffer is carved out
of the local page cache and the server can place the payload
directly into that cache. The client NFS protocol stack never
touches it. This enables 2-3x higher throughput for READs. We
see a benefit even with software-implemented RDMA (siw).

So we would need to come up with some way of helping each
NFS/RDMA client to set up receive buffers that will be able
to get filled via direct data placement no matter how the
server wants to return the payload. Perhaps some new protocol
would be needed. Not impossible, but I don't think NFSv4.2
READ_PLUS can do this.


> even if you have the both the server network bandwidth
> and the cross-sectional network bandwidth available to ship all
> those zeros to all the clients.
> 
>> On networks with large bandwidth-latency products, however,
>> it makes sense to trade as much server and client CPU and
>> memory for transferred bytes as you can.
> 
> Server, yes, client not so much. If you focus purely on single
> client<->server throughput, the server is not going to scale when
> hundreds or thousands of clients all demand data at the same time.
> 
> But, regardless of this, the fact is that the NFS server is a
> consumer of the sparseness data stored in the underlying filesystem.
> Unless it wants to touch every byte that is read, it can only export
> the HOLE/DATA information that the filesystem can provide it with.
> 
> What we want is a method that allows the filesystem to provide that
> information to the NFS server READ_PLUS operation as efficiently as
> possible. AFAICT, that's a sparse read operation....

I'm already sold on that. I don't want NFSD touching every
byte in every READ payload, even on TCP networks. And
definitely, for reads of large sparse files, the proposed
internal sparse read operation sounds great.


>> The mechanism that handles sparse files needs to be distinct
>> from the policy of when to return more than a single
>> CONTENT_DATA segment, since one of our goals is to ensure
>> that READ_PLUS performs and scales as well as READ on common
>> workloads (ie, not HPC / large sparse file workloads).
> 
> Yes, so make the server operation as efficient as possible whilst
> still being correct (i.e. sparse reads), and push the the CPU and
> memory requirements for instantiating and storing zeroes out
> to the NFS client.

Again, I believe that's a policy choice. It really depends on
the CPU available on both ends and the throughput capacity
of the network fabric (which always depends on instantaneous
traffic levels).


> If the NFS client page cache instantiation can't
> keep up with the server sending it encoded ranges of zeros, then
> this isn't a server side issue; the client side sparse file support
> needs fixing/optimising.

Once optimized, I don't think it will be a "can't keep up"
issue but rather a "how much CPU and memory bandwidth is being
stolen from applications" issue.


>>> If the filesystem can support a sparse read, it returns sparse
>>> ranges containing data to the NFS server. If the filesystem can't
>>> support it, or it's internal file layout doesn't allow for efficient
>>> hole/data discrimination, then it can just return the entire read
>>> range.
>>> 
>>> Alternatively, in this latter case, the filesystem could call a
>>> generic "sparse read" implementation that runs memchr_inv() to find
>>> the first data range to return. Then the NFS server doesn't have to
>>> code things differently, filesystems don't need to advertise
>>> support for sparse reads, etc because every filesystem could
>>> support sparse reads.
>>> 
>>> The only difference is that some filesystems will be much more
>>> efficient and faster at it than others. We already see that sort
>>> of thing with btrfs and seek hole/data on large cached files so I
>>> don't see "filesystems perform differently" as a problem here...
>> 
>> The problem with that approach is that will result in
>> performance regressions on NFSv4.2 with exports that reside
>> on underperforming filesystem types. We need READ_PLUS to
>> perform as well as READ so there is no regression between
>> NFSv4.2 without and with READ_PLUS, and no regression when
>> migrating from NFSv4.1 to NFSv4.2 with READ_PLUS enabled.
> 
> Sure, but fear of regressions is not a valid reason for preventing
> change from being made.

I don't believe I'm stating a fear, but rather I'm stating
a hard requirement for our final READ_PLUS implementation.

In any event, a good way to deal with fear of any kind is
to do some reality testing. I think we have enough consensus
to build a prototype, if Anna is willing, and then assess
its performance.

There are two mechanisms:

1 The sparse internal read mechanism you proposed

2 The classic splice/readv mechanism that READ uses


There are three usage scenarios:

A Large sparse files that reside on a filesystem that supports
   storing sparse files

B Small to average size files that may or may not be sparse,
  residing on a filesystem that supports storing sparse files

C Files that reside on a filesystem that does not support
  storing files sparsely (or, IIUC, does not support iomap)


I think we agree that scenario A should use mechanism 1 and
scenario C should use mechanism 2. The question is whether
mechanism 1 will perform as well as 2 for scenario B. If
yes, then READ_PLUS can use mechanism 1 to handle scenarios
A and B, and 2 to handle C. If no, then READ_PLUS can use
mechanism 1 to handle scenario A and 2 to handle scenarios
B and C.
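
One way to express that choice in a prototype might be a small policy
helper along these lines (purely a sketch; the names and the size
cutoff are invented, and whether scenario B takes the sparse path is
exactly what the evaluation should decide):

	static bool read_plus_use_sparse_read(struct inode *inode,
					      bool fs_supports_sparse_read)
	{
		/* scenario C: no sparse read support, use classic splice/readv */
		if (!fs_supports_sparse_read)
			return false;

		/* scenario A vs B: placeholder cutoff, to be settled by the
		 * performance evaluation */
		return i_size_read(inode) >= LARGE_SPARSE_FILE_CUTOFF;
	}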

Both mechanisms need to be implemented no matter what, so it
looks to me like very little of the prototype code would be
wasted if such a change in direction becomes necessary once
performance evaluation is complete.


--
Chuck Lever
Anna Schumaker Aug. 18, 2022, 6:31 p.m. UTC | #21
On Fri, Jul 22, 2022 at 11:09 AM Chuck Lever III <chuck.lever@oracle.com> wrote:
>
>
>
> > On Jul 21, 2022, at 8:44 PM, Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Wed, Jul 20, 2022 at 04:18:36AM +0000, Chuck Lever III wrote:
> >>> On Jul 19, 2022, at 10:36 PM, Dave Chinner <david@fromorbit.com> wrote:
> >>> On Wed, Jul 20, 2022 at 01:26:13AM +0000, Chuck Lever III wrote:
> >>>>> On Jul 19, 2022, at 6:44 PM, Dave Chinner <david@fromorbit.com> wrote:
> >>>>> IOWs, it seems to me that what READ_PLUS really wants is a "sparse
> >>>>> read operation" from the filesystem rather than the current "read
> >>>>> that fills holes with zeroes". i.e. a read operation that sets an
> >>>>> iocb flag like RWF_SPARSE_READ to tell the filesystem to trim the
> >>>>> read to just the ranges that contain data.
> > .....
> >>>> Now how does the server make that choice? Is there an attribute
> >>>> bit that indicates when a file should be treated as sparse? Can
> >>>> we assume that immutable files (or compressed files) should
> >>>> always be treated as sparse? Alternately, the server might use
> >>>> the file's data : hole ratio.
> >>>
> >>> None of the above. The NFS server has no business knowing intimate
> >>> details about how the filesystem has laid out the file. All it cares
> >>> about ranges containing data and ranges that have no data (holes).
> >>
> >> That would be the case if we want nothing more than impermeable
> >> software layering. That's nice for software developers, but
> >> maybe not of much benefit to average users.
> >>
> >> I see no efficiency benefit, for example, if a 10MB object file
> >> has 512 bytes of zeroes at a convenient offset and the server
> >> returns that as DATA/HOLE/DATA. The amount of extra work it has
> >> to do to make that happen results in the same latencies as
> >> transmitting 512 extra bytes on GbE. It might be even worse on
> >> faster network fabrics.
> >
> > Welcome to the world of sparse file IO optimisation - NFS may be new
> > to this, but local filesystems have been optimising this sort of
> > thing for decades. Don't think we don't know about it - seek time
> > between 2 discontiguous extents is *much worse* than the few extra
> > microseconds that DMAing a few extra sectors worth of data in a
> > single IO. Even on SSDs, there's a noticable latency difference
> > between a single 16KB IO and two separate 4kB IOs to get the same
> > data.
> >
> > As it is, we worry about filesystem block granularity, not sectors.
> > A filesystem would only report a 512 byte range as a hole if it had
> > a 512 byte block size. Almost no-one uses such small block sizes -
> > 4kB is the default for ext4 and XFS - so the minimum hole size is
> > generally 4KB.
> >
> > Indeed, in this case XFS will not actually have a hole in the file
> > layout - the underlying extent would have been allocated as a single
> > contiguous unwritten extent, then when the data gets written it will
> > convert the two ranges to written, resulting in a physically
> > contiguous "data-unwritten-data" set of extents. However,
> > SEEK_HOLE/DATA will see that unwritten extent as a hole, so you'll
> > get that reported via seek hole/data or sparse reads regardless of
> > whether it is optimal for READ_PLUS encoding.
> >
> > However, the physical layout is optimal for XFS - if the hole gets
> > filled by a future write, it ends up converting the file layout to a
> > single contiguous data extent that can be read with a single IO
> > rather than three physically discontigous data extents that require
> > 3 physical IOs to read.
> >
> > ext4 optimises this in a different way - it will allocate small
> > holes similar to the way XFS does, but it will also fill the with
> > real zeros rather than leaving them unwritten. As a result, ext4
> > will have a single physically contiguous extent on disk too, but it
> > will -always- report as data even though the data written by the
> > application is sparse and contains a run of zeroes in it.
> >
> > IOWs, ext4 and XFS will behave differently for the simple case you
> > gave because they've optimised small holes in sparse file data
> > differently. Both have their pros and cons, but it also means that
> > the READ_PLUS response for the same file data written the same way
> > can be different because the underlying filesystem on the server is
> > different.
> >
> > IOWs, the NFS server cannot rely on a specific behaviour w.r.t.
> > holes and data from the underlying filesystem. Sparseness of a file
> > is determined by how the underlying filesystem wants to optimise the
> > physical layout or the data and IO patterns that data access results
> > in. The NFS server really cannot influence that, or change the
> > "sparseness" it reports to the client except by examining the
> > data it reads from the filesystem....
>
> What I'm proposing is that the NFS server needs to pick and
> choose whether to internally use sparse reads or classic
> reads of the underlying file when satisfying a READ_PLUS
> request.
>
> I am not proposing that the server should try to change the
> extent layout used by the filesystem.
>
>
> >> On fast networks, the less the server's host CPU has to be
> >> involved in doing the READ, the better it scales. It's
> >> better to set up the I/O and step out of the way; use zero
> >> touch as much as possible.
> >
> > However, as you demonstrate for the NFS client below, something has
> > to fill the holes. i.e. if we aren't doing sparse reads on the NFS
> > server, then the filesystem underneath the NFS server has to spend
> > the CPU to instantiate pages over the hole in the file in the page
> > cache, then zero them before it can pass them to the "zero touch"
> > NFS send path to be encoded into the READ reponse.
> >
> > IOWs, sending holes as zeroed data from the server is most
> > definitely not zero touch even if NFS doesn't touch the data.
> >
> > Hence the "sparse read" from the underlying filesystem, which avoids
> > the need to populate and zero pages in the server page cache
> > altogether, as well as enabling NFS to use it's zero-touch
> > encode/send path for the data that is returned. And with a sparse
> > read operation, the encoding of hole ranges in READ_PLUS has almost
> > zero CPU overhead because the calculation is dead simple (i.e. hole
> > start = end of last data, hole len = start of next data - end of
> > last data).
> >
> > IOWs, the lowest server CPU and memory READ processing overhead
> > comes from using READ_PLUS with sparse reads from the filesystem and
> > zero-touch READ_PLUS data range encoding.
>
> Agree on zero-touch: I would like to enable the use of
> splice read for CONTENT_DATA wherever it can be supported.
>
> Sparse reads for large sparse files will avoid hole
> instantiation, and that's a good thing as well.
>
> For small to average size files, I'm not as clear. If the
> server does not instantiate a hole, then the clients end
> up filling that hole themselves whenever re-reading the
> file. That seems like an unreasonable expense for them.
>
> The cost of hole instantiation is low and amortized over
> the life of the file, and after instantiated, zero touch
> can be used to satisfy subsequent READ_PLUS requests. On
> balance that seems like a better approach.
>
>
> >> Likewise on the client: it might receive a CONTENT_HOLE, but
> >> then its CPU has to zero out that range, with all the memory
> >> and cache activity that entails. For small holes, that's going
> >> to be a lot of memset(0). If the server returns holes only
> >> when they are large, then the client can use more efficient
> >> techniques (like marking page cache pages or using ZERO_PAGE).
> >
> > Yes, if we have sparse files we have to take that page cache
> > instantiation penalty *somewhere* in the IO stack between the client
> > and the server.
> >
> > Indeed, we actually want this overhead in the NFS client, because we
> > have many more NFS clients than we do NFS servers and hence on
> > aggregate there is way more CPU and memory to dedicate to
> > instantiating zeroed data on the client side than on the server
> > side.
> >
> > IOWs, if we ship zeroes via RDMA to the NFS client so the NFS client
> > doesn't need to instantiate holes in the page cache, then we're
> > actually forcing the server to perform hole instantiation. Forcing
> > the server to instantiate all the holes for all the files that all
> > the clients it is serving is prohibitive from a server CPU and
> > memory POV,
>
> I would hope not. Server-side hole instantiation is what we
> already have today with classic NFS READ, right?
>
> Anyway, the value proposition of RDMA storage protocols is
> that the network fabric handles data transfer on behalf of
> both the client and server host CPU -- that way, more CPU
> capacity is available on the client for applications running
> there.
>
> NFS servers are typically overprovisioned with CPU because
> serving files is not CPU intensive work. If anything, I would
> be concerned that hole instantiation would increase the amount
> of I/O (as a form of write amplification). But again, NFS
> server hardware is usually designed for I/O capacity, and
> IIUC the server already does hole instantiation now.
>
> But, it's academic. READ_PLUS is not efficient on NFS/RDMA
> as READ because with NFS/RDMA the client has to register
> memory for the server to write the payload in. However, the
> client cannot know in advance what the payload looks like
> in terms of DATA and HOLEs. So it's only choice is to set up
> a large buffer and parse through it when it gets the reply.
> That buffer can be filled via an RDMA Write, of course, but
> the new expense for the client is parsing the returned
> segment list. Even if it's just one segment, the client will
> have to copy the payload into place.
>
> For a classic READ over NFS/RDMA, that buffer is carved out
> of the local page cache and the server can place the payload
> directly into that cache. The client NFS protocol stack never
> touches it. This enables 2-3x higher throughput for READs. We
> see a benefit even with software-implemented RDMA (siw).
>
> So we would need to come up with some way of helping each
> NFS/RDMA client to set up receive buffers that will be able
> to get filled via direct data placement no matter how the
> server wants to return the payload. Perhaps some new protocol
> would be needed. Not impossible, but I don't think NFSv4.2
> READ_PLUS can do this.
>
>
> > even if you have the both the server network bandwidth
> > and the cross-sectional network bandwidth available to ship all
> > those zeros to all the clients.
> >
> >> On networks with large bandwidth-latency products, however,
> >> it makes sense to trade as much server and client CPU and
> >> memory for transferred bytes as you can.
> >
> > Server, yes, client not so much. If you focus purely on single
> > client<->server throughput, the server is not going to scale when
> > hundreds or thousands of clients all demand data at the same time.
> >
> > But, regardless of this, the fact is that the NFS server is a
> > consumer of the sparseness data stored in the underlying filesystem.
> > Unless it wants to touch every byte that is read, it can only export
> > the HOLE/DATA information that the filesystem can provide it with.
> >
> > What we want is a method that allows the filesystem to provide that
> > information to the NFS server READ_PLUS operation as efficiently as
> > possible. AFAICT, that's a sparse read operation....
>
> I'm already sold on that. I don't want NFSD touching every
> byte in every READ payload, even on TCP networks. And
> definitely, for reads of large sparse files, the proposed
> internal sparse read operation sounds great.
>
>
> >> The mechanism that handles sparse files needs to be distinct
> >> from the policy of when to return more than a single
> >> CONTENT_DATA segment, since one of our goals is to ensure
> >> that READ_PLUS performs and scales as well as READ on common
> >> workloads (ie, not HPC / large sparse file workloads).
> >
> > Yes, so make the server operation as efficient as possible whilst
> > still being correct (i.e. sparse reads), and push the CPU and
> > memory requirements for instantiating and storing zeroes out
> > to the NFS client.
>
> Again, I believe that's a policy choice. It really depends on
> the CPU available on both ends and the throughput capacity
> of the network fabric (which always depends on instantaneous
> traffic levels).
>
>
> > If the NFS client page cache instantiation can't
> > keep up with the server sending it encoded ranges of zeros, then
> > this isn't a server side issue; the client side sparse file support
> > needs fixing/optimising.
>
> Once optimized, I don't think it will be a "can't keep up"
> issue but rather a "how much CPU and memory bandwidth is being
> stolen from applications" issue.
>
>
> >>> If the filesystem can support a sparse read, it returns sparse
> >>> ranges containing data to the NFS server. If the filesystem can't
> >>> support it, or its internal file layout doesn't allow for efficient
> >>> hole/data discrimination, then it can just return the entire read
> >>> range.
> >>>
> >>> Alternatively, in this latter case, the filesystem could call a
> >>> generic "sparse read" implementation that runs memchr_inv() to find
> >>> the first data range to return. Then the NFS server doesn't have to
> >>> code things differently, filesystems don't need to advertise
> >>> support for sparse reads, etc because every filesystem could
> >>> support sparse reads.
> >>>
> >>> The only difference is that some filesystems will be much more
> >>> efficient and faster at it than others. We already see that sort
> >>> of thing with btrfs and seek hole/data on large cached files so I
> >>> don't see "filesystems perform differently" as a problem here...
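(As an aside, the generic memchr_inv() fallback described above boils down
to a per-block scan like the minimal sketch below. The helper name is made
up purely for illustration; memchr_inv() returning NULL is what identifies
an all-zero range, and it is the same test the patch at the bottom of this
page applies per page.)

static enum data_content4 read_plus_classify(const void *buf, size_t len)
{
	/* memchr_inv() returns NULL when every byte in the range is 0 */
	return memchr_inv(buf, 0, len) ? NFS4_CONTENT_DATA
				       : NFS4_CONTENT_HOLE;
}
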
> >>
> >> The problem with that approach is that will result in
> >> performance regressions on NFSv4.2 with exports that reside
> >> on underperforming filesystem types. We need READ_PLUS to
> >> perform as well as READ so there is no regression between
> >> NFSv4.2 without and with READ_PLUS, and no regression when
> >> migrating from NFSv4.1 to NFSv4.2 with READ_PLUS enabled.
> >
> > Sure, but fear of regressions is not a valid reason for preventing
> > change from being made.
>
> I don't believe I'm stating a fear, but rather I'm stating
> a hard requirement for our final READ_PLUS implementation.
>
> In any event, a good way to deal with fear of any kind is
> to do some reality testing. I think we have enough consensus
> to build a prototype, if Anna is willing, and then assess
> its performance.
>
> There are two mechanisms:
>
> 1 The sparse internal read mechanism you proposed

Assuming nobody else has started working on the sparse-read function,
I would like to check my understanding of what it would look like:

- The function splice_direct_to_actor() takes a "struct splice_desc"
as an argument. The sparse read function would take something similar,
a "struct sparse_desc", which has callbacks used by the underlying
filesystem to tell the server to encode either a hole or data segment
next.

In the default case, the VFS tells the server to encode the entire
read range as data. If underlying filesystems opt-in, then whenever a
data extent is found the encode_data_segment() function is called and
whenever a hole is found it calls the encode_hole_segment() function.

The NFS server would need to know file offset and length of each
segment, so these would probably be arguments to both functions. How
would reading data work, though? The server needs to reserve space in
the xdr stream for the offset & length metadata, but also the data
itself. I'm assuming it would do something similar to
fs/nfsd/vfs.c:nfsd_readv() and set up an iov_iter or kvec that the
underlying filesystem can use to place file data?

Does that sound about right? Am I missing anything?
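
To make that concrete, here is a rough sketch of the descriptor I'm
picturing, loosely modeled on struct splice_desc. Every name in it
(sparse_desc, the two encode callbacks, sd_private) is hypothetical and
only meant to illustrate the callback contract described above:

struct sparse_desc {
	/* called for each hole the filesystem finds in the read range */
	int (*encode_hole_segment)(struct sparse_desc *sd,
				   loff_t offset, u64 length);
	/* called for each data extent; @iter is where the bytes land */
	int (*encode_data_segment)(struct sparse_desc *sd,
				   loff_t offset, u64 length,
				   struct iov_iter *iter);
	void *sd_private;	/* e.g. the nfsd4_read and xdr_stream */
};

A filesystem that doesn't opt in would simply see encode_data_segment()
called once for the entire requested range.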

I can definitely do the NFS server portion of this, and probably the
vfs-level sparse read function as well. I'll need help with the
underlying filesystem piece though, if anybody has a few spare cycles
once I cobble together a vfs function.

Thanks,
Anna

>
> 2 The classic splice/readv mechanism that READ uses
>
>
> There are three usage scenarios:
>
> A Large sparse files that reside on a filesystem that supports
>   storing sparse files
>
> B Small to average size files that may or may not be sparse,
>   residing on a filesystem that supports storing sparse files
>
> C Files that reside on a filesystem that does not support
>   storing files sparsely (or, IIUC, does not support iomap)
>
>
> I think we agree that scenario A should use mechanism 1 and
> scenario C should use mechanism 2. The question is whether
> mechanism 1 will perform as well as 2 for scenario B. If
> yes, then READ_PLUS can use mechanism 1 to handle scenarios
> A and B, and 2 to handle C. If no, then READ_PLUS can use
> mechanism 1 to handle scenario A and 2 to handle scenarios
> B and C.
>
> Both mechanisms need to be implemented no matter what, so it
> looks to me like very little of the prototype code would be
> wasted if such a change in direction becomes necessary once
> performance evaluation is complete.
>
>
> --
> Chuck Lever
>
>
>
Chuck Lever III Aug. 19, 2022, 3:18 p.m. UTC | #22
> On Aug 18, 2022, at 2:31 PM, Anna Schumaker <anna@kernel.org> wrote:
> 
> Assuming nobody else has started working on the sparse-read function,
> I would like to check my understanding of what it would look like:
> 
> - The function splice_direct_to_actor() takes a "struct splice_desc"
> as an argument. The sparse read function would take something similar,
> a "struct sparse_desc", which has callbacks used by the underlying
> filesystem to tell the server to encode either a hole or data segment
> next.
> 
> In the default case, the VFS tells the server to encode the entire
> read range as data. If underlying filesystems opt-in, then whenever a
> data extent is found the encode_data_segment() function is called and
> whenever a hole is found it calls the encode_hole_segment() function.
> 
> The NFS server would need to know file offset and length of each
> segment, so these would probably be arguments to both functions. How
> would reading data work, though? The server needs to reserve space in
> the xdr stream for the offset & length metadata, but also the data
> itself. I'm assuming it would do something similar to
> fs/nfsd/vfs.c:nfsd_readv() and set up an iov_iter or kvec that the
> underlying filesystem can use to place file data?
> 
> Does that sound about right? Am I missing anything?

I was expecting something more like what Dave originally described:

>> IOWs, it seems to me that what READ_PLUS really wants is a "sparse
>> read operation" from the filesystem rather than the current "read
>> that fills holes with zeroes". i.e. a read operation that sets an
>> iocb flag like RWF_SPARSE_READ to tell the filesystem to trim the
>> read to just the ranges that contain data.
>> 
>> That way the read populates the page cache over a single contiguous
>> range of data and returns with the {offset, len} that spans the
>> range that is read and mapped. The caller can then read that region
>> out of the page cache and mark all the non-data regions as holes in
>> whatever manner they need to.
>> 
>> The iomap infrastructure that XFS and other filesystems use provide
>> this exact "map only what contains data" capability - an iomap tells
>> the page cache exactly what underlies the data range (hole, data,
>> unwritten extents, etc) in an efficient manner, so it wouldn't be a
>> huge stretch just to limit read IO ranges to those that contain only
>> DATA extents.
>> 
>> At this point READ_PLUS then just needs to iterate doing sparse
>> reads and recording the ranges that return data as vector of some
>> kind that is then passes to the encoding function to encode it as
>> a sparse READ_PLUS data range....


But it's not clear how this would be made atomic, so maybe the
callback mechanism you described is necessary.
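
For what it's worth, the iterate-and-record loop Dave sketches above might
look something like this on the NFSD side. RWF_SPARSE_READ does not exist
today, nfsd_sparse_read() is a made-up stand-in for whatever the VFS would
provide, and error handling is omitted, so treat this purely as an
illustration of the iteration pattern:

struct read_plus_range {
	loff_t	offset;
	size_t	len;
};

static int read_plus_collect(struct file *file, loff_t offset, size_t count,
			     struct read_plus_range *ranges, int max_ranges)
{
	int nr = 0;

	while (count > 0 && nr < max_ranges) {
		loff_t data_off = offset;
		/* made-up helper: read the next data extent at or after
		 * @data_off (updating it to where the extent starts),
		 * trimmed to @count bytes; returns 0 at EOF. */
		ssize_t got = nfsd_sparse_read(file, &data_off, count);

		if (got <= 0)
			break;
		ranges[nr].offset = data_off;
		ranges[nr].len = got;
		nr++;
		/* everything between @offset and @data_off is a hole the
		 * encoder can infer from the gap between recorded ranges */
		count -= (data_off - offset) + got;
		offset = data_off + got;
	}
	return nr;
}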

To make some progress on moving READ_PLUS out of EXPERIMENTAL
status, can you build a READ_PLUS implementation that always
returns a single CONTENT_DATA segment? We can merge that straight
away, and it should perform on par with READ immediately.

Then the bells and whistles can be added over time.
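
For reference, the wire form of such a reply is just: eof, a segment count
of 1, the CONTENT_DATA type, the 64-bit offset, the byte count, and then
the data with XDR padding. A very rough, untested outline of that encoder,
reusing nfsd4_read_plus_readv() and the other xdr helpers this patch
already uses, and glossing over the offset-beyond-EOF case the real code
would need to handle, might be:

static __be32
nfsd4_encode_read_plus_single(struct nfsd4_compoundres *resp,
			      struct nfsd4_read *read, u32 *eof)
{
	struct xdr_stream *xdr = resp->xdr;
	unsigned int starting_len = xdr->buf->len;
	unsigned long maxcount;
	__be32 nfserr;

	/* eof, segment count, content type, offset (2 words), byte count */
	if (!xdr_reserve_space(xdr, 6 * XDR_UNIT))
		return nfserr_resource;
	xdr_commit_encode(xdr);

	maxcount = min_t(unsigned long, read->rd_length,
			 xdr->buf->buflen - xdr->buf->len);

	/* reads into xdr->pages and pads the data, as in the patch below */
	nfserr = nfsd4_read_plus_readv(resp, read, &maxcount, eof);
	if (nfserr)
		return nfserr;

	xdr_encode_word(xdr->buf, starting_len, *eof);
	xdr_encode_word(xdr->buf, starting_len + XDR_UNIT, 1);
	xdr_encode_word(xdr->buf, starting_len + 2 * XDR_UNIT,
			NFS4_CONTENT_DATA);
	xdr_encode_double(xdr->buf, starting_len + 3 * XDR_UNIT,
			  read->rd_offset);
	xdr_encode_word(xdr->buf, starting_len + 5 * XDR_UNIT, maxcount);
	return nfs_ok;
}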


--
Chuck Lever
diff mbox series

Patch

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 61b2aae81abb..df8289fce4ef 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4731,81 +4731,138 @@  nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
 	return nfserr;
 }
 
+struct read_plus_segment {
+	enum data_content4	rp_type;
+	u64			rp_offset;
+	u64			rp_length;
+	unsigned int		rp_page_pos;
+};
+
 static __be32
-nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
-			    struct nfsd4_read *read,
-			    unsigned long *maxcount, u32 *eof,
-			    loff_t *pos)
+nfsd4_read_plus_readv(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
+		      unsigned long *maxcount, u32 *eof)
 {
 	struct xdr_stream *xdr = resp->xdr;
-	struct file *file = read->rd_nf->nf_file;
-	int starting_len = xdr->buf->len;
-	loff_t hole_pos;
-	__be32 nfserr;
-	__be32 *p, tmp;
-	__be64 tmp64;
-
-	hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
-	if (hole_pos > read->rd_offset)
-		*maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
-	*maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
-
-	/* Content type, offset, byte count */
-	p = xdr_reserve_space(xdr, 4 + 8 + 4);
-	if (!p)
-		return nfserr_resource;
+	unsigned int starting_len = xdr->buf->len;
+	__be32 nfserr, zero = xdr_zero;
+	unsigned int pad;
 
+	/*
+	 * Reserve the maximum amount of space needed to craft a READ_PLUS
+	 * reply. The call to xdr_reserve_space_vec() switches us to the
+	 * xdr->pages, which we then read file data into before analyzing
+	 * the individual segments.
+	 */
 	read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
 	if (read->rd_vlen < 0)
 		return nfserr_resource;
 
-	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
-			    resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
+	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, read->rd_nf->nf_file,
+			    read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
+			    maxcount, eof);
 	if (nfserr)
 		return nfserr;
-	xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
+	xdr_truncate_encode(xdr, starting_len + xdr_align_size(*maxcount));
 
-	tmp = htonl(NFS4_CONTENT_DATA);
-	write_bytes_to_xdr_buf(xdr->buf, starting_len,      &tmp,   4);
-	tmp64 = cpu_to_be64(read->rd_offset);
-	write_bytes_to_xdr_buf(xdr->buf, starting_len + 4,  &tmp64, 8);
-	tmp = htonl(*maxcount);
-	write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp,   4);
-
-	tmp = xdr_zero;
-	write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
-			       xdr_pad_size(*maxcount));
+	pad = xdr_pad_size(*maxcount);
+	write_bytes_to_xdr_buf(xdr->buf, starting_len + *maxcount, &zero, pad);
 	return nfs_ok;
 }
 
+/**
+ * nfsd4_encode_read_plus_segment - Encode a single READ_PLUS segment
+ * @xdr: pointer to an xdr_stream
+ * @segment: pointer to a single segment
+ * @bufpos: xdr_stream offset to place the segment
+ * @segments: pointer to the total number of segments seen
+ *
+ * Performs surgery on the xdr_stream to compress out HOLE segments and
+ * to place DATA segments in the proper place.
+ */
+static void
+nfsd4_encode_read_plus_segment(struct xdr_stream *xdr,
+			       struct read_plus_segment *segment,
+			       unsigned int *bufpos, unsigned int *segments)
+{
+	struct xdr_buf *buf = xdr->buf;
+
+	xdr_encode_word(buf, *bufpos, segment->rp_type);
+	xdr_encode_double(buf, *bufpos + XDR_UNIT, segment->rp_offset);
+	*bufpos += 3 * XDR_UNIT;
+
+	if (segment->rp_type == NFS4_CONTENT_HOLE) {
+		xdr_encode_double(buf, *bufpos, segment->rp_length);
+		*bufpos += 2 * XDR_UNIT;
+	} else {
+		size_t align = xdr_align_size(segment->rp_length);
+		xdr_encode_word(buf, *bufpos, segment->rp_length);
+		if (*segments == 0)
+			xdr_buf_trim_head(buf, XDR_UNIT);
+
+		xdr_stream_move_subsegment(xdr,
+				buf->head[0].iov_len + segment->rp_page_pos,
+				*bufpos + XDR_UNIT, align);
+		*bufpos += XDR_UNIT + align;
+	}
+
+	*segments += 1;
+}
+
 static __be32
-nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
-			    struct nfsd4_read *read,
-			    unsigned long *maxcount, u32 *eof)
+nfsd4_encode_read_plus_segments(struct nfsd4_compoundres *resp,
+				struct nfsd4_read *read,
+				unsigned int *segments, u32 *eof)
 {
-	struct file *file = read->rd_nf->nf_file;
-	loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
-	loff_t f_size = i_size_read(file_inode(file));
-	unsigned long count;
-	__be32 *p;
+	struct xdr_stream *xdr = resp->xdr;
+	unsigned int bufpos = xdr->buf->len;
+	u64 offset = read->rd_offset;
+	struct read_plus_segment segment;
+	enum data_content4 pagetype;
+	unsigned long maxcount;
+	unsigned int pagenum = 0;
+	unsigned int pagelen;
+	char *vpage, *p;
+	__be32 nfserr;
 
-	if (data_pos == -ENXIO)
-		data_pos = f_size;
-	else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
-		return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
-	count = data_pos - read->rd_offset;
-
-	/* Content type, offset, byte count */
-	p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
-	if (!p)
+	/* enough space for a HOLE segment before we switch to the pages */
+	if (!xdr_reserve_space(xdr, 5 * XDR_UNIT))
 		return nfserr_resource;
+	xdr_commit_encode(xdr);
 
-	*p++ = htonl(NFS4_CONTENT_HOLE);
-	p = xdr_encode_hyper(p, read->rd_offset);
-	p = xdr_encode_hyper(p, count);
+	maxcount = min_t(unsigned long, read->rd_length,
+			 (xdr->buf->buflen - xdr->buf->len));
 
-	*eof = (read->rd_offset + count) >= f_size;
-	*maxcount = min_t(unsigned long, count, *maxcount);
+	nfserr = nfsd4_read_plus_readv(resp, read, &maxcount, eof);
+	if (nfserr)
+		return nfserr;
+
+	while (maxcount > 0) {
+		vpage = xdr_buf_nth_page_address(xdr->buf, pagenum, &pagelen);
+		pagelen = min_t(unsigned int, pagelen, maxcount);
+		if (!vpage || pagelen == 0)
+			break;
+		p = memchr_inv(vpage, 0, pagelen);
+		pagetype = (p == NULL) ? NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA;
+
+		if (pagetype != segment.rp_type || pagenum == 0) {
+			if (likely(pagenum > 0)) {
+				nfsd4_encode_read_plus_segment(xdr, &segment,
+							      &bufpos, segments);
+				offset += segment.rp_length;
+			}
+			segment.rp_type = pagetype;
+			segment.rp_offset = offset;
+			segment.rp_length = pagelen;
+			segment.rp_page_pos = pagenum * PAGE_SIZE;
+		} else
+			segment.rp_length += pagelen;
+
+		maxcount -= pagelen;
+		pagenum++;
+	}
+
+	nfsd4_encode_read_plus_segment(xdr, &segment, &bufpos, segments);
+	xdr_truncate_encode(xdr, bufpos);
 	return nfs_ok;
 }
 
@@ -4813,69 +4870,31 @@  static __be32
 nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
 		       struct nfsd4_read *read)
 {
-	unsigned long maxcount, count;
 	struct xdr_stream *xdr = resp->xdr;
-	struct file *file;
 	int starting_len = xdr->buf->len;
-	int last_segment = xdr->buf->len;
-	int segments = 0;
-	__be32 *p, tmp;
-	bool is_data;
-	loff_t pos;
+	unsigned int segments = 0;
 	u32 eof;
 
 	if (nfserr)
 		return nfserr;
-	file = read->rd_nf->nf_file;
 
 	/* eof flag, segment count */
-	p = xdr_reserve_space(xdr, 4 + 4);
-	if (!p)
+	if (!xdr_reserve_space(xdr, 2 * XDR_UNIT))
 		return nfserr_resource;
 	xdr_commit_encode(xdr);
 
-	maxcount = min_t(unsigned long, read->rd_length,
-			 (xdr->buf->buflen - xdr->buf->len));
-	count    = maxcount;
-
-	eof = read->rd_offset >= i_size_read(file_inode(file));
+	eof = read->rd_offset >= i_size_read(file_inode(read->rd_nf->nf_file));
 	if (eof)
 		goto out;
 
-	pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
-	is_data = pos > read->rd_offset;
-
-	while (count > 0 && !eof) {
-		maxcount = count;
-		if (is_data)
-			nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
-						segments == 0 ? &pos : NULL);
-		else
-			nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
-		if (nfserr)
-			goto out;
-		count -= maxcount;
-		read->rd_offset += maxcount;
-		is_data = !is_data;
-		last_segment = xdr->buf->len;
-		segments++;
-	}
-
+	nfserr = nfsd4_encode_read_plus_segments(resp, read, &segments, &eof);
 out:
-	if (nfserr && segments == 0)
+	if (nfserr)
 		xdr_truncate_encode(xdr, starting_len);
 	else {
-		if (nfserr) {
-			xdr_truncate_encode(xdr, last_segment);
-			nfserr = nfs_ok;
-			eof = 0;
-		}
-		tmp = htonl(eof);
-		write_bytes_to_xdr_buf(xdr->buf, starting_len,     &tmp, 4);
-		tmp = htonl(segments);
-		write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
+		xdr_encode_word(xdr->buf, starting_len, eof);
+		xdr_encode_word(xdr->buf, starting_len + XDR_UNIT, segments);
 	}
-
 	return nfserr;
 }