Message ID | 20201201213128.13624-1-fllinden@amazon.com (mailing list archive)
---|---
State | New, archived
Series | NFSv4.2: improve page handling for GETXATTR
> On Dec 1, 2020, at 4:31 PM, Frank van der Linden <fllinden@amazon.com> wrote: > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > and it's easy enough to allocate enough pages for the request > up front, so do that. > > Also, since we've allocated the pages anyway, use the full > page aligned length for the receive buffer. This will allow > caching of valid replies that are too large for the caller, > but that still fit in the allocated pages. > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > --- > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > 2 files changed, 47 insertions(+), 14 deletions(-) > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > index 2b2211d1234e..bfe15ac77bd9 100644 > --- a/fs/nfs/nfs42proc.c > +++ b/fs/nfs/nfs42proc.c > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > void *buf, size_t buflen) > { > struct nfs_server *server = NFS_SERVER(inode); > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > + struct page **pages; Taking this array of pointers off the stack is Goodness(tm). Thanks for doing this update! > struct nfs42_getxattrargs arg = { > .fh = NFS_FH(inode), > - .xattr_pages = pages, > - .xattr_len = buflen, > .xattr_name = name, > }; > struct nfs42_getxattrres res; > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > .rpc_argp = &arg, > .rpc_resp = &res, > }; > - int ret, np; > + ssize_t ret, np, i; > + > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > + > + ret = -ENOMEM; > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); > + > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); > + if (pages == NULL) > + return ret; > + > + for (i = 0; i < np; i++) { > + pages[i] = alloc_page(GFP_KERNEL); > + if (!pages[i]) > + goto out_free; > + } > + > + arg.xattr_pages = pages; > > ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, > &res.seq_res, 0); > if (ret < 0) > - return ret; > + goto out_free; > + > + ret = res.xattr_len; > > /* > * Normally, the caching is done one layer up, but for successful > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len); > > if (buflen) { > - if (res.xattr_len > buflen) > - return -ERANGE; > + if (res.xattr_len > buflen) { > + ret = -ERANGE; > + goto out_free; > + } > _copy_from_pages(buf, pages, 0, res.xattr_len); > } > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > +out_free: > while (--np >= 0) > __free_page(pages[np]); > > - return res.xattr_len; > + kfree(pages); > + > + return ret; > } > > static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c > index 6e060a88f98c..8dfe674d1301 100644 > --- a/fs/nfs/nfs42xdr.c > +++ b/fs/nfs/nfs42xdr.c > @@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream *xdr, > return -EIO; > > len = be32_to_cpup(p); > + > + /* > + * Only check against the page length here. The actual > + * requested length may be smaller, but that is only > + * checked against after possibly caching a valid reply. > + */ > if (len > req->rq_rcv_buf.page_len) > return -ERANGE; > > @@ -1483,11 +1489,17 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr, > encode_putfh(xdr, args->fh, &hdr); > encode_getxattr(xdr, args->xattr_name, &hdr); > > - plen = args->xattr_len ? 
args->xattr_len : XATTR_SIZE_MAX; > - > - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > - hdr.replen); > - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; > + /* > + * The GETXATTR op has no length field in the call, and the > + * xattr data is at the end of the reply. > + * > + * Since the pages have already been allocated, there is no > + * downside in using the page-aligned length. It will allow > + * receiving and caching xattrs that are too large for the > + * caller but still fit in the page-rounded value. > + */ > + plen = round_up(args->xattr_len, PAGE_SIZE); > + rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, hdr.replen); > > encode_nops(&hdr); > } > -- > 2.23.3 > -- Chuck Lever
On Tue, Dec 1, 2020 at 4:44 PM Frank van der Linden <fllinden@amazon.com> wrote: > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > and it's easy enough to allocate enough pages for the request > up front, so do that. > > Also, since we've allocated the pages anyway, use the full > page aligned length for the receive buffer. This will allow > caching of valid replies that are too large for the caller, > but that still fit in the allocated pages. > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > --- > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > 2 files changed, 47 insertions(+), 14 deletions(-) > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > index 2b2211d1234e..bfe15ac77bd9 100644 > --- a/fs/nfs/nfs42proc.c > +++ b/fs/nfs/nfs42proc.c > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > void *buf, size_t buflen) > { > struct nfs_server *server = NFS_SERVER(inode); > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > + struct page **pages; > struct nfs42_getxattrargs arg = { > .fh = NFS_FH(inode), > - .xattr_pages = pages, > - .xattr_len = buflen, > .xattr_name = name, > }; > struct nfs42_getxattrres res; > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > .rpc_argp = &arg, > .rpc_resp = &res, > }; > - int ret, np; > + ssize_t ret, np, i; > + > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > + > + ret = -ENOMEM; > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); > + > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); > + if (pages == NULL) > + return ret; > + > + for (i = 0; i < np; i++) { > + pages[i] = alloc_page(GFP_KERNEL); > + if (!pages[i]) > + goto out_free; > + } > + > + arg.xattr_pages = pages; > > ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, > &res.seq_res, 0); > if (ret < 0) > - return ret; > + goto out_free; > + > + ret = res.xattr_len; > > /* > * Normally, the caching is done one layer up, but for successful > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len); > > if (buflen) { > - if (res.xattr_len > buflen) > - return -ERANGE; > + if (res.xattr_len > buflen) { > + ret = -ERANGE; > + goto out_free; > + } > _copy_from_pages(buf, pages, 0, res.xattr_len); > } > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > +out_free: > while (--np >= 0) > __free_page(pages[np]); Looking at Chuck's page for listxattr, I think we need to check if (pages[np) before freeing? Otherwise, looks fine to me. Reviewed-by. > - return res.xattr_len; > + kfree(pages); > + > + return ret; > } > > static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c > index 6e060a88f98c..8dfe674d1301 100644 > --- a/fs/nfs/nfs42xdr.c > +++ b/fs/nfs/nfs42xdr.c > @@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream *xdr, > return -EIO; > > len = be32_to_cpup(p); > + > + /* > + * Only check against the page length here. The actual > + * requested length may be smaller, but that is only > + * checked against after possibly caching a valid reply. 
> + */ > if (len > req->rq_rcv_buf.page_len) > return -ERANGE; > > @@ -1483,11 +1489,17 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr, > encode_putfh(xdr, args->fh, &hdr); > encode_getxattr(xdr, args->xattr_name, &hdr); > > - plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX; > - > - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > - hdr.replen); > - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; > + /* > + * The GETXATTR op has no length field in the call, and the > + * xattr data is at the end of the reply. > + * > + * Since the pages have already been allocated, there is no > + * downside in using the page-aligned length. It will allow > + * receiving and caching xattrs that are too large for the > + * caller but still fit in the page-rounded value. > + */ > + plen = round_up(args->xattr_len, PAGE_SIZE); > + rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, hdr.replen); > > encode_nops(&hdr); > } > -- > 2.23.3 >
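For reference, a minimal sketch of a teardown that stays safe even when only some of the pages were allocated, e.g. when alloc_page() fails partway through the loop in the patch above. The helper name nfs42_getxattr_free_pages() is made up for illustration; the point is the NULL check Olga is asking about, since __free_page() cannot take a NULL pointer (unlike kfree()):

/*
 * Illustrative helper, not part of the posted patch: release a page
 * array that may be only partially filled.  With kcalloc() the unused
 * slots are NULL, so they must be skipped before __free_page().
 */
static void nfs42_getxattr_free_pages(struct page **pages, int np)
{
	int i;

	if (!pages)
		return;
	for (i = 0; i < np; i++) {
		if (pages[i])
			__free_page(pages[i]);
	}
	kfree(pages);
}

The alternative Trond suggests further down the thread, truncating np to the number of pages actually allocated before jumping to out_free, avoids the per-entry check entirely.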
On Tue, 2020-12-01 at 21:31 +0000, Frank van der Linden wrote: > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > and it's easy enough to allocate enough pages for the request > up front, so do that. > > Also, since we've allocated the pages anyway, use the full > page aligned length for the receive buffer. This will allow > caching of valid replies that are too large for the caller, > but that still fit in the allocated pages. > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > --- > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > 2 files changed, 47 insertions(+), 14 deletions(-) > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > index 2b2211d1234e..bfe15ac77bd9 100644 > --- a/fs/nfs/nfs42proc.c > +++ b/fs/nfs/nfs42proc.c > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct > inode *inode, const char *name, > void *buf, size_t buflen) > { > struct nfs_server *server = NFS_SERVER(inode); > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > + struct page **pages; > struct nfs42_getxattrargs arg = { > .fh = NFS_FH(inode), > - .xattr_pages = pages, > - .xattr_len = buflen, > .xattr_name = name, > }; > struct nfs42_getxattrres res; > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct Why can't we set up the page array in nfs42_proc_getxattr()? This means that if we get a retryable error from the server, then we perform multiple allocations of the same buffer. > inode *inode, const char *name, > .rpc_argp = &arg, > .rpc_resp = &res, > }; > - int ret, np; > + ssize_t ret, np, i; > + > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > + > + ret = -ENOMEM; > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); Please just use nfs_page_array_len(). It should be more efficient than the above. > + > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); Perhaps do this as kmalloc_array() so we don't have to zero out the array? If the alloc_page() fails below, you can always just truncate the value of 'np' before jumping to 'out_free'. Also note that we prefer to use 'sizeof(*pages)' rather than sizeof(struct page *). > + if (pages == NULL) > + return ret; > + > + for (i = 0; i < np; i++) { > + pages[i] = alloc_page(GFP_KERNEL); > + if (!pages[i]) > + goto out_free; > + } > + > + arg.xattr_pages = pages; > > ret = nfs4_call_sync(server->client, server, &msg, > &arg.seq_args, > &res.seq_res, 0); > if (ret < 0) > - return ret; > + goto out_free; > + > + ret = res.xattr_len; > > /* > * Normally, the caching is done one layer up, but for > successful > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct > inode *inode, const char *name, > nfs4_xattr_cache_add(inode, name, NULL, pages, > res.xattr_len); > > if (buflen) { > - if (res.xattr_len > buflen) > - return -ERANGE; > + if (res.xattr_len > buflen) { > + ret = -ERANGE; > + goto out_free; > + } > _copy_from_pages(buf, pages, 0, res.xattr_len); > } > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > +out_free: > while (--np >= 0) > __free_page(pages[np]); > > - return res.xattr_len; > + kfree(pages); > + > + return ret; > } > > static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void > *buf, > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c > index 6e060a88f98c..8dfe674d1301 100644 > --- a/fs/nfs/nfs42xdr.c > +++ b/fs/nfs/nfs42xdr.c > @@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream > *xdr, > return -EIO; > > len = be32_to_cpup(p); > + > + /* > + * Only check against the page length here. 
The actual > + * requested length may be smaller, but that is only > + * checked against after possibly caching a valid reply. > + */ > if (len > req->rq_rcv_buf.page_len) > return -ERANGE; > > @@ -1483,11 +1489,17 @@ static void nfs4_xdr_enc_getxattr(struct > rpc_rqst *req, struct xdr_stream *xdr, > encode_putfh(xdr, args->fh, &hdr); > encode_getxattr(xdr, args->xattr_name, &hdr); > > - plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX; > - > - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > - hdr.replen); > - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; > + /* > + * The GETXATTR op has no length field in the call, and the > + * xattr data is at the end of the reply. > + * > + * Since the pages have already been allocated, there is no > + * downside in using the page-aligned length. It will allow > + * receiving and caching xattrs that are too large for the > + * caller but still fit in the page-rounded value. > + */ > + plen = round_up(args->xattr_len, PAGE_SIZE); This is the wrong place to be recalculating page buffer sizes. This function should have no idea what was allocated or by whom. Please just set the correct value for args->xattr_len in nfs42_proc_getxattr(). > + rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > hdr.replen); > > encode_nops(&hdr); > }
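Pulling those review comments together, one possible shape is to allocate the reply pages once in the outer nfs42_proc_getxattr(), so that a retried RPC reuses the same buffer. The sketch below only illustrates the suggestions above, not the v2 that was actually posted; the helper name and its exact placement are assumptions:

/*
 * Rough sketch based on the review comments above, not the posted v2:
 * allocate the reply pages once, outside the retry loop, using
 * nfs_page_array_len() and kmalloc_array() as suggested.
 */
static struct page **nfs42_alloc_xattr_pages(size_t len, unsigned int *npages)
{
	struct page **pages;
	unsigned int np = nfs_page_array_len(0, len);
	unsigned int i;

	pages = kmalloc_array(np, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;

	for (i = 0; i < np; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i]) {
			/* free only what was actually allocated */
			while (i-- > 0)
				__free_page(pages[i]);
			kfree(pages);
			return NULL;
		}
	}

	*npages = np;
	return pages;
}

nfs42_proc_getxattr() would then call this once with buflen ?: XATTR_SIZE_MAX, pass the array and the page-rounded length down through its retry loop (so nfs4_xdr_enc_getxattr() no longer has to round up itself), and free everything after the last attempt.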
On Tue, Dec 01, 2020 at 05:03:46PM -0500, Olga Kornievskaia wrote: > > On Tue, Dec 1, 2020 at 4:44 PM Frank van der Linden <fllinden@amazon.com> wrote: > > > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > > and it's easy enough to allocate enough pages for the request > > up front, so do that. > > > > Also, since we've allocated the pages anyway, use the full > > page aligned length for the receive buffer. This will allow > > caching of valid replies that are too large for the caller, > > but that still fit in the allocated pages. > > > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > > --- > > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > > 2 files changed, 47 insertions(+), 14 deletions(-) > > > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > > index 2b2211d1234e..bfe15ac77bd9 100644 > > --- a/fs/nfs/nfs42proc.c > > +++ b/fs/nfs/nfs42proc.c > > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > > void *buf, size_t buflen) > > { > > struct nfs_server *server = NFS_SERVER(inode); > > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > > + struct page **pages; > > struct nfs42_getxattrargs arg = { > > .fh = NFS_FH(inode), > > - .xattr_pages = pages, > > - .xattr_len = buflen, > > .xattr_name = name, > > }; > > struct nfs42_getxattrres res; > > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > > .rpc_argp = &arg, > > .rpc_resp = &res, > > }; > > - int ret, np; > > + ssize_t ret, np, i; > > + > > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > > + > > + ret = -ENOMEM; > > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); > > + > > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); > > + if (pages == NULL) > > + return ret; > > + > > + for (i = 0; i < np; i++) { > > + pages[i] = alloc_page(GFP_KERNEL); > > + if (!pages[i]) > > + goto out_free; > > + } > > + > > + arg.xattr_pages = pages; > > > > ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, > > &res.seq_res, 0); > > if (ret < 0) > > - return ret; > > + goto out_free; > > + > > + ret = res.xattr_len; > > > > /* > > * Normally, the caching is done one layer up, but for successful > > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > > nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len); > > > > if (buflen) { > > - if (res.xattr_len > buflen) > > - return -ERANGE; > > + if (res.xattr_len > buflen) { > > + ret = -ERANGE; > > + goto out_free; > > + } > > _copy_from_pages(buf, pages, 0, res.xattr_len); > > } > > > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > > +out_free: > > while (--np >= 0) > > __free_page(pages[np]); > > Looking at Chuck's page for listxattr, I think we need to check if > (pages[np) before freeing? > > Otherwise, looks fine to me. Reviewed-by. Hm, originally, yes, because of SPARSE_PAGES use. So that was actually a bug in the original code, which is now fixed. In other words, we need a Cc: -stable here (which I assume we want anyway, since this fixes RDMA). It's not needed anymore, and it's not needed for listxattr anymore either, although there the code was originally correct. - Frank
On Tue, Dec 01, 2020 at 10:24:11PM +0000, Frank van der Linden wrote: > On Tue, Dec 01, 2020 at 05:03:46PM -0500, Olga Kornievskaia wrote: > > > > On Tue, Dec 1, 2020 at 4:44 PM Frank van der Linden <fllinden@amazon.com> wrote: > > > > > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > > > and it's easy enough to allocate enough pages for the request > > > up front, so do that. > > > > > > Also, since we've allocated the pages anyway, use the full > > > page aligned length for the receive buffer. This will allow > > > caching of valid replies that are too large for the caller, > > > but that still fit in the allocated pages. > > > > > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > > > --- > > > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > > > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > > > 2 files changed, 47 insertions(+), 14 deletions(-) > > > > > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > > > index 2b2211d1234e..bfe15ac77bd9 100644 > > > --- a/fs/nfs/nfs42proc.c > > > +++ b/fs/nfs/nfs42proc.c > > > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > > > void *buf, size_t buflen) > > > { > > > struct nfs_server *server = NFS_SERVER(inode); > > > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > > > + struct page **pages; > > > struct nfs42_getxattrargs arg = { > > > .fh = NFS_FH(inode), > > > - .xattr_pages = pages, > > > - .xattr_len = buflen, > > > .xattr_name = name, > > > }; > > > struct nfs42_getxattrres res; > > > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > > > .rpc_argp = &arg, > > > .rpc_resp = &res, > > > }; > > > - int ret, np; > > > + ssize_t ret, np, i; > > > + > > > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > > > + > > > + ret = -ENOMEM; > > > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); > > > + > > > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); > > > + if (pages == NULL) > > > + return ret; > > > + > > > + for (i = 0; i < np; i++) { > > > + pages[i] = alloc_page(GFP_KERNEL); > > > + if (!pages[i]) > > > + goto out_free; > > > + } > > > + > > > + arg.xattr_pages = pages; > > > > > > ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, > > > &res.seq_res, 0); > > > if (ret < 0) > > > - return ret; > > > + goto out_free; > > > + > > > + ret = res.xattr_len; > > > > > > /* > > > * Normally, the caching is done one layer up, but for successful > > > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > > > nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len); > > > > > > if (buflen) { > > > - if (res.xattr_len > buflen) > > > - return -ERANGE; > > > + if (res.xattr_len > buflen) { > > > + ret = -ERANGE; > > > + goto out_free; > > > + } > > > _copy_from_pages(buf, pages, 0, res.xattr_len); > > > } > > > > > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > > > +out_free: > > > while (--np >= 0) > > > __free_page(pages[np]); > > > > Looking at Chuck's page for listxattr, I think we need to check if > > (pages[np) before freeing? > > > > Otherwise, looks fine to me. Reviewed-by. > > Hm, originally, yes, because of SPARSE_PAGES use. So that was actually > a bug in the original code, which is now fixed. In other words, we > need a Cc: -stable here (which I assume we want anyway, since this > fixes RDMA). 
Oh, no, actually the original code was fine - it only freed the pages based on the actual returned result - so no issue there. Let me address Trond's comments, though, I'll send a v2. Frank
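For context, this is the pre-patch fragment Frank is referring to, as quoted in the diff above. Here np is derived from the length of the reply that actually arrived, so every index below np holds a page the transport filled in, and no NULL check was needed:

	/* Original (pre-patch) free loop: np covers only the returned data. */
	np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE);
	while (--np >= 0)
		__free_page(pages[np]);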
On Tue, Dec 1, 2020 at 5:15 PM Trond Myklebust <trondmy@hammerspace.com> wrote: > > On Tue, 2020-12-01 at 21:31 +0000, Frank van der Linden wrote: > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > > and it's easy enough to allocate enough pages for the request > > up front, so do that. > > > > Also, since we've allocated the pages anyway, use the full > > page aligned length for the receive buffer. This will allow > > caching of valid replies that are too large for the caller, > > but that still fit in the allocated pages. > > > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > > --- > > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > > 2 files changed, 47 insertions(+), 14 deletions(-) > > > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > > index 2b2211d1234e..bfe15ac77bd9 100644 > > --- a/fs/nfs/nfs42proc.c > > +++ b/fs/nfs/nfs42proc.c > > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct > > inode *inode, const char *name, > > void *buf, size_t buflen) > > { > > struct nfs_server *server = NFS_SERVER(inode); > > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > > + struct page **pages; > > struct nfs42_getxattrargs arg = { > > .fh = NFS_FH(inode), > > - .xattr_pages = pages, > > - .xattr_len = buflen, > > .xattr_name = name, > > }; > > struct nfs42_getxattrres res; > > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct > > Why can't we set up the page array in nfs42_proc_getxattr()? This means > that if we get a retryable error from the server, then we perform > multiple allocations of the same buffer. I wonder if the same then is needed for the LISTXATTR as we allocated pages in _nfs42_proc_listxattr() and instead should do it in nfs42_proc_listxattr().... > > inode *inode, const char *name, > > .rpc_argp = &arg, > > .rpc_resp = &res, > > }; > > - int ret, np; > > + ssize_t ret, np, i; > > + > > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > > + > > + ret = -ENOMEM; > > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); > > Please just use nfs_page_array_len(). It should be more efficient than > the above. > > > + > > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); > > Perhaps do this as kmalloc_array() so we don't have to zero out the > array? If the alloc_page() fails below, you can always just truncate > the value of 'np' before jumping to 'out_free'. > > Also note that we prefer to use 'sizeof(*pages)' rather than > sizeof(struct page *). 
> > > + if (pages == NULL) > > + return ret; > > + > > + for (i = 0; i < np; i++) { > > + pages[i] = alloc_page(GFP_KERNEL); > > + if (!pages[i]) > > + goto out_free; > > + } > > + > > + arg.xattr_pages = pages; > > > > ret = nfs4_call_sync(server->client, server, &msg, > > &arg.seq_args, > > &res.seq_res, 0); > > if (ret < 0) > > - return ret; > > + goto out_free; > > + > > + ret = res.xattr_len; > > > > /* > > * Normally, the caching is done one layer up, but for > > successful > > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct > > inode *inode, const char *name, > > nfs4_xattr_cache_add(inode, name, NULL, pages, > > res.xattr_len); > > > > if (buflen) { > > - if (res.xattr_len > buflen) > > - return -ERANGE; > > + if (res.xattr_len > buflen) { > > + ret = -ERANGE; > > + goto out_free; > > + } > > _copy_from_pages(buf, pages, 0, res.xattr_len); > > } > > > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > > +out_free: > > while (--np >= 0) > > __free_page(pages[np]); > > > > - return res.xattr_len; > > + kfree(pages); > > + > > + return ret; > > } > > > > static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void > > *buf, > > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c > > index 6e060a88f98c..8dfe674d1301 100644 > > --- a/fs/nfs/nfs42xdr.c > > +++ b/fs/nfs/nfs42xdr.c > > @@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream > > *xdr, > > return -EIO; > > > > len = be32_to_cpup(p); > > + > > + /* > > + * Only check against the page length here. The actual > > + * requested length may be smaller, but that is only > > + * checked against after possibly caching a valid reply. > > + */ > > if (len > req->rq_rcv_buf.page_len) > > return -ERANGE; > > > > @@ -1483,11 +1489,17 @@ static void nfs4_xdr_enc_getxattr(struct > > rpc_rqst *req, struct xdr_stream *xdr, > > encode_putfh(xdr, args->fh, &hdr); > > encode_getxattr(xdr, args->xattr_name, &hdr); > > > > - plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX; > > - > > - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > > - hdr.replen); > > - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; > > + /* > > + * The GETXATTR op has no length field in the call, and the > > + * xattr data is at the end of the reply. > > + * > > + * Since the pages have already been allocated, there is no > > + * downside in using the page-aligned length. It will allow > > + * receiving and caching xattrs that are too large for the > > + * caller but still fit in the page-rounded value. > > + */ > > + plen = round_up(args->xattr_len, PAGE_SIZE); > > This is the wrong place to be recalculating page buffer sizes. This > function should have no idea what was allocated or by whom. Please just > set the correct value for args->xattr_len in nfs42_proc_getxattr(). > > > + rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > > hdr.replen); > > > > encode_nops(&hdr); > > } > > -- > Trond Myklebust > Linux NFS client maintainer, Hammerspace > trond.myklebust@hammerspace.com > >
On Tue, Dec 01, 2020 at 10:15:47PM +0000, Trond Myklebust wrote: > On Tue, 2020-12-01 at 21:31 +0000, Frank van der Linden wrote: > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > > and it's easy enough to allocate enough pages for the request > > up front, so do that. > > > > Also, since we've allocated the pages anyway, use the full > > page aligned length for the receive buffer. This will allow > > caching of valid replies that are too large for the caller, > > but that still fit in the allocated pages. > > > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > > --- > > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > > 2 files changed, 47 insertions(+), 14 deletions(-) > > > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > > index 2b2211d1234e..bfe15ac77bd9 100644 > > --- a/fs/nfs/nfs42proc.c > > +++ b/fs/nfs/nfs42proc.c > > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct > > inode *inode, const char *name, > > void *buf, size_t buflen) > > { > > struct nfs_server *server = NFS_SERVER(inode); > > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > > + struct page **pages; > > struct nfs42_getxattrargs arg = { > > .fh = NFS_FH(inode), > > - .xattr_pages = pages, > > - .xattr_len = buflen, > > .xattr_name = name, > > }; > > struct nfs42_getxattrres res; > > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct > > Why can't we set up the page array in nfs42_proc_getxattr()? This means > that if we get a retryable error from the server, then we perform > multiple allocations of the same buffer. > > > inode *inode, const char *name, > > .rpc_argp = &arg, > > .rpc_resp = &res, > > }; > > - int ret, np; > > + ssize_t ret, np, i; > > + > > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > > + > > + ret = -ENOMEM; > > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); > > Please just use nfs_page_array_len(). It should be more efficient than > the above. > > > + > > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); > > Perhaps do this as kmalloc_array() so we don't have to zero out the > array? If the alloc_page() fails below, you can always just truncate > the value of 'np' before jumping to 'out_free'. > > Also note that we prefer to use 'sizeof(*pages)' rather than > sizeof(struct page *). 
> > > + if (pages == NULL) > > + return ret; > > + > > + for (i = 0; i < np; i++) { > > + pages[i] = alloc_page(GFP_KERNEL); > > + if (!pages[i]) > > + goto out_free; > > + } > > + > > + arg.xattr_pages = pages; > > > > ret = nfs4_call_sync(server->client, server, &msg, > > &arg.seq_args, > > &res.seq_res, 0); > > if (ret < 0) > > - return ret; > > + goto out_free; > > + > > + ret = res.xattr_len; > > > > /* > > * Normally, the caching is done one layer up, but for > > successful > > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct > > inode *inode, const char *name, > > nfs4_xattr_cache_add(inode, name, NULL, pages, > > res.xattr_len); > > > > if (buflen) { > > - if (res.xattr_len > buflen) > > - return -ERANGE; > > + if (res.xattr_len > buflen) { > > + ret = -ERANGE; > > + goto out_free; > > + } > > _copy_from_pages(buf, pages, 0, res.xattr_len); > > } > > > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > > +out_free: > > while (--np >= 0) > > __free_page(pages[np]); > > > > - return res.xattr_len; > > + kfree(pages); > > + > > + return ret; > > } > > > > static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void > > *buf, > > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c > > index 6e060a88f98c..8dfe674d1301 100644 > > --- a/fs/nfs/nfs42xdr.c > > +++ b/fs/nfs/nfs42xdr.c > > @@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream > > *xdr, > > return -EIO; > > > > len = be32_to_cpup(p); > > + > > + /* > > + * Only check against the page length here. The actual > > + * requested length may be smaller, but that is only > > + * checked against after possibly caching a valid reply. > > + */ > > if (len > req->rq_rcv_buf.page_len) > > return -ERANGE; > > > > @@ -1483,11 +1489,17 @@ static void nfs4_xdr_enc_getxattr(struct > > rpc_rqst *req, struct xdr_stream *xdr, > > encode_putfh(xdr, args->fh, &hdr); > > encode_getxattr(xdr, args->xattr_name, &hdr); > > > > - plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX; > > - > > - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > > - hdr.replen); > > - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; > > + /* > > + * The GETXATTR op has no length field in the call, and the > > + * xattr data is at the end of the reply. > > + * > > + * Since the pages have already been allocated, there is no > > + * downside in using the page-aligned length. It will allow > > + * receiving and caching xattrs that are too large for the > > + * caller but still fit in the page-rounded value. > > + */ > > + plen = round_up(args->xattr_len, PAGE_SIZE); > > This is the wrong place to be recalculating page buffer sizes. This > function should have no idea what was allocated or by whom. Please just > set the correct value for args->xattr_len in nfs42_proc_getxattr(). > > > + rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > > hdr.replen); > > > > encode_nops(&hdr); > > } > > -- > Trond Myklebust > Linux NFS client maintainer, Hammerspace > trond.myklebust@hammerspace.com Thanks for the comments - v2 sent. - Frank
On Tue, Dec 01, 2020 at 06:03:31PM -0500, Olga Kornievskaia wrote: > On Tue, Dec 1, 2020 at 5:15 PM Trond Myklebust <trondmy@hammerspace.com> wrote: > > > > On Tue, 2020-12-01 at 21:31 +0000, Frank van der Linden wrote: > > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > > > and it's easy enough to allocate enough pages for the request > > > up front, so do that. > > > > > > Also, since we've allocated the pages anyway, use the full > > > page aligned length for the receive buffer. This will allow > > > caching of valid replies that are too large for the caller, > > > but that still fit in the allocated pages. > > > > > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > > > --- > > > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > > > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > > > 2 files changed, 47 insertions(+), 14 deletions(-) > > > > > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > > > index 2b2211d1234e..bfe15ac77bd9 100644 > > > --- a/fs/nfs/nfs42proc.c > > > +++ b/fs/nfs/nfs42proc.c > > > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct > > > inode *inode, const char *name, > > > void *buf, size_t buflen) > > > { > > > struct nfs_server *server = NFS_SERVER(inode); > > > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > > > + struct page **pages; > > > struct nfs42_getxattrargs arg = { > > > .fh = NFS_FH(inode), > > > - .xattr_pages = pages, > > > - .xattr_len = buflen, > > > .xattr_name = name, > > > }; > > > struct nfs42_getxattrres res; > > > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct > > > > Why can't we set up the page array in nfs42_proc_getxattr()? This means > > that if we get a retryable error from the server, then we perform > > multiple allocations of the same buffer. > > I wonder if the same then is needed for the LISTXATTR as we allocated > pages in _nfs42_proc_listxattr() and instead should do it in > nfs42_proc_listxattr().... There are a few more places of which this is true, yep. getactl is the other one. In any case, fixed for the getxattr case in v2. - Frank
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 2b2211d1234e..bfe15ac77bd9 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
 				void *buf, size_t buflen)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct page *pages[NFS4XATTR_MAXPAGES] = {};
+	struct page **pages;
 	struct nfs42_getxattrargs arg = {
 		.fh = NFS_FH(inode),
-		.xattr_pages = pages,
-		.xattr_len = buflen,
 		.xattr_name = name,
 	};
 	struct nfs42_getxattrres res;
@@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
 		.rpc_argp = &arg,
 		.rpc_resp = &res,
 	};
-	int ret, np;
+	ssize_t ret, np, i;
+
+	arg.xattr_len = buflen ?: XATTR_SIZE_MAX;
+
+	ret = -ENOMEM;
+	np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE);
+
+	pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL);
+	if (pages == NULL)
+		return ret;
+
+	for (i = 0; i < np; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			goto out_free;
+	}
+
+	arg.xattr_pages = pages;
 
 	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
 			     &res.seq_res, 0);
 	if (ret < 0)
-		return ret;
+		goto out_free;
+
+	ret = res.xattr_len;
 
 	/*
 	 * Normally, the caching is done one layer up, but for successful
@@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
 		nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len);
 
 	if (buflen) {
-		if (res.xattr_len > buflen)
-			return -ERANGE;
+		if (res.xattr_len > buflen) {
+			ret = -ERANGE;
+			goto out_free;
+		}
 		_copy_from_pages(buf, pages, 0, res.xattr_len);
 	}
 
-	np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE);
+out_free:
 	while (--np >= 0)
 		__free_page(pages[np]);
 
-	return res.xattr_len;
+	kfree(pages);
+
+	return ret;
 }
 
 static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 6e060a88f98c..8dfe674d1301 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream *xdr,
 		return -EIO;
 
 	len = be32_to_cpup(p);
+
+	/*
+	 * Only check against the page length here. The actual
+	 * requested length may be smaller, but that is only
+	 * checked against after possibly caching a valid reply.
+	 */
 	if (len > req->rq_rcv_buf.page_len)
 		return -ERANGE;
 
@@ -1483,11 +1489,17 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_getxattr(xdr, args->xattr_name, &hdr);
 
-	plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX;
-
-	rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen,
-				hdr.replen);
-	req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+	/*
+	 * The GETXATTR op has no length field in the call, and the
+	 * xattr data is at the end of the reply.
+	 *
+	 * Since the pages have already been allocated, there is no
+	 * downside in using the page-aligned length. It will allow
+	 * receiving and caching xattrs that are too large for the
+	 * caller but still fit in the page-rounded value.
+	 */
+	plen = round_up(args->xattr_len, PAGE_SIZE);
+	rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, hdr.replen);
 
 	encode_nops(&hdr);
 }
XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport,
and it's easy enough to allocate enough pages for the request
up front, so do that.

Also, since we've allocated the pages anyway, use the full
page aligned length for the receive buffer. This will allow
caching of valid replies that are too large for the caller,
but that still fit in the allocated pages.

Signed-off-by: Frank van der Linden <fllinden@amazon.com>
---
 fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++---------
 fs/nfs/nfs42xdr.c  | 22 +++++++++++++++++-----
 2 files changed, 47 insertions(+), 14 deletions(-)