Message ID | 20201201213128.13624-1-fllinden@amazon.com (mailing list archive)
---|---
State | New, archived
Series | NFSv4.2: improve page handling for GETXATTR
> On Dec 1, 2020, at 4:31 PM, Frank van der Linden <fllinden@amazon.com> wrote: > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > and it's easy enough to allocate enough pages for the request > up front, so do that. > > Also, since we've allocated the pages anyway, use the full > page aligned length for the receive buffer. This will allow > caching of valid replies that are too large for the caller, > but that still fit in the allocated pages. > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > --- > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > 2 files changed, 47 insertions(+), 14 deletions(-) > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > index 2b2211d1234e..bfe15ac77bd9 100644 > --- a/fs/nfs/nfs42proc.c > +++ b/fs/nfs/nfs42proc.c > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > void *buf, size_t buflen) > { > struct nfs_server *server = NFS_SERVER(inode); > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > + struct page **pages; Taking this array of pointers off the stack is Goodness(tm). Thanks for doing this update! > struct nfs42_getxattrargs arg = { > .fh = NFS_FH(inode), > - .xattr_pages = pages, > - .xattr_len = buflen, > .xattr_name = name, > }; > struct nfs42_getxattrres res; > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > .rpc_argp = &arg, > .rpc_resp = &res, > }; > - int ret, np; > + ssize_t ret, np, i; > + > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > + > + ret = -ENOMEM; > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); > + > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); > + if (pages == NULL) > + return ret; > + > + for (i = 0; i < np; i++) { > + pages[i] = alloc_page(GFP_KERNEL); > + if (!pages[i]) > + goto out_free; > + } > + > + arg.xattr_pages = pages; > > ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, > &res.seq_res, 0); > if (ret < 0) > - return ret; > + goto out_free; > + > + ret = res.xattr_len; > > /* > * Normally, the caching is done one layer up, but for successful > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len); > > if (buflen) { > - if (res.xattr_len > buflen) > - return -ERANGE; > + if (res.xattr_len > buflen) { > + ret = -ERANGE; > + goto out_free; > + } > _copy_from_pages(buf, pages, 0, res.xattr_len); > } > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > +out_free: > while (--np >= 0) > __free_page(pages[np]); > > - return res.xattr_len; > + kfree(pages); > + > + return ret; > } > > static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c > index 6e060a88f98c..8dfe674d1301 100644 > --- a/fs/nfs/nfs42xdr.c > +++ b/fs/nfs/nfs42xdr.c > @@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream *xdr, > return -EIO; > > len = be32_to_cpup(p); > + > + /* > + * Only check against the page length here. The actual > + * requested length may be smaller, but that is only > + * checked against after possibly caching a valid reply. > + */ > if (len > req->rq_rcv_buf.page_len) > return -ERANGE; > > @@ -1483,11 +1489,17 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr, > encode_putfh(xdr, args->fh, &hdr); > encode_getxattr(xdr, args->xattr_name, &hdr); > > - plen = args->xattr_len ? 
args->xattr_len : XATTR_SIZE_MAX; > - > - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > - hdr.replen); > - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; > + /* > + * The GETXATTR op has no length field in the call, and the > + * xattr data is at the end of the reply. > + * > + * Since the pages have already been allocated, there is no > + * downside in using the page-aligned length. It will allow > + * receiving and caching xattrs that are too large for the > + * caller but still fit in the page-rounded value. > + */ > + plen = round_up(args->xattr_len, PAGE_SIZE); > + rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, hdr.replen); > > encode_nops(&hdr); > } > -- > 2.23.3 > -- Chuck Lever
On Tue, Dec 1, 2020 at 4:44 PM Frank van der Linden <fllinden@amazon.com> wrote: > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > and it's easy enough to allocate enough pages for the request > up front, so do that. > > Also, since we've allocated the pages anyway, use the full > page aligned length for the receive buffer. This will allow > caching of valid replies that are too large for the caller, > but that still fit in the allocated pages. > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > --- > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > 2 files changed, 47 insertions(+), 14 deletions(-) > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > index 2b2211d1234e..bfe15ac77bd9 100644 > --- a/fs/nfs/nfs42proc.c > +++ b/fs/nfs/nfs42proc.c > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > void *buf, size_t buflen) > { > struct nfs_server *server = NFS_SERVER(inode); > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > + struct page **pages; > struct nfs42_getxattrargs arg = { > .fh = NFS_FH(inode), > - .xattr_pages = pages, > - .xattr_len = buflen, > .xattr_name = name, > }; > struct nfs42_getxattrres res; > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > .rpc_argp = &arg, > .rpc_resp = &res, > }; > - int ret, np; > + ssize_t ret, np, i; > + > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > + > + ret = -ENOMEM; > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); > + > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); > + if (pages == NULL) > + return ret; > + > + for (i = 0; i < np; i++) { > + pages[i] = alloc_page(GFP_KERNEL); > + if (!pages[i]) > + goto out_free; > + } > + > + arg.xattr_pages = pages; > > ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, > &res.seq_res, 0); > if (ret < 0) > - return ret; > + goto out_free; > + > + ret = res.xattr_len; > > /* > * Normally, the caching is done one layer up, but for successful > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len); > > if (buflen) { > - if (res.xattr_len > buflen) > - return -ERANGE; > + if (res.xattr_len > buflen) { > + ret = -ERANGE; > + goto out_free; > + } > _copy_from_pages(buf, pages, 0, res.xattr_len); > } > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > +out_free: > while (--np >= 0) > __free_page(pages[np]); Looking at Chuck's page for listxattr, I think we need to check if (pages[np) before freeing? Otherwise, looks fine to me. Reviewed-by. > - return res.xattr_len; > + kfree(pages); > + > + return ret; > } > > static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c > index 6e060a88f98c..8dfe674d1301 100644 > --- a/fs/nfs/nfs42xdr.c > +++ b/fs/nfs/nfs42xdr.c > @@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream *xdr, > return -EIO; > > len = be32_to_cpup(p); > + > + /* > + * Only check against the page length here. The actual > + * requested length may be smaller, but that is only > + * checked against after possibly caching a valid reply. 
> + */ > if (len > req->rq_rcv_buf.page_len) > return -ERANGE; > > @@ -1483,11 +1489,17 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr, > encode_putfh(xdr, args->fh, &hdr); > encode_getxattr(xdr, args->xattr_name, &hdr); > > - plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX; > - > - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > - hdr.replen); > - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; > + /* > + * The GETXATTR op has no length field in the call, and the > + * xattr data is at the end of the reply. > + * > + * Since the pages have already been allocated, there is no > + * downside in using the page-aligned length. It will allow > + * receiving and caching xattrs that are too large for the > + * caller but still fit in the page-rounded value. > + */ > + plen = round_up(args->xattr_len, PAGE_SIZE); > + rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, hdr.replen); > > encode_nops(&hdr); > } > -- > 2.23.3 >
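For reference, a minimal sketch of a teardown that stays safe even when only some of the pages were allocated, e.g. when alloc_page() fails partway through the loop in the patch above. The helper name nfs42_getxattr_free_pages() is made up for illustration; the point is the NULL check Olga is asking about, since __free_page() cannot take a NULL pointer (unlike kfree()):

/*
 * Illustrative helper, not part of the posted patch: release a page
 * array that may be only partially filled.  With kcalloc() the unused
 * slots are NULL, so they must be skipped before __free_page().
 */
static void nfs42_getxattr_free_pages(struct page **pages, int np)
{
	int i;

	if (!pages)
		return;
	for (i = 0; i < np; i++) {
		if (pages[i])
			__free_page(pages[i]);
	}
	kfree(pages);
}

The alternative Trond suggests further down the thread, truncating np to the number of pages actually allocated before jumping to out_free, avoids the per-entry check entirely.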
On Tue, 2020-12-01 at 21:31 +0000, Frank van der Linden wrote: > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > and it's easy enough to allocate enough pages for the request > up front, so do that. > > Also, since we've allocated the pages anyway, use the full > page aligned length for the receive buffer. This will allow > caching of valid replies that are too large for the caller, > but that still fit in the allocated pages. > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > --- > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > 2 files changed, 47 insertions(+), 14 deletions(-) > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > index 2b2211d1234e..bfe15ac77bd9 100644 > --- a/fs/nfs/nfs42proc.c > +++ b/fs/nfs/nfs42proc.c > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct > inode *inode, const char *name, > void *buf, size_t buflen) > { > struct nfs_server *server = NFS_SERVER(inode); > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > + struct page **pages; > struct nfs42_getxattrargs arg = { > .fh = NFS_FH(inode), > - .xattr_pages = pages, > - .xattr_len = buflen, > .xattr_name = name, > }; > struct nfs42_getxattrres res; > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct Why can't we set up the page array in nfs42_proc_getxattr()? This means that if we get a retryable error from the server, then we perform multiple allocations of the same buffer. > inode *inode, const char *name, > .rpc_argp = &arg, > .rpc_resp = &res, > }; > - int ret, np; > + ssize_t ret, np, i; > + > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > + > + ret = -ENOMEM; > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); Please just use nfs_page_array_len(). It should be more efficient than the above. > + > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); Perhaps do this as kmalloc_array() so we don't have to zero out the array? If the alloc_page() fails below, you can always just truncate the value of 'np' before jumping to 'out_free'. Also note that we prefer to use 'sizeof(*pages)' rather than sizeof(struct page *). > + if (pages == NULL) > + return ret; > + > + for (i = 0; i < np; i++) { > + pages[i] = alloc_page(GFP_KERNEL); > + if (!pages[i]) > + goto out_free; > + } > + > + arg.xattr_pages = pages; > > ret = nfs4_call_sync(server->client, server, &msg, > &arg.seq_args, > &res.seq_res, 0); > if (ret < 0) > - return ret; > + goto out_free; > + > + ret = res.xattr_len; > > /* > * Normally, the caching is done one layer up, but for > successful > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct > inode *inode, const char *name, > nfs4_xattr_cache_add(inode, name, NULL, pages, > res.xattr_len); > > if (buflen) { > - if (res.xattr_len > buflen) > - return -ERANGE; > + if (res.xattr_len > buflen) { > + ret = -ERANGE; > + goto out_free; > + } > _copy_from_pages(buf, pages, 0, res.xattr_len); > } > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > +out_free: > while (--np >= 0) > __free_page(pages[np]); > > - return res.xattr_len; > + kfree(pages); > + > + return ret; > } > > static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void > *buf, > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c > index 6e060a88f98c..8dfe674d1301 100644 > --- a/fs/nfs/nfs42xdr.c > +++ b/fs/nfs/nfs42xdr.c > @@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream > *xdr, > return -EIO; > > len = be32_to_cpup(p); > + > + /* > + * Only check against the page length here. 
The actual > + * requested length may be smaller, but that is only > + * checked against after possibly caching a valid reply. > + */ > if (len > req->rq_rcv_buf.page_len) > return -ERANGE; > > @@ -1483,11 +1489,17 @@ static void nfs4_xdr_enc_getxattr(struct > rpc_rqst *req, struct xdr_stream *xdr, > encode_putfh(xdr, args->fh, &hdr); > encode_getxattr(xdr, args->xattr_name, &hdr); > > - plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX; > - > - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > - hdr.replen); > - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; > + /* > + * The GETXATTR op has no length field in the call, and the > + * xattr data is at the end of the reply. > + * > + * Since the pages have already been allocated, there is no > + * downside in using the page-aligned length. It will allow > + * receiving and caching xattrs that are too large for the > + * caller but still fit in the page-rounded value. > + */ > + plen = round_up(args->xattr_len, PAGE_SIZE); This is the wrong place to be recalculating page buffer sizes. This function should have no idea what was allocated or by whom. Please just set the correct value for args->xattr_len in nfs42_proc_getxattr(). > + rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > hdr.replen); > > encode_nops(&hdr); > }
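Pulling those review comments together, one possible shape is to allocate the reply pages once in the outer nfs42_proc_getxattr(), so that a retried RPC reuses the same buffer. The sketch below only illustrates the suggestions above, not the v2 that was actually posted; the helper name and its exact placement are assumptions:

/*
 * Rough sketch based on the review comments above, not the posted v2:
 * allocate the reply pages once, outside the retry loop, using
 * nfs_page_array_len() and kmalloc_array() as suggested.
 */
static struct page **nfs42_alloc_xattr_pages(size_t len, unsigned int *npages)
{
	struct page **pages;
	unsigned int np = nfs_page_array_len(0, len);
	unsigned int i;

	pages = kmalloc_array(np, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;

	for (i = 0; i < np; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i]) {
			/* free only what was actually allocated */
			while (i-- > 0)
				__free_page(pages[i]);
			kfree(pages);
			return NULL;
		}
	}

	*npages = np;
	return pages;
}

nfs42_proc_getxattr() would then call this once with buflen ?: XATTR_SIZE_MAX, pass the array and the page-rounded length down through its retry loop (so nfs4_xdr_enc_getxattr() no longer has to round up itself), and free everything after the last attempt.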
On Tue, Dec 01, 2020 at 05:03:46PM -0500, Olga Kornievskaia wrote: > > On Tue, Dec 1, 2020 at 4:44 PM Frank van der Linden <fllinden@amazon.com> wrote: > > > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > > and it's easy enough to allocate enough pages for the request > > up front, so do that. > > > > Also, since we've allocated the pages anyway, use the full > > page aligned length for the receive buffer. This will allow > > caching of valid replies that are too large for the caller, > > but that still fit in the allocated pages. > > > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > > --- > > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > > 2 files changed, 47 insertions(+), 14 deletions(-) > > > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > > index 2b2211d1234e..bfe15ac77bd9 100644 > > --- a/fs/nfs/nfs42proc.c > > +++ b/fs/nfs/nfs42proc.c > > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > > void *buf, size_t buflen) > > { > > struct nfs_server *server = NFS_SERVER(inode); > > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > > + struct page **pages; > > struct nfs42_getxattrargs arg = { > > .fh = NFS_FH(inode), > > - .xattr_pages = pages, > > - .xattr_len = buflen, > > .xattr_name = name, > > }; > > struct nfs42_getxattrres res; > > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > > .rpc_argp = &arg, > > .rpc_resp = &res, > > }; > > - int ret, np; > > + ssize_t ret, np, i; > > + > > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > > + > > + ret = -ENOMEM; > > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); > > + > > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); > > + if (pages == NULL) > > + return ret; > > + > > + for (i = 0; i < np; i++) { > > + pages[i] = alloc_page(GFP_KERNEL); > > + if (!pages[i]) > > + goto out_free; > > + } > > + > > + arg.xattr_pages = pages; > > > > ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, > > &res.seq_res, 0); > > if (ret < 0) > > - return ret; > > + goto out_free; > > + > > + ret = res.xattr_len; > > > > /* > > * Normally, the caching is done one layer up, but for successful > > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > > nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len); > > > > if (buflen) { > > - if (res.xattr_len > buflen) > > - return -ERANGE; > > + if (res.xattr_len > buflen) { > > + ret = -ERANGE; > > + goto out_free; > > + } > > _copy_from_pages(buf, pages, 0, res.xattr_len); > > } > > > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > > +out_free: > > while (--np >= 0) > > __free_page(pages[np]); > > Looking at Chuck's page for listxattr, I think we need to check if > (pages[np) before freeing? > > Otherwise, looks fine to me. Reviewed-by. Hm, originally, yes, because of SPARSE_PAGES use. So that was actually a bug in the original code, which is now fixed. In other words, we need a Cc: -stable here (which I assume we want anyway, since this fixes RDMA). It's not needed anymore, and it's not needed for listxattr anymore either, although there the code was originally correct. - Frank
On Tue, Dec 01, 2020 at 10:24:11PM +0000, Frank van der Linden wrote: > On Tue, Dec 01, 2020 at 05:03:46PM -0500, Olga Kornievskaia wrote: > > > > On Tue, Dec 1, 2020 at 4:44 PM Frank van der Linden <fllinden@amazon.com> wrote: > > > > > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > > > and it's easy enough to allocate enough pages for the request > > > up front, so do that. > > > > > > Also, since we've allocated the pages anyway, use the full > > > page aligned length for the receive buffer. This will allow > > > caching of valid replies that are too large for the caller, > > > but that still fit in the allocated pages. > > > > > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > > > --- > > > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > > > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > > > 2 files changed, 47 insertions(+), 14 deletions(-) > > > > > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > > > index 2b2211d1234e..bfe15ac77bd9 100644 > > > --- a/fs/nfs/nfs42proc.c > > > +++ b/fs/nfs/nfs42proc.c > > > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > > > void *buf, size_t buflen) > > > { > > > struct nfs_server *server = NFS_SERVER(inode); > > > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > > > + struct page **pages; > > > struct nfs42_getxattrargs arg = { > > > .fh = NFS_FH(inode), > > > - .xattr_pages = pages, > > > - .xattr_len = buflen, > > > .xattr_name = name, > > > }; > > > struct nfs42_getxattrres res; > > > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > > > .rpc_argp = &arg, > > > .rpc_resp = &res, > > > }; > > > - int ret, np; > > > + ssize_t ret, np, i; > > > + > > > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > > > + > > > + ret = -ENOMEM; > > > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); > > > + > > > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); > > > + if (pages == NULL) > > > + return ret; > > > + > > > + for (i = 0; i < np; i++) { > > > + pages[i] = alloc_page(GFP_KERNEL); > > > + if (!pages[i]) > > > + goto out_free; > > > + } > > > + > > > + arg.xattr_pages = pages; > > > > > > ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, > > > &res.seq_res, 0); > > > if (ret < 0) > > > - return ret; > > > + goto out_free; > > > + > > > + ret = res.xattr_len; > > > > > > /* > > > * Normally, the caching is done one layer up, but for successful > > > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, > > > nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len); > > > > > > if (buflen) { > > > - if (res.xattr_len > buflen) > > > - return -ERANGE; > > > + if (res.xattr_len > buflen) { > > > + ret = -ERANGE; > > > + goto out_free; > > > + } > > > _copy_from_pages(buf, pages, 0, res.xattr_len); > > > } > > > > > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > > > +out_free: > > > while (--np >= 0) > > > __free_page(pages[np]); > > > > Looking at Chuck's page for listxattr, I think we need to check if > > (pages[np) before freeing? > > > > Otherwise, looks fine to me. Reviewed-by. > > Hm, originally, yes, because of SPARSE_PAGES use. So that was actually > a bug in the original code, which is now fixed. In other words, we > need a Cc: -stable here (which I assume we want anyway, since this > fixes RDMA). 
Oh, no, actually the original code was fine - it only freed the pages based on the actual returned result - so no issue there. Let me address Trond's comments, though, I'll send a v2. Frank
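For context, this is the pre-patch fragment Frank is referring to, as quoted in the diff above. Here np is derived from the length of the reply that actually arrived, so every index below np holds a page the transport filled in, and no NULL check was needed:

	/* Original (pre-patch) free loop: np covers only the returned data. */
	np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE);
	while (--np >= 0)
		__free_page(pages[np]);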
On Tue, Dec 1, 2020 at 5:15 PM Trond Myklebust <trondmy@hammerspace.com> wrote: > > On Tue, 2020-12-01 at 21:31 +0000, Frank van der Linden wrote: > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > > and it's easy enough to allocate enough pages for the request > > up front, so do that. > > > > Also, since we've allocated the pages anyway, use the full > > page aligned length for the receive buffer. This will allow > > caching of valid replies that are too large for the caller, > > but that still fit in the allocated pages. > > > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > > --- > > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > > 2 files changed, 47 insertions(+), 14 deletions(-) > > > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > > index 2b2211d1234e..bfe15ac77bd9 100644 > > --- a/fs/nfs/nfs42proc.c > > +++ b/fs/nfs/nfs42proc.c > > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct > > inode *inode, const char *name, > > void *buf, size_t buflen) > > { > > struct nfs_server *server = NFS_SERVER(inode); > > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > > + struct page **pages; > > struct nfs42_getxattrargs arg = { > > .fh = NFS_FH(inode), > > - .xattr_pages = pages, > > - .xattr_len = buflen, > > .xattr_name = name, > > }; > > struct nfs42_getxattrres res; > > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct > > Why can't we set up the page array in nfs42_proc_getxattr()? This means > that if we get a retryable error from the server, then we perform > multiple allocations of the same buffer. I wonder if the same then is needed for the LISTXATTR as we allocated pages in _nfs42_proc_listxattr() and instead should do it in nfs42_proc_listxattr().... > > inode *inode, const char *name, > > .rpc_argp = &arg, > > .rpc_resp = &res, > > }; > > - int ret, np; > > + ssize_t ret, np, i; > > + > > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > > + > > + ret = -ENOMEM; > > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); > > Please just use nfs_page_array_len(). It should be more efficient than > the above. > > > + > > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); > > Perhaps do this as kmalloc_array() so we don't have to zero out the > array? If the alloc_page() fails below, you can always just truncate > the value of 'np' before jumping to 'out_free'. > > Also note that we prefer to use 'sizeof(*pages)' rather than > sizeof(struct page *). 
> > > + if (pages == NULL) > > + return ret; > > + > > + for (i = 0; i < np; i++) { > > + pages[i] = alloc_page(GFP_KERNEL); > > + if (!pages[i]) > > + goto out_free; > > + } > > + > > + arg.xattr_pages = pages; > > > > ret = nfs4_call_sync(server->client, server, &msg, > > &arg.seq_args, > > &res.seq_res, 0); > > if (ret < 0) > > - return ret; > > + goto out_free; > > + > > + ret = res.xattr_len; > > > > /* > > * Normally, the caching is done one layer up, but for > > successful > > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct > > inode *inode, const char *name, > > nfs4_xattr_cache_add(inode, name, NULL, pages, > > res.xattr_len); > > > > if (buflen) { > > - if (res.xattr_len > buflen) > > - return -ERANGE; > > + if (res.xattr_len > buflen) { > > + ret = -ERANGE; > > + goto out_free; > > + } > > _copy_from_pages(buf, pages, 0, res.xattr_len); > > } > > > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > > +out_free: > > while (--np >= 0) > > __free_page(pages[np]); > > > > - return res.xattr_len; > > + kfree(pages); > > + > > + return ret; > > } > > > > static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void > > *buf, > > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c > > index 6e060a88f98c..8dfe674d1301 100644 > > --- a/fs/nfs/nfs42xdr.c > > +++ b/fs/nfs/nfs42xdr.c > > @@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream > > *xdr, > > return -EIO; > > > > len = be32_to_cpup(p); > > + > > + /* > > + * Only check against the page length here. The actual > > + * requested length may be smaller, but that is only > > + * checked against after possibly caching a valid reply. > > + */ > > if (len > req->rq_rcv_buf.page_len) > > return -ERANGE; > > > > @@ -1483,11 +1489,17 @@ static void nfs4_xdr_enc_getxattr(struct > > rpc_rqst *req, struct xdr_stream *xdr, > > encode_putfh(xdr, args->fh, &hdr); > > encode_getxattr(xdr, args->xattr_name, &hdr); > > > > - plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX; > > - > > - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > > - hdr.replen); > > - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; > > + /* > > + * The GETXATTR op has no length field in the call, and the > > + * xattr data is at the end of the reply. > > + * > > + * Since the pages have already been allocated, there is no > > + * downside in using the page-aligned length. It will allow > > + * receiving and caching xattrs that are too large for the > > + * caller but still fit in the page-rounded value. > > + */ > > + plen = round_up(args->xattr_len, PAGE_SIZE); > > This is the wrong place to be recalculating page buffer sizes. This > function should have no idea what was allocated or by whom. Please just > set the correct value for args->xattr_len in nfs42_proc_getxattr(). > > > + rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > > hdr.replen); > > > > encode_nops(&hdr); > > } > > -- > Trond Myklebust > Linux NFS client maintainer, Hammerspace > trond.myklebust@hammerspace.com > >
On Tue, Dec 01, 2020 at 10:15:47PM +0000, Trond Myklebust wrote: > On Tue, 2020-12-01 at 21:31 +0000, Frank van der Linden wrote: > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > > and it's easy enough to allocate enough pages for the request > > up front, so do that. > > > > Also, since we've allocated the pages anyway, use the full > > page aligned length for the receive buffer. This will allow > > caching of valid replies that are too large for the caller, > > but that still fit in the allocated pages. > > > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > > --- > > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > > 2 files changed, 47 insertions(+), 14 deletions(-) > > > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > > index 2b2211d1234e..bfe15ac77bd9 100644 > > --- a/fs/nfs/nfs42proc.c > > +++ b/fs/nfs/nfs42proc.c > > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct > > inode *inode, const char *name, > > void *buf, size_t buflen) > > { > > struct nfs_server *server = NFS_SERVER(inode); > > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > > + struct page **pages; > > struct nfs42_getxattrargs arg = { > > .fh = NFS_FH(inode), > > - .xattr_pages = pages, > > - .xattr_len = buflen, > > .xattr_name = name, > > }; > > struct nfs42_getxattrres res; > > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct > > Why can't we set up the page array in nfs42_proc_getxattr()? This means > that if we get a retryable error from the server, then we perform > multiple allocations of the same buffer. > > > inode *inode, const char *name, > > .rpc_argp = &arg, > > .rpc_resp = &res, > > }; > > - int ret, np; > > + ssize_t ret, np, i; > > + > > + arg.xattr_len = buflen ?: XATTR_SIZE_MAX; > > + > > + ret = -ENOMEM; > > + np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE); > > Please just use nfs_page_array_len(). It should be more efficient than > the above. > > > + > > + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); > > Perhaps do this as kmalloc_array() so we don't have to zero out the > array? If the alloc_page() fails below, you can always just truncate > the value of 'np' before jumping to 'out_free'. > > Also note that we prefer to use 'sizeof(*pages)' rather than > sizeof(struct page *). 
> > > + if (pages == NULL) > > + return ret; > > + > > + for (i = 0; i < np; i++) { > > + pages[i] = alloc_page(GFP_KERNEL); > > + if (!pages[i]) > > + goto out_free; > > + } > > + > > + arg.xattr_pages = pages; > > > > ret = nfs4_call_sync(server->client, server, &msg, > > &arg.seq_args, > > &res.seq_res, 0); > > if (ret < 0) > > - return ret; > > + goto out_free; > > + > > + ret = res.xattr_len; > > > > /* > > * Normally, the caching is done one layer up, but for > > successful > > @@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct > > inode *inode, const char *name, > > nfs4_xattr_cache_add(inode, name, NULL, pages, > > res.xattr_len); > > > > if (buflen) { > > - if (res.xattr_len > buflen) > > - return -ERANGE; > > + if (res.xattr_len > buflen) { > > + ret = -ERANGE; > > + goto out_free; > > + } > > _copy_from_pages(buf, pages, 0, res.xattr_len); > > } > > > > - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); > > +out_free: > > while (--np >= 0) > > __free_page(pages[np]); > > > > - return res.xattr_len; > > + kfree(pages); > > + > > + return ret; > > } > > > > static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void > > *buf, > > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c > > index 6e060a88f98c..8dfe674d1301 100644 > > --- a/fs/nfs/nfs42xdr.c > > +++ b/fs/nfs/nfs42xdr.c > > @@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream > > *xdr, > > return -EIO; > > > > len = be32_to_cpup(p); > > + > > + /* > > + * Only check against the page length here. The actual > > + * requested length may be smaller, but that is only > > + * checked against after possibly caching a valid reply. > > + */ > > if (len > req->rq_rcv_buf.page_len) > > return -ERANGE; > > > > @@ -1483,11 +1489,17 @@ static void nfs4_xdr_enc_getxattr(struct > > rpc_rqst *req, struct xdr_stream *xdr, > > encode_putfh(xdr, args->fh, &hdr); > > encode_getxattr(xdr, args->xattr_name, &hdr); > > > > - plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX; > > - > > - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > > - hdr.replen); > > - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; > > + /* > > + * The GETXATTR op has no length field in the call, and the > > + * xattr data is at the end of the reply. > > + * > > + * Since the pages have already been allocated, there is no > > + * downside in using the page-aligned length. It will allow > > + * receiving and caching xattrs that are too large for the > > + * caller but still fit in the page-rounded value. > > + */ > > + plen = round_up(args->xattr_len, PAGE_SIZE); > > This is the wrong place to be recalculating page buffer sizes. This > function should have no idea what was allocated or by whom. Please just > set the correct value for args->xattr_len in nfs42_proc_getxattr(). > > > + rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, > > hdr.replen); > > > > encode_nops(&hdr); > > } > > -- > Trond Myklebust > Linux NFS client maintainer, Hammerspace > trond.myklebust@hammerspace.com Thanks for the comments - v2 sent. - Frank
On Tue, Dec 01, 2020 at 06:03:31PM -0500, Olga Kornievskaia wrote: > On Tue, Dec 1, 2020 at 5:15 PM Trond Myklebust <trondmy@hammerspace.com> wrote: > > > > On Tue, 2020-12-01 at 21:31 +0000, Frank van der Linden wrote: > > > XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, > > > and it's easy enough to allocate enough pages for the request > > > up front, so do that. > > > > > > Also, since we've allocated the pages anyway, use the full > > > page aligned length for the receive buffer. This will allow > > > caching of valid replies that are too large for the caller, > > > but that still fit in the allocated pages. > > > > > > Signed-off-by: Frank van der Linden <fllinden@amazon.com> > > > --- > > > fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++--------- > > > fs/nfs/nfs42xdr.c | 22 +++++++++++++++++----- > > > 2 files changed, 47 insertions(+), 14 deletions(-) > > > > > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > > > index 2b2211d1234e..bfe15ac77bd9 100644 > > > --- a/fs/nfs/nfs42proc.c > > > +++ b/fs/nfs/nfs42proc.c > > > @@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct > > > inode *inode, const char *name, > > > void *buf, size_t buflen) > > > { > > > struct nfs_server *server = NFS_SERVER(inode); > > > - struct page *pages[NFS4XATTR_MAXPAGES] = {}; > > > + struct page **pages; > > > struct nfs42_getxattrargs arg = { > > > .fh = NFS_FH(inode), > > > - .xattr_pages = pages, > > > - .xattr_len = buflen, > > > .xattr_name = name, > > > }; > > > struct nfs42_getxattrres res; > > > @@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct > > > > Why can't we set up the page array in nfs42_proc_getxattr()? This means > > that if we get a retryable error from the server, then we perform > > multiple allocations of the same buffer. > > I wonder if the same then is needed for the LISTXATTR as we allocated > pages in _nfs42_proc_listxattr() and instead should do it in > nfs42_proc_listxattr().... There are a few more places of which this is true, yep. getactl is the other one. In any case, fixed for the getxattr case in v2. - Frank
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 2b2211d1234e..bfe15ac77bd9 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1176,11 +1176,9 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
 				void *buf, size_t buflen)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct page *pages[NFS4XATTR_MAXPAGES] = {};
+	struct page **pages;
 	struct nfs42_getxattrargs arg = {
 		.fh = NFS_FH(inode),
-		.xattr_pages = pages,
-		.xattr_len = buflen,
 		.xattr_name = name,
 	};
 	struct nfs42_getxattrres res;
@@ -1189,12 +1187,31 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
 		.rpc_argp = &arg,
 		.rpc_resp = &res,
 	};
-	int ret, np;
+	ssize_t ret, np, i;
+
+	arg.xattr_len = buflen ?: XATTR_SIZE_MAX;
+
+	ret = -ENOMEM;
+	np = DIV_ROUND_UP(arg.xattr_len, PAGE_SIZE);
+
+	pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL);
+	if (pages == NULL)
+		return ret;
+
+	for (i = 0; i < np; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			goto out_free;
+	}
+
+	arg.xattr_pages = pages;
 
 	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
 			     &res.seq_res, 0);
 	if (ret < 0)
-		return ret;
+		goto out_free;
+
+	ret = res.xattr_len;
 
 	/*
 	 * Normally, the caching is done one layer up, but for successful
@@ -1209,16 +1226,20 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
 		nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len);
 
 	if (buflen) {
-		if (res.xattr_len > buflen)
-			return -ERANGE;
+		if (res.xattr_len > buflen) {
+			ret = -ERANGE;
+			goto out_free;
+		}
 		_copy_from_pages(buf, pages, 0, res.xattr_len);
 	}
 
-	np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE);
+out_free:
 	while (--np >= 0)
 		__free_page(pages[np]);
 
-	return res.xattr_len;
+	kfree(pages);
+
+	return ret;
 }
 
 static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 6e060a88f98c..8dfe674d1301 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream *xdr,
 		return -EIO;
 
 	len = be32_to_cpup(p);
+
+	/*
+	 * Only check against the page length here. The actual
+	 * requested length may be smaller, but that is only
+	 * checked against after possibly caching a valid reply.
+	 */
 	if (len > req->rq_rcv_buf.page_len)
 		return -ERANGE;
 
@@ -1483,11 +1489,17 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_getxattr(xdr, args->xattr_name, &hdr);
 
-	plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX;
-
-	rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen,
-				hdr.replen);
-	req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+	/*
+	 * The GETXATTR op has no length field in the call, and the
+	 * xattr data is at the end of the reply.
+	 *
+	 * Since the pages have already been allocated, there is no
+	 * downside in using the page-aligned length. It will allow
+	 * receiving and caching xattrs that are too large for the
+	 * caller but still fit in the page-rounded value.
+	 */
+	plen = round_up(args->xattr_len, PAGE_SIZE);
+	rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, hdr.replen);
 
 	encode_nops(&hdr);
 }
XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport,
and it's easy enough to allocate enough pages for the request
up front, so do that.

Also, since we've allocated the pages anyway, use the full
page aligned length for the receive buffer. This will allow
caching of valid replies that are too large for the caller,
but that still fit in the allocated pages.

Signed-off-by: Frank van der Linden <fllinden@amazon.com>
---
 fs/nfs/nfs42proc.c | 39 ++++++++++++++++++++++++++++++---------
 fs/nfs/nfs42xdr.c  | 22 +++++++++++++++++-----
 2 files changed, 47 insertions(+), 14 deletions(-)