Message ID | 1407396229-4785-18-git-send-email-hch@lst.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Aug 7, 2014 at 3:23 PM, Christoph Hellwig <hch@lst.de> wrote: > Instead of overflowing the XDR send buffer with our extent list allocate > pages and pre-encode the layoutupdate payload into them. We optimistically > allocate a single page use alloc_page and only switch to vmalloc when we > have more extents outstanding. Currently there is only a single testcase > (xfstests generic/113) which can reproduce large enough extent lists for > this to occur. > depending on how badly the extents are fragmented, it might worth starting an async layoutcommit within blocklayout client when bl_count reaches certain limit, so that we don't send an unlimited amount of extents in one layoutcommit. Cheers, Tao > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > fs/nfs/blocklayout/blocklayout.c | 9 ++-- > fs/nfs/blocklayout/blocklayout.h | 6 ++- > fs/nfs/blocklayout/extent_tree.c | 89 +++++++++++++++++++++++++++++++--------- > 3 files changed, 78 insertions(+), 26 deletions(-) > > diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c > index d5a2b87..fae8144 100644 > --- a/fs/nfs/blocklayout/blocklayout.c > +++ b/fs/nfs/blocklayout/blocklayout.c > @@ -498,12 +498,11 @@ bl_return_range(struct pnfs_layout_hdr *lo, > err = ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end); > } > > -static void > -bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, > - const struct nfs4_layoutcommit_args *arg) > +static int > +bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg) > { > dprintk("%s enter\n", __func__); > - ext_tree_encode_commit(BLK_LO2EXT(lo), xdr); > + return ext_tree_prepare_commit(arg); > } > > static void > @@ -808,7 +807,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { > .alloc_lseg = bl_alloc_lseg, > .free_lseg = bl_free_lseg, > .return_range = bl_return_range, > - .encode_layoutcommit = bl_encode_layoutcommit, > + .prepare_layoutcommit = bl_prepare_layoutcommit, > .cleanup_layoutcommit = bl_cleanup_layoutcommit, > .set_layoutdriver = bl_set_layoutdriver, > .clear_layoutdriver = bl_clear_layoutdriver, > diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h > index b4f66d8..caae202 100644 > --- a/fs/nfs/blocklayout/blocklayout.h > +++ b/fs/nfs/blocklayout/blocklayout.h > @@ -80,6 +80,9 @@ struct pnfs_block_extent { > unsigned int be_tag; > }; > > +/* on the wire size of the extent */ > +#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE) > + > struct pnfs_block_layout { > struct pnfs_layout_hdr bl_layout; > struct rb_root bl_ext_rw; > @@ -138,8 +141,7 @@ int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, > sector_t len); > bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, > struct pnfs_block_extent *ret, bool rw); > -int ext_tree_encode_commit(struct pnfs_block_layout *bl, > - struct xdr_stream *xdr); > +int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg); > void ext_tree_mark_committed(struct pnfs_block_layout *bl, int status); > > #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ > diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c > index c7dacfa..3945221 100644 > --- a/fs/nfs/blocklayout/extent_tree.c > +++ b/fs/nfs/blocklayout/extent_tree.c > @@ -465,32 +465,23 @@ out: > return err; > } > > -int > -ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr) > +static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, > + size_t buffer_size, size_t *count) > { > struct pnfs_block_extent *be; > - unsigned int count = 0; > - __be32 *p, *xdr_start; > int ret = 0; > > - dprintk("%s enter\n", __func__); > - > - xdr_start = xdr_reserve_space(xdr, 8); > - if (!xdr_start) > - return -ENOSPC; > - > spin_lock(&bl->bl_ext_lock); > for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { > if (be->be_state != PNFS_BLOCK_INVALID_DATA || > be->be_tag != EXTENT_WRITTEN) > continue; > > - p = xdr_reserve_space(xdr, 7 * sizeof(__be32) + > - NFS4_DEVICEID4_SIZE); > - if (!p) { > - printk("%s: out of space for extent list\n", __func__); > + (*count)++; > + if (*count * BL_EXTENT_SIZE > buffer_size) { > + /* keep counting.. */ > ret = -ENOSPC; > - break; > + continue; > } > > p = xdr_encode_opaque_fixed(p, be->be_devid.data, > @@ -501,15 +492,75 @@ ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr) > *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); > > be->be_tag = EXTENT_COMMITTING; > - count++; > } > spin_unlock(&bl->bl_ext_lock); > > - xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); > - xdr_start[1] = cpu_to_be32(count); > + return ret; > +} > + > +int > +ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) > +{ > + struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); > + size_t count = 0, buffer_size = PAGE_SIZE; > + __be32 *start_p; > + int ret; > + > + dprintk("%s enter\n", __func__); > + > + arg->layoutupdate_page = alloc_page(GFP_NOFS); > + if (!arg->layoutupdate_page) > + return -ENOMEM; > + start_p = page_address(arg->layoutupdate_page); > + arg->layoutupdate_pages = &arg->layoutupdate_page; > + > +retry: > + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count); > + if (unlikely(ret)) { > + if (arg->layoutupdate_pages != &arg->layoutupdate_page) { > + int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i; > + > + for (i = 0; i < nr_pages; i++) > + put_page(arg->layoutupdate_pages[i]); > + kfree(arg->layoutupdate_pages); > + } else { > + put_page(arg->layoutupdate_page); > + } > + > + buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count; > + count = 0; > + > + arg->layoutupdate_pages = > + kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE), > + sizeof(struct page *), GFP_NOFS); > + if (!arg->layoutupdate_pages) > + return -ENOMEM; > + > + start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); > + if (!start_p) { > + kfree(arg->layoutupdate_pages); > + return -ENOMEM; > + } > + > + goto retry; > + } > + > + *start_p = cpu_to_be32(count); > + arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count; > + > + if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { > + __be32 *p = start_p; > + int i = 0; > + > + for (p = start_p; > + p < start_p + arg->layoutupdate_len; > + p += PAGE_SIZE) { > + arg->layoutupdate_pages[i++] = vmalloc_to_page(p); > + } > + } > > dprintk("%s found %i ranges\n", __func__, count); > - return ret; > + return 0; > } > > void > -- > 1.9.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Aug 07, 2014 at 07:05:03PM +0800, Peng Tao wrote: > On Thu, Aug 7, 2014 at 3:23 PM, Christoph Hellwig <hch@lst.de> wrote: > > Instead of overflowing the XDR send buffer with our extent list allocate > > pages and pre-encode the layoutupdate payload into them. We optimistically > > allocate a single page use alloc_page and only switch to vmalloc when we > > have more extents outstanding. Currently there is only a single testcase > > (xfstests generic/113) which can reproduce large enough extent lists for > > this to occur. > > > depending on how badly the extents are fragmented, it might worth > starting an async layoutcommit within blocklayout client when bl_count > reaches certain limit, so that we don't send an unlimited amount of > extents in one layoutcommit. I though about this, but this only reduces the problem and we'd still need to be able to commit all extents for a data integrity operation, so we'd still need someting like the above. The other option I considered and prototyped was to send multiple on the wire layoutcommit calls from pnfs_layoutcommit_inode, but that did prove to be even more cumbersome. -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Aug 7, 2014 at 7:27 PM, Christoph Hellwig <hch@lst.de> wrote: > On Thu, Aug 07, 2014 at 07:05:03PM +0800, Peng Tao wrote: >> On Thu, Aug 7, 2014 at 3:23 PM, Christoph Hellwig <hch@lst.de> wrote: >> > Instead of overflowing the XDR send buffer with our extent list allocate >> > pages and pre-encode the layoutupdate payload into them. We optimistically >> > allocate a single page use alloc_page and only switch to vmalloc when we >> > have more extents outstanding. Currently there is only a single testcase >> > (xfstests generic/113) which can reproduce large enough extent lists for >> > this to occur. >> > >> depending on how badly the extents are fragmented, it might worth >> starting an async layoutcommit within blocklayout client when bl_count >> reaches certain limit, so that we don't send an unlimited amount of >> extents in one layoutcommit. > > I though about this, but this only reduces the problem and we'd still > need to be able to commit all extents for a data integrity operation, > so we'd still need someting like the above. > yeah, I wasn't against your patch. What I suggested is a way to limit the amount of commit extents such that we don't end up sending more that one RPC, whose size is still limited even if we allocate large enough XDR buffer. > The other option I considered and prototyped was to send multiple on the > wire layoutcommit calls from pnfs_layoutcommit_inode, but that did prove > to be even more cumbersome. -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d5a2b87..fae8144 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -498,12 +498,11 @@ bl_return_range(struct pnfs_layout_hdr *lo, err = ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end); } -static void -bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *arg) +static int +bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg) { dprintk("%s enter\n", __func__); - ext_tree_encode_commit(BLK_LO2EXT(lo), xdr); + return ext_tree_prepare_commit(arg); } static void @@ -808,7 +807,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .alloc_lseg = bl_alloc_lseg, .free_lseg = bl_free_lseg, .return_range = bl_return_range, - .encode_layoutcommit = bl_encode_layoutcommit, + .prepare_layoutcommit = bl_prepare_layoutcommit, .cleanup_layoutcommit = bl_cleanup_layoutcommit, .set_layoutdriver = bl_set_layoutdriver, .clear_layoutdriver = bl_clear_layoutdriver, diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index b4f66d8..caae202 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -80,6 +80,9 @@ struct pnfs_block_extent { unsigned int be_tag; }; +/* on the wire size of the extent */ +#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE) + struct pnfs_block_layout { struct pnfs_layout_hdr bl_layout; struct rb_root bl_ext_rw; @@ -138,8 +141,7 @@ int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, sector_t len); bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, struct pnfs_block_extent *ret, bool rw); -int ext_tree_encode_commit(struct pnfs_block_layout *bl, - struct xdr_stream *xdr); +int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg); void ext_tree_mark_committed(struct pnfs_block_layout *bl, int status); #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index c7dacfa..3945221 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -465,32 +465,23 @@ out: return err; } -int -ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr) +static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, + size_t buffer_size, size_t *count) { struct pnfs_block_extent *be; - unsigned int count = 0; - __be32 *p, *xdr_start; int ret = 0; - dprintk("%s enter\n", __func__); - - xdr_start = xdr_reserve_space(xdr, 8); - if (!xdr_start) - return -ENOSPC; - spin_lock(&bl->bl_ext_lock); for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag != EXTENT_WRITTEN) continue; - p = xdr_reserve_space(xdr, 7 * sizeof(__be32) + - NFS4_DEVICEID4_SIZE); - if (!p) { - printk("%s: out of space for extent list\n", __func__); + (*count)++; + if (*count * BL_EXTENT_SIZE > buffer_size) { + /* keep counting.. */ ret = -ENOSPC; - break; + continue; } p = xdr_encode_opaque_fixed(p, be->be_devid.data, @@ -501,15 +492,75 @@ ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr) *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); be->be_tag = EXTENT_COMMITTING; - count++; } spin_unlock(&bl->bl_ext_lock); - xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); - xdr_start[1] = cpu_to_be32(count); + return ret; +} + +int +ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); + size_t count = 0, buffer_size = PAGE_SIZE; + __be32 *start_p; + int ret; + + dprintk("%s enter\n", __func__); + + arg->layoutupdate_page = alloc_page(GFP_NOFS); + if (!arg->layoutupdate_page) + return -ENOMEM; + start_p = page_address(arg->layoutupdate_page); + arg->layoutupdate_pages = &arg->layoutupdate_page; + +retry: + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count); + if (unlikely(ret)) { + if (arg->layoutupdate_pages != &arg->layoutupdate_page) { + int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i; + + for (i = 0; i < nr_pages; i++) + put_page(arg->layoutupdate_pages[i]); + kfree(arg->layoutupdate_pages); + } else { + put_page(arg->layoutupdate_page); + } + + buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count; + count = 0; + + arg->layoutupdate_pages = + kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE), + sizeof(struct page *), GFP_NOFS); + if (!arg->layoutupdate_pages) + return -ENOMEM; + + start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + if (!start_p) { + kfree(arg->layoutupdate_pages); + return -ENOMEM; + } + + goto retry; + } + + *start_p = cpu_to_be32(count); + arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count; + + if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { + __be32 *p = start_p; + int i = 0; + + for (p = start_p; + p < start_p + arg->layoutupdate_len; + p += PAGE_SIZE) { + arg->layoutupdate_pages[i++] = vmalloc_to_page(p); + } + } dprintk("%s found %i ranges\n", __func__, count); - return ret; + return 0; } void
Instead of overflowing the XDR send buffer with our extent list allocate pages and pre-encode the layoutupdate payload into them. We optimistically allocate a single page use alloc_page and only switch to vmalloc when we have more extents outstanding. Currently there is only a single testcase (xfstests generic/113) which can reproduce large enough extent lists for this to occur. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/nfs/blocklayout/blocklayout.c | 9 ++-- fs/nfs/blocklayout/blocklayout.h | 6 ++- fs/nfs/blocklayout/extent_tree.c | 89 +++++++++++++++++++++++++++++++--------- 3 files changed, 78 insertions(+), 26 deletions(-)