| Message ID | 1484053051-23685-1-git-send-email-jlayton@redhat.com (mailing list archive) |
|---|---|
| State | New, archived |
> On 10 Jan 2017, at 20:57, Jeff Layton <jlayton@redhat.com> wrote:
>
> v2: fix bug in offset handling in iov_iter_pvec_size
>
> xfstest generic/095 triggers soft lockups in kcephfs. Basically it uses
> fio to drive some I/O via vmsplice and splice. Ceph then ends up trying
> to access an ITER_BVEC type iov_iter as an ITER_IOVEC one. That causes it
> to pick up the wrong offset and get stuck in an infinite loop while trying
> to populate the page array. dio_get_pagev_size has a similar problem.
>
> To fix the first problem, add a new iov_iter helper to determine the
> offset into the page for the current segment and have ceph call that.
> I would just replace dio_get_pages_alloc with iov_iter_get_pages_alloc,
> but that will only return a single page at a time for ITER_BVEC and
> it's better to make larger requests when possible.
>
> For the second problem, we simply replace it with a new helper that does
> what it does, but properly for all iov_iter types.
>
> Since we're moving that into generic code, we can also utilize the
> iterate_all_kinds macro to simplify this. That means that we need to
> rework the logic a bit since we can't advance to the next vector while
> checking the current one.
>
> Link: http://tracker.ceph.com/issues/18130
> Cc: "Zhu, Caifeng" <zhucaifeng@unissoft-nj.com>
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
>  fs/ceph/file.c      | 28 ++-------------------
>  include/linux/uio.h |  2 ++
>  lib/iov_iter.c      | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 75 insertions(+), 26 deletions(-)
> [...]

Reviewed-by: Yan, Zheng <zyan@redhat.com>
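Editorial aside on why the old code misbehaves (illustration only, not part of the patch): the iov, kvec, bvec and pipe pointers in struct iov_iter share a union, so dereferencing it->iov on an ITER_BVEC iterator silently reinterprets an array of struct bio_vec as an array of struct iovec. The struct below is abridged from include/linux/uio.h of this era; the old ceph line is quoted from the diff further down.

/*
 * Abridged from include/linux/uio.h of this era (illustration only).
 */
struct iov_iter {
        int type;               /* ITER_IOVEC / ITER_KVEC / ITER_BVEC / ITER_PIPE, plus direction */
        size_t iov_offset;
        size_t count;
        union {
                const struct iovec *iov;        /* { void __user *iov_base; size_t iov_len; } */
                const struct kvec *kvec;
                const struct bio_vec *bvec;     /* { struct page *bv_page; bv_len; bv_offset; } */
                struct pipe_inode_info *pipe;
        };
        /* ... */
};

/*
 * The old ceph code assumed ITER_IOVEC:
 *
 *      align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
 *              (PAGE_SIZE - 1);
 *
 * For an ITER_BVEC iterator, it->iov really points at bio_vec entries, so
 * "iov_base" reads the bv_page pointer: the computed "alignment" is just the
 * low bits of a kernel pointer, the page count comes out wrong, and the loop
 * that populates the page array never makes progress -- the soft lockup that
 * generic/095 triggers.
 */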
On Tue, Jan 10, 2017 at 07:57:31AM -0500, Jeff Layton wrote:
> v2: fix bug in offset handling in iov_iter_pvec_size
>
> xfstest generic/095 triggers soft lockups in kcephfs. Basically it uses
> fio to drive some I/O via vmsplice and splice. Ceph then ends up trying
> to access an ITER_BVEC type iov_iter as an ITER_IOVEC one. That causes it
> to pick up the wrong offset and get stuck in an infinite loop while trying
> to populate the page array. dio_get_pagev_size has a similar problem.
>
> To fix the first problem, add a new iov_iter helper to determine the
> offset into the page for the current segment and have ceph call that.
> I would just replace dio_get_pages_alloc with iov_iter_get_pages_alloc,
> but that will only return a single page at a time for ITER_BVEC and
> it's better to make larger requests when possible.
>
> For the second problem, we simply replace it with a new helper that does
> what it does, but properly for all iov_iter types.
>
> Since we're moving that into generic code, we can also utilize the
> iterate_all_kinds macro to simplify this. That means that we need to
> rework the logic a bit since we can't advance to the next vector while
> checking the current one.

Yecchhh...  That really looks like exposing way too low-level stuff instead
of coming up with saner primitive ;-/

Is page vector + offset in the first page + number of bytes really what
ceph wants?  Would e.g. an array of bio_vec be saner?  Because _that_
would make a lot more natural iov_iter_get_pages_alloc() analogue...

And yes, I realize that you have ->pages wired into the struct
ceph_osd_request; how painful would it be to have it switched to struct
bio_vec array instead?
On Thu, Jan 12, 2017 at 8:59 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Tue, Jan 10, 2017 at 07:57:31AM -0500, Jeff Layton wrote:
>> v2: fix bug in offset handling in iov_iter_pvec_size
>> [...]
>
> Yecchhh...  That really looks like exposing way too low-level stuff instead
> of coming up with saner primitive ;-/
>
> Is page vector + offset in the first page + number of bytes really what
> ceph wants?  Would e.g. an array of bio_vec be saner?  Because _that_
> would make a lot more natural iov_iter_get_pages_alloc() analogue...
>
> And yes, I realize that you have ->pages wired into the struct ceph_osd_request;
> how painful would it be to have it switched to struct bio_vec array instead?

It would be a significant and wide-reaching change, but I've been
meaning to look into switching to iov_iter for a couple of releases
now.  There is a lot of ugly code in net/ceph/messenger.c to handle
iteration over "page vectors", "page lists" and "bio lists".  All of it
predates iov_iter proliferation and is mostly incomplete anyway: IIRC
you can send out of a pagelist but can't recv into a pagelist, etc.

That said, Jeff's patch doesn't look too bad to me...

Thanks,

                Ilya
On Thu, 2017-01-12 at 07:59 +0000, Al Viro wrote:
> On Tue, Jan 10, 2017 at 07:57:31AM -0500, Jeff Layton wrote:
> >
> > v2: fix bug in offset handling in iov_iter_pvec_size
> > [...]
>
> Yecchhh...  That really looks like exposing way too low-level stuff instead
> of coming up with saner primitive ;-/

Fair point. That said, I'm not terribly thrilled with how
iov_iter_get_pages* works right now.

Note that it only ever touches the first vector. Would it not be better
to keep getting page references if the bvec/iov elements are aligned
properly? It seems quite plausible that they often would be, and being
able to hand back a larger list of pages in most cases would be
advantageous.

IOW, should we have iov_iter_get_pages basically do what
dio_get_pages_alloc does -- try to build as long an array of pages as
possible before returning, provided that the alignment works out?

The NFS DIO code, for instance, could also benefit there. I know we've
had reports there in the past that sending down a bunch of small iovecs
causes a lot of small-sized requests on the wire.

> Is page vector + offset in the first page + number of bytes really what
> ceph wants?  Would e.g. an array of bio_vec be saner?  Because _that_
> would make a lot more natural iov_iter_get_pages_alloc() analogue...
>
> And yes, I realize that you have ->pages wired into the struct ceph_osd_request;
> how painful would it be to have it switched to struct bio_vec array instead?

Actually...it looks like that might not be too hard. The low-level OSD
handling code can already handle bio_vec arrays in order to service RBD.
It looks like we could switch cephfs to use
osd_req_op_extent_osd_data_bio instead of
osd_req_op_extent_osd_data_pages. That would add a dependency in cephfs
on CONFIG_BLOCK, but I think we could probably live with that.
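Editorial aside: a rough sketch of the behaviour Jeff describes above (the function name and exact semantics are invented for illustration; iov_iter_get_pages(), iov_iter_advance() and iov_iter_count() are existing primitives, and iov_iter_single_seg_page_offset() is the helper this patch adds). Unlike iov_iter_get_pages(), this sketch advances the iterator as it goes, and it only keeps spanning segments while they remain page-contiguous:

/*
 * Sketch only -- not an existing kernel API. Gather pages across segments
 * for as long as the result still fits in one page vector: the first
 * segment may start mid-page, every later segment must start page aligned,
 * and we stop as soon as a segment ends mid-page.
 */
static ssize_t iov_iter_get_pages_span(struct iov_iter *i, struct page **pages,
                                       unsigned int maxpages, size_t *pgoff)
{
        ssize_t total = 0;

        *pgoff = iov_iter_single_seg_page_offset(i);
        while (maxpages && iov_iter_count(i)) {
                ssize_t bytes;
                size_t off;
                unsigned int n;

                /* a later segment that isn't page aligned can't be merged */
                if (total && iov_iter_single_seg_page_offset(i))
                        break;

                bytes = iov_iter_get_pages(i, pages, (size_t)maxpages << PAGE_SHIFT,
                                           maxpages, &off);
                if (bytes <= 0)
                        return total ?: bytes;

                n = DIV_ROUND_UP(off + bytes, PAGE_SIZE);
                pages += n;
                maxpages -= n;
                total += bytes;
                iov_iter_advance(i, bytes);

                /* stop once a chunk ends mid-page; nothing after it can merge */
                if ((off + bytes) & ~PAGE_MASK)
                        break;
        }
        return total;
}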
On Thu, Jan 12, 2017 at 12:27 PM, Jeff Layton <jlayton@redhat.com> wrote:
> On Thu, 2017-01-12 at 07:59 +0000, Al Viro wrote:
> [...]
>> Is page vector + offset in the first page + number of bytes really what
>> ceph wants?  Would e.g. an array of bio_vec be saner?  Because _that_
>> would make a lot more natural iov_iter_get_pages_alloc() analogue...
>>
>> And yes, I realize that you have ->pages wired into the struct ceph_osd_request;
>> how painful would it be to have it switched to struct bio_vec array instead?
>
> Actually...it looks like that might not be too hard. The low-level OSD
> handling code can already handle bio_vec arrays in order to service RBD.
> It looks like we could switch cephfs to use
> osd_req_op_extent_osd_data_bio instead of
> osd_req_op_extent_osd_data_pages. That would add a dependency in cephfs
> on CONFIG_BLOCK, but I think we could probably live with that.

Ah, just that part might be easy enough ;)

Thanks,

                Ilya
On Thu, Jan 12, 2017 at 12:13:31PM +0100, Ilya Dryomov wrote:
> It would be a significant and wide-reaching change, but I've been
> meaning to look into switching to iov_iter for a couple of releases
> now.  There is a lot of ugly code in net/ceph/messenger.c to handle
> iteration over "page vectors", "page lists" and "bio lists".  All of it
> predates iov_iter proliferation and is mostly incomplete anyway: IIRC
> you can send out of a pagelist but can't recv into a pagelist, etc.

Wait a sec...  Is it done from the same thread that has issued a syscall?
If so, we certainly could just pass iov_iter without bothering with any
form of ..._get_pages(); if not, we'll need at least to get from iovec
to bio_vec, since userland addresses make sense only in the caller's
context...
On Thu, Jan 12, 2017 at 12:37 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, Jan 12, 2017 at 12:13:31PM +0100, Ilya Dryomov wrote:
>
>> It would be a significant and wide-reaching change, but I've been
>> meaning to look into switching to iov_iter for a couple of releases
>> now.  There is a lot of ugly code in net/ceph/messenger.c to handle
>> iteration over "page vectors", "page lists" and "bio lists".  All of it
>> predates iov_iter proliferation and is mostly incomplete anyway: IIRC
>> you can send out of a pagelist but can't recv into a pagelist, etc.
>
> Wait a sec...  Is it done from the same thread that has issued a syscall?
> If so, we certainly could just pass iov_iter without bothering with any
> form of ..._get_pages(); if not, we'll need at least to get from iovec
> to bio_vec, since userland addresses make sense only in the caller's
> context...

No, not necessarily - it's also used by rbd (all of net/ceph has two
users: fs/ceph and drivers/block/rbd.c).

Thanks,

                Ilya
On Thu, Jan 12, 2017 at 12:46:42PM +0100, Ilya Dryomov wrote:
> On Thu, Jan 12, 2017 at 12:37 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > On Thu, Jan 12, 2017 at 12:13:31PM +0100, Ilya Dryomov wrote:
> >
> >> It would be a significant and wide-reaching change, but I've been
> >> meaning to look into switching to iov_iter for a couple of releases
> >> now.  There is a lot of ugly code in net/ceph/messenger.c to handle
> >> iteration over "page vectors", "page lists" and "bio lists".  All of it
> >> predates iov_iter proliferation and is mostly incomplete anyway: IIRC
> >> you can send out of a pagelist but can't recv into a pagelist, etc.
> >
> > Wait a sec...  Is it done from the same thread that has issued a syscall?
> > If so, we certainly could just pass iov_iter without bothering with any
> > form of ..._get_pages(); if not, we'll need at least to get from iovec
> > to bio_vec, since userland addresses make sense only in the caller's
> > context...
>
> No, not necessarily - it's also used by rbd (all of net/ceph has two
> users: fs/ceph and drivers/block/rbd.c).

Yes, but rbd doesn't deal with userland-pointing iovecs at all, does it?
On Thu, 2017-01-12 at 12:46 +0100, Ilya Dryomov wrote:
> On Thu, Jan 12, 2017 at 12:37 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > On Thu, Jan 12, 2017 at 12:13:31PM +0100, Ilya Dryomov wrote:
> > >
> > > It would be a significant and wide-reaching change, but I've been
> > > meaning to look into switching to iov_iter for a couple of releases
> > > now.  There is a lot of ugly code in net/ceph/messenger.c to handle
> > > iteration over "page vectors", "page lists" and "bio lists".  All of it
> > > predates iov_iter proliferation and is mostly incomplete anyway: IIRC
> > > you can send out of a pagelist but can't recv into a pagelist, etc.
> >
> > Wait a sec...  Is it done from the same thread that has issued a syscall?
> > If so, we certainly could just pass iov_iter without bothering with any
> > form of ..._get_pages(); if not, we'll need at least to get from iovec
> > to bio_vec, since userland addresses make sense only in the caller's
> > context...
>
> No, not necessarily - it's also used by rbd (all of net/ceph has two
> users: fs/ceph and drivers/block/rbd.c).

...and note that the actual send/receive is done from workqueue context
(AFAICT), so we might be operating over the array from a completely
different thread context from where it was submitted. I think we do need
to get page references at the point of submission like this.
On Thu, Jan 12, 2017 at 12:53 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, Jan 12, 2017 at 12:46:42PM +0100, Ilya Dryomov wrote:
>> On Thu, Jan 12, 2017 at 12:37 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>> > On Thu, Jan 12, 2017 at 12:13:31PM +0100, Ilya Dryomov wrote:
>> >
>> >> It would be a significant and wide-reaching change, but I've been
>> >> meaning to look into switching to iov_iter for a couple of releases
>> >> now.  There is a lot of ugly code in net/ceph/messenger.c to handle
>> >> iteration over "page vectors", "page lists" and "bio lists".  All of it
>> >> predates iov_iter proliferation and is mostly incomplete anyway: IIRC
>> >> you can send out of a pagelist but can't recv into a pagelist, etc.
>> >
>> > Wait a sec...  Is it done from the same thread that has issued a syscall?
>> > If so, we certainly could just pass iov_iter without bothering with any
>> > form of ..._get_pages(); if not, we'll need at least to get from iovec
>> > to bio_vec, since userland addresses make sense only in the caller's
>> > context...
>>
>> No, not necessarily - it's also used by rbd (all of net/ceph has two
>> users: fs/ceph and drivers/block/rbd.c).
>
> Yes, but rbd doesn't deal with userland-pointing iovecs at all, does it?

Correct.  Normal I/O is all bios + currently it uses some of that custom
page vector/list machinery for maintenance operations (rbd metadata,
locks, etc).  No userland pointers whatsoever for rbd, but the messenger
(checksumming + {send,recv}{msg,page}) runs out of the kworker for both
rbd and cephfs.

Thanks,

                Ilya
On Thu, 2017-01-12 at 12:37 +0100, Ilya Dryomov wrote:
> On Thu, Jan 12, 2017 at 12:27 PM, Jeff Layton <jlayton@redhat.com> wrote:
> > On Thu, 2017-01-12 at 07:59 +0000, Al Viro wrote:
> > > [...]
> > >
> > > Is page vector + offset in the first page + number of bytes really what
> > > ceph wants?  Would e.g. an array of bio_vec be saner?  Because _that_
> > > would make a lot more natural iov_iter_get_pages_alloc() analogue...
> > >
> > > And yes, I realize that you have ->pages wired into the struct ceph_osd_request;
> > > how painful would it be to have it switched to struct bio_vec array instead?
> >
> > Actually...it looks like that might not be too hard. The low-level OSD
> > handling code can already handle bio_vec arrays in order to service RBD.
> > It looks like we could switch cephfs to use
> > osd_req_op_extent_osd_data_bio instead of
> > osd_req_op_extent_osd_data_pages. That would add a dependency in cephfs
> > on CONFIG_BLOCK, but I think we could probably live with that.
>
> Ah, just that part might be easy enough ;)

Yeah, that part doesn't look too bad.

Regardless though, I think we need to get a fix in for this sooner rather
than later, as it's trivial to get the kernel stuck in this loop today by
any user with write access to a ceph mount.

Al, when you mentioned switching this over to a bio_vec based interface,
were you planning to roll up the iov_iter->bio_vec array helper for this,
or should I be looking into doing that?

Thanks,
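Editorial aside: a rough sketch of what such an iov_iter-to-bio_vec helper might look like (the name and semantics are invented here; iov_iter_get_pages_alloc(), kcalloc(), put_page() and kvfree() are existing APIs). It pins the pages while the submitter's address space is still valid, so the resulting bio_vec array is safe to hand to the messenger kworker. Note that the thread's complaint still applies underneath: for ITER_IOVEC, iov_iter_get_pages_alloc() of this era stops at the first segment, so a real helper would likely loop.

/*
 * Sketch only: convert up to @maxsize bytes of @i into a freshly allocated,
 * pinned bio_vec array. Returns bytes covered, or a negative error. Does
 * not advance @i; a real helper would need to settle that, plus who calls
 * put_page() on completion.
 */
static ssize_t iov_iter_to_bvec_alloc(struct iov_iter *i, size_t maxsize,
                                      struct bio_vec **pbvec, int *pnum)
{
        struct page **pages;
        struct bio_vec *bv;
        size_t start, rem;
        ssize_t bytes;
        int n, k;

        bytes = iov_iter_get_pages_alloc(i, &pages, maxsize, &start);
        if (bytes <= 0)
                return bytes;

        n = DIV_ROUND_UP(start + bytes, PAGE_SIZE);
        bv = kcalloc(n, sizeof(*bv), GFP_KERNEL);
        if (!bv) {
                for (k = 0; k < n; k++)
                        put_page(pages[k]);
                kvfree(pages);
                return -ENOMEM;
        }

        rem = bytes;
        for (k = 0; k < n; k++) {
                bv[k].bv_page = pages[k];
                bv[k].bv_offset = start;
                bv[k].bv_len = min_t(size_t, rem, PAGE_SIZE - start);
                rem -= bv[k].bv_len;
                start = 0;      /* only the first page has a nonzero offset */
        }
        kvfree(pages);

        *pbvec = bv;
        *pnum = n;
        return bytes;
}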
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index f633165f3fdc..0cd9fc68a04e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -35,29 +35,6 @@
  */

 /*
- * Calculate the length sum of direct io vectors that can
- * be combined into one page vector.
- */
-static size_t dio_get_pagev_size(const struct iov_iter *it)
-{
-        const struct iovec *iov = it->iov;
-        const struct iovec *iovend = iov + it->nr_segs;
-        size_t size;
-
-        size = iov->iov_len - it->iov_offset;
-        /*
-         * An iov can be page vectored when both the current tail
-         * and the next base are page aligned.
-         */
-        while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
-               (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
-                size += iov->iov_len;
-        }
-        dout("dio_get_pagevlen len = %zu\n", size);
-        return size;
-}
-
-/*
  * Allocate a page vector based on (@it, @nbytes).
  * The return value is the tuple describing a page vector,
  * that is (@pages, @page_align, @num_pages).
@@ -71,8 +48,7 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
         struct page **pages;
         int ret = 0, idx, npages;

-        align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
-                (PAGE_SIZE - 1);
+        align = iov_iter_single_seg_page_offset(it);
         npages = calc_pages_for(align, nbytes);
         pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
         if (!pages) {
@@ -927,7 +903,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
         }

         while (iov_iter_count(iter) > 0) {
-                u64 size = dio_get_pagev_size(iter);
+                u64 size = iov_iter_pvec_size(iter);
                 size_t start = 0;
                 ssize_t len;

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 6e22b544d039..46fdfff3d7d6 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -83,6 +83,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
 size_t iov_iter_single_seg_count(const struct iov_iter *i);
+size_t iov_iter_single_seg_page_offset(const struct iov_iter *i);
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                          struct iov_iter *i);
 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
@@ -93,6 +94,7 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
 unsigned long iov_iter_alignment(const struct iov_iter *i);
 unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
+size_t iov_iter_pvec_size(const struct iov_iter *i);
 void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov,
                         unsigned long nr_segs, size_t count);
 void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 6b415b5a100d..b8fa377b0cef 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -745,6 +745,24 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
 }
 EXPORT_SYMBOL(iov_iter_single_seg_count);

+/* Return offset into page for current iov_iter segment */
+size_t iov_iter_single_seg_page_offset(const struct iov_iter *i)
+{
+        size_t base;
+
+        if (i->type & ITER_PIPE)
+                base = i->pipe->bufs[i->idx].offset;
+        else if (i->type & ITER_BVEC)
+                base = i->bvec->bv_offset;
+        else if (i->type & ITER_KVEC)
+                base = (size_t)i->kvec->iov_base;
+        else
+                base = (size_t)i->iov->iov_base;
+
+        return (base + i->iov_offset) & (PAGE_SIZE - 1);
+}
+EXPORT_SYMBOL(iov_iter_single_seg_page_offset);
+
 void iov_iter_kvec(struct iov_iter *i, int direction,
                    const struct kvec *kvec, unsigned long nr_segs,
                    size_t count)
@@ -830,6 +848,59 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
 }
 EXPORT_SYMBOL(iov_iter_gap_alignment);

+/**
+ * iov_iter_pvec_size - find length of page aligned iovecs in iov_iter
+ * @i: iov_iter to in which to find the size
+ *
+ * Some filesystems can stitch together multiple iovecs into a single
+ * page vector when both the previous tail and current base are page
+ * aligned. This function discovers the length that can fit in a single
+ * pagevec and returns it.
+ */
+size_t iov_iter_pvec_size(const struct iov_iter *i)
+{
+        size_t size = i->count;
+        size_t pv_size = 0;
+        bool contig = false, first = true;
+
+        if (!size)
+                return 0;
+
+        /* Pipes are naturally aligned for this */
+        if (unlikely(i->type & ITER_PIPE))
+                return size;
+
+        /*
+         * An iov can be page vectored when the current base and previous
+         * tail are both page aligned. Note that we don't require that the
+         * initial base in the first iovec also be page aligned.
+         */
+        iterate_all_kinds(i, size, v,
+                ({
+                 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
+                        pv_size += v.iov_len;
+                        first = false;
+                        contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
+                 }; 0;
+                 }),
+                ({
+                 if (first || (contig && v.bv_offset == 0)) {
+                        pv_size += v.bv_len;
+                        first = false;
+                        contig = PAGE_ALIGNED(v.bv_offset + v.bv_len);
+                 }
+                 }),
+                ({
+                 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
+                        pv_size += v.iov_len;
+                        first = false;
+                        contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
+                 }
+                 }))
+        return pv_size;
+}
+EXPORT_SYMBOL(iov_iter_pvec_size);
+
 static inline size_t __pipe_get_pages(struct iov_iter *i,
                                 size_t maxsize,
                                 struct page **pages,
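Editorial note on the "}; 0;" at the end of the first (iovec) callback above: iterate_all_kinds() evaluates the iovec step as a GCC statement expression and treats its value as the number of bytes the step left unprocessed, so that block must end in an expression, here 0; the bvec and kvec callbacks expand as plain statements and need no such value. A minimal user-space illustration of the statement-expression idiom (not kernel code; the variable names are invented):

/* Build with gcc or clang; statement expressions are a GNU extension. */
#include <stdio.h>

int main(void)
{
        int processed = 0;

        /* iterate_iovec() does roughly: left = (STEP); */
        int left = ({
                processed += 4096;      /* "STEP" body: account for the segment */
                0;                      /* value of the whole block: 0 bytes left */
        });

        printf("processed=%d left=%d\n", processed, left);
        return 0;
}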
v2: fix bug in offset handling in iov_iter_pvec_size

xfstest generic/095 triggers soft lockups in kcephfs. Basically it uses
fio to drive some I/O via vmsplice and splice. Ceph then ends up trying
to access an ITER_BVEC type iov_iter as an ITER_IOVEC one. That causes it
to pick up the wrong offset and get stuck in an infinite loop while trying
to populate the page array. dio_get_pagev_size has a similar problem.

To fix the first problem, add a new iov_iter helper to determine the
offset into the page for the current segment and have ceph call that.
I would just replace dio_get_pages_alloc with iov_iter_get_pages_alloc,
but that will only return a single page at a time for ITER_BVEC and
it's better to make larger requests when possible.

For the second problem, we simply replace it with a new helper that does
what it does, but properly for all iov_iter types.

Since we're moving that into generic code, we can also utilize the
iterate_all_kinds macro to simplify this. That means that we need to
rework the logic a bit since we can't advance to the next vector while
checking the current one.

Link: http://tracker.ceph.com/issues/18130
Cc: "Zhu, Caifeng" <zhucaifeng@unissoft-nj.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/ceph/file.c      | 28 ++-------------------
 include/linux/uio.h |  2 ++
 lib/iov_iter.c      | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 26 deletions(-)
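Finally, an editorial sketch of how the two new helpers combine in a direct-I/O style loop (the surrounding function is hypothetical and the request handling is elided; calc_pages_for() is the existing libceph helper, and this mirrors what ceph_direct_read_write() does after this patch):

/*
 * Hypothetical direct-I/O style loop using the helpers added above.
 * Each pass covers the largest span that can live in one page vector.
 */
static void issue_requests(struct iov_iter *iter)
{
        while (iov_iter_count(iter) > 0) {
                /* bytes that can be coalesced into a single page vector */
                size_t size = iov_iter_pvec_size(iter);
                /* offset into the first page of that vector */
                size_t align = iov_iter_single_seg_page_offset(iter);
                int npages = calc_pages_for(align, size);

                /*
                 * Allocate npages page pointers, fill them from the iter
                 * (dio_get_pages_alloc() in ceph), queue one OSD request
                 * covering "size" bytes, then move on.
                 */
                iov_iter_advance(iter, size);
        }
}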