Message ID | 1481230746-16741-4-git-send-email-sstabellini@kernel.org (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
Stefano Stabellini wrote on Thu, Dec 08, 2016: > If the read is an async operation, send a 9p request and return > EIOCBQUEUED. Do not wait for completion. > > Complete the read operation from a callback instead. > > Signed-off-by: Stefano Stabellini <sstabellini@kernel.org> > --- > net/9p/client.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 86 insertions(+), 2 deletions(-) > > diff --git a/net/9p/client.c b/net/9p/client.c > index eb589ef..f9f09db 100644 > --- a/net/9p/client.c > +++ b/net/9p/client.c > @@ -28,6 +28,7 @@ > #include <linux/module.h> > #include <linux/errno.h> > #include <linux/fs.h> > +#include <linux/pagemap.h> > #include <linux/poll.h> > #include <linux/idr.h> > #include <linux/mutex.h> > @@ -1554,13 +1555,68 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) > } > EXPORT_SYMBOL(p9_client_unlinkat); > > +static void > +p9_client_read_complete(struct p9_client *clnt, struct p9_req_t *req, int status) > +{ > + int err, count, n, i, total = 0; > + char *dataptr, *to; > + > + if (req->status == REQ_STATUS_ERROR) { > + p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); > + err = req->t_err; > + goto out; > + } > + err = p9_check_errors(clnt, req); > + if (err) > + goto out; > + > + err = p9pdu_readf(req->rc, clnt->proto_version, > + "D", &count, &dataptr); > + if (err) { > + trace_9p_protocol_dump(clnt, req->rc); > + goto out; > + } > + if (!count) { > + p9_debug(P9_DEBUG_ERROR, "count=%d\n", count); > + err = 0; > + goto out; > + } > + > + p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); > + if (count > req->rsize) > + count = req->rsize; > + > + for (i = 0; i < ((req->rsize + PAGE_SIZE - 1) / PAGE_SIZE); i++) { > + to = kmap(req->pagevec[i]); > + to += req->offset; > + n = PAGE_SIZE - req->offset; > + if (n > count) > + n = count; > + memcpy(to, dataptr, n); > + kunmap(req->pagevec[i]); > + req->offset = 0; > + count -= n; > + total += n; > + } > + > + err = total; > + req->kiocb->ki_pos += total; > + > +out: > + req->kiocb->ki_complete(req->kiocb, err, 0); > + > + release_pages(req->pagevec, (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE, false); > + kvfree(req->pagevec); > + p9_free_req(clnt, req); > +} > + > int > p9_client_read(struct p9_fid *fid, struct kiocb *iocb, u64 offset, > struct iov_iter *to, int *err) > { > struct p9_client *clnt = fid->clnt; > struct p9_req_t *req; > - int total = 0; > + int total = 0, i; > *err = 0; > > p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", > @@ -1587,10 +1643,38 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) > req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize, > 0, 11, "dqd", fid->fid, > offset, rsize); > - } else { > + /* sync request */ > + } else if(iocb == NULL || is_sync_kiocb(iocb)) { > non_zc = 1; > req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, > rsize); > + /* async request */ > + } else { I'm not too familiar with iocb/how async IOs should work, but a logic question just to make sure that has been thought out: We prefer zc here to async, even if zc can be slow? Ideally at some point zc and async aren't exclusive so we'll have async zc and async normal, but for now I'd say async comes before zc - yes there will be an extra copy in memory, but it will be done asynchronously. Was it intentional to prefer zc here? > + req = p9_client_get_req(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize); > + if (IS_ERR(req)) { > + *err = PTR_ERR(req); > + break; > + } > + req->rsize = iov_iter_get_pages_alloc(to, &req->pagevec, > + (size_t)rsize, &req->offset); > + req->kiocb = iocb; > + for (i = 0; i < req->rsize; i += PAGE_SIZE) > + page_cache_get_speculative(req->pagevec[i/PAGE_SIZE]); > + req->callback = p9_client_read_complete; > + > + *err = clnt->trans_mod->request(clnt, req); > + if (*err < 0) { > + clnt->status = Disconnected; > + release_pages(req->pagevec, > + (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE, > + true); > + kvfree(req->pagevec); > + p9_free_req(clnt, req); > + break; > + } > + > + *err = -EIOCBQUEUED; > + break; (Just a note to myself (or anyone who wants to do this) that a couple of places only look at err if size written is 0... They should check if err has been set) > } > if (IS_ERR(req)) { > *err = PTR_ERR(req); ------------------------------------------------------------------------------ Developer Access Program for Intel Xeon Phi Processors Access to Intel Xeon Phi processor-based developer platforms. With one year of Intel Parallel Studio XE. Training and support from Colfax. Order your platform today.http://sdm.link/xeonphi
On Fri, 9 Dec 2016, Dominique Martinet wrote: > Stefano Stabellini wrote on Thu, Dec 08, 2016: > > If the read is an async operation, send a 9p request and return > > EIOCBQUEUED. Do not wait for completion. > > > > Complete the read operation from a callback instead. > > > > Signed-off-by: Stefano Stabellini <sstabellini@kernel.org> > > --- > > net/9p/client.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- > > 1 file changed, 86 insertions(+), 2 deletions(-) > > > > diff --git a/net/9p/client.c b/net/9p/client.c > > index eb589ef..f9f09db 100644 > > --- a/net/9p/client.c > > +++ b/net/9p/client.c > > @@ -28,6 +28,7 @@ > > #include <linux/module.h> > > #include <linux/errno.h> > > #include <linux/fs.h> > > +#include <linux/pagemap.h> > > #include <linux/poll.h> > > #include <linux/idr.h> > > #include <linux/mutex.h> > > @@ -1554,13 +1555,68 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) > > } > > EXPORT_SYMBOL(p9_client_unlinkat); > > > > +static void > > +p9_client_read_complete(struct p9_client *clnt, struct p9_req_t *req, int status) > > +{ > > + int err, count, n, i, total = 0; > > + char *dataptr, *to; > > + > > + if (req->status == REQ_STATUS_ERROR) { > > + p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); > > + err = req->t_err; > > + goto out; > > + } > > + err = p9_check_errors(clnt, req); > > + if (err) > > + goto out; > > + > > + err = p9pdu_readf(req->rc, clnt->proto_version, > > + "D", &count, &dataptr); > > + if (err) { > > + trace_9p_protocol_dump(clnt, req->rc); > > + goto out; > > + } > > + if (!count) { > > + p9_debug(P9_DEBUG_ERROR, "count=%d\n", count); > > + err = 0; > > + goto out; > > + } > > + > > + p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); > > + if (count > req->rsize) > > + count = req->rsize; > > + > > + for (i = 0; i < ((req->rsize + PAGE_SIZE - 1) / PAGE_SIZE); i++) { > > + to = kmap(req->pagevec[i]); > > + to += req->offset; > > + n = PAGE_SIZE - req->offset; > > + if (n > count) > > + n = count; > > + memcpy(to, dataptr, n); > > + kunmap(req->pagevec[i]); > > + req->offset = 0; > > + count -= n; > > + total += n; > > + } > > + > > + err = total; > > + req->kiocb->ki_pos += total; > > + > > +out: > > + req->kiocb->ki_complete(req->kiocb, err, 0); > > + > > + release_pages(req->pagevec, (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE, false); > > + kvfree(req->pagevec); > > + p9_free_req(clnt, req); > > +} > > + > > int > > p9_client_read(struct p9_fid *fid, struct kiocb *iocb, u64 offset, > > struct iov_iter *to, int *err) > > { > > struct p9_client *clnt = fid->clnt; > > struct p9_req_t *req; > > - int total = 0; > > + int total = 0, i; > > *err = 0; > > > > p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", > > @@ -1587,10 +1643,38 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) > > req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize, > > 0, 11, "dqd", fid->fid, > > offset, rsize); > > - } else { > > + /* sync request */ > > + } else if(iocb == NULL || is_sync_kiocb(iocb)) { > > non_zc = 1; > > req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, > > rsize); > > + /* async request */ > > + } else { > > I'm not too familiar with iocb/how async IOs should work, but a logic > question just to make sure that has been thought out: > We prefer zc here to async, even if zc can be slow? > > Ideally at some point zc and async aren't exclusive so we'll have async > zc and async normal, but for now I'd say async comes before zc - yes > there will be an extra copy in memory, but it will be done > asynchronously. > Was it intentional to prefer zc here? I wasn't sure what to do about zc. The backends I am testing with don't support zc, so I didn't feel confident in changing its behavior. I think whether zc is faster than async+copy depends on the specific benchmark. iodepth and blocksize parameters in fio, for example. With iodepth=1, zc would be faster, the higher the iodepth, the faster async+copy would become in comparison. At some point async+copy will be faster than zc, but I am not sure where is the threshold, it would probably be storage backend dependent too. Maybe around iodepth=3. This is a reasonable guess but I haven't run any numbers to confirm it. That said, I am happy to follow any strategy you suggest in regards to zc. > > + req = p9_client_get_req(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize); > > + if (IS_ERR(req)) { > > + *err = PTR_ERR(req); > > + break; > > + } > > + req->rsize = iov_iter_get_pages_alloc(to, &req->pagevec, > > + (size_t)rsize, &req->offset); > > + req->kiocb = iocb; > > + for (i = 0; i < req->rsize; i += PAGE_SIZE) > > + page_cache_get_speculative(req->pagevec[i/PAGE_SIZE]); > > + req->callback = p9_client_read_complete; > > + > > + *err = clnt->trans_mod->request(clnt, req); > > + if (*err < 0) { > > + clnt->status = Disconnected; > > + release_pages(req->pagevec, > > + (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE, > > + true); > > + kvfree(req->pagevec); > > + p9_free_req(clnt, req); > > + break; > > + } > > + > > + *err = -EIOCBQUEUED; > > + break; > > (Just a note to myself (or anyone who wants to do this) that a couple of > places only look at err if size written is 0... They should check if err > has been set) > > > } > > if (IS_ERR(req)) { > > *err = PTR_ERR(req); > ------------------------------------------------------------------------------ Developer Access Program for Intel Xeon Phi Processors Access to Intel Xeon Phi processor-based developer platforms. With one year of Intel Parallel Studio XE. Training and support from Colfax. Order your platform today.http://sdm.link/xeonphi
On Thu, Dec 08, 2016 at 12:59:05PM -0800, Stefano Stabellini wrote: > + } else { > + req = p9_client_get_req(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize); > + if (IS_ERR(req)) { > + *err = PTR_ERR(req); > + break; > + } > + req->rsize = iov_iter_get_pages_alloc(to, &req->pagevec, > + (size_t)rsize, &req->offset); > + req->kiocb = iocb; > + for (i = 0; i < req->rsize; i += PAGE_SIZE) > + page_cache_get_speculative(req->pagevec[i/PAGE_SIZE]); > + req->callback = p9_client_read_complete; > + > + *err = clnt->trans_mod->request(clnt, req); > + if (*err < 0) { > + clnt->status = Disconnected; > + release_pages(req->pagevec, > + (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE, > + true); > + kvfree(req->pagevec); > + p9_free_req(clnt, req); > + break; > + } > + > + *err = -EIOCBQUEUED; IDGI. AFAICS, your code will result in shitloads of short reads - every time when you give it a multi-iovec array, only the first one will be issued and the rest won't be even looked at. Sure, it is technically legal, but I very much doubt that aio users will be happy with that. What am I missing here? ------------------------------------------------------------------------------ Developer Access Program for Intel Xeon Phi Processors Access to Intel Xeon Phi processor-based developer platforms. With one year of Intel Parallel Studio XE. Training and support from Colfax. Order your platform today.http://sdm.link/xeonphi
Hi Al, thanks for looking at the patch. On Sat, 10 Dec 2016, Al Viro wrote: > On Thu, Dec 08, 2016 at 12:59:05PM -0800, Stefano Stabellini wrote: > > > > + } else { > > + req = p9_client_get_req(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize); > > + if (IS_ERR(req)) { > > + *err = PTR_ERR(req); > > + break; > > + } > > + req->rsize = iov_iter_get_pages_alloc(to, &req->pagevec, > > + (size_t)rsize, &req->offset); > > + req->kiocb = iocb; > > + for (i = 0; i < req->rsize; i += PAGE_SIZE) > > + page_cache_get_speculative(req->pagevec[i/PAGE_SIZE]); > > + req->callback = p9_client_read_complete; > > + > > + *err = clnt->trans_mod->request(clnt, req); > > + if (*err < 0) { > > + clnt->status = Disconnected; > > + release_pages(req->pagevec, > > + (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE, > > + true); > > + kvfree(req->pagevec); > > + p9_free_req(clnt, req); > > + break; > > + } > > + > > + *err = -EIOCBQUEUED; > > IDGI. AFAICS, your code will result in shitloads of short reads - every > time when you give it a multi-iovec array, only the first one will be > issued and the rest won't be even looked at. Sure, it is technically > legal, but I very much doubt that aio users will be happy with that. > > What am I missing here? There is a problem with short reads, but I don't think it is the one you described, unless I made a mistake somewhere. The code above will issue one request as big as possible (size is limited by clnt->msize, which is the size of the channel). No matter how many segs in the iov_iter, the request will cover the maximum amount of bytes allowed by the channel, typically 4K but can be larger. So multi-iovec arrays should be handled correctly up to msize. Please let me know if I am wrong, I am not an expert on this. I tried, but couldn't actually get any multi-iovec arrays in my tests. That said, reading the code again, there are indeed two possible causes of short reads. The first one are responses which have a smaller byte count than requested. Currently this case is not handled, forwarding the short read up to the user. But I wrote and tested successfully a patch that issues another follow-up request from the completion function. Example: p9_client_read, issue request, size 4K p9_client_read_complete, receive response, count 2K p9_client_read_complete, issue request size 4K-2K = 2K p9_client_read_complete, receive response size 2K p9_client_read_complete, call ki_complete The second possible cause of short reads are requests which are bigger than msize. For example a 2MB read over a 4K channel. In this patch we simply issue one request as big as msize, and eventually return to the caller a smaller byte count. One option is to simply fall back to sync IO in these cases. Another solution is to use the same technique described above: we issue the first request as big as msize, then, from the callback function, we issue a follow-up request, and again, and again, until we fully complete the large read. This way, although we are not issuing any simultaneous requests for a specific large read, at least we can issue other aio requests in parallel for other reads or writes. It should still be more performant than sync IO. I am thinking of implementing this second option in the next version of the series. ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, SlashDot.org! http://sdm.link/slashdot
If the user asked for more than msize-iohdrsz (or rather iounit) bytes, you should do your best to read as much as possible before you return to user space. That means that if a read returns the maximum possible count you HAVE to issue another read until you get a short read. What Al is hinting at is that you can issue multiple read requests simultaneously, assuming that most of them will return data. So the way I see your options are two, with an additional tweak. The first option is to issue more read calls when the previous ones complete (your second option). The second option is at the beginning to issue all the read calls simultaneously. And the tweak is to do something in between and issue up to a limit of simultaneous read calls. Thanks, Lucho On Mon, Dec 12, 2016 at 6:15 PM, Stefano Stabellini <sstabellini@kernel.org> wrote: > Hi Al, > thanks for looking at the patch. > > On Sat, 10 Dec 2016, Al Viro wrote: >> On Thu, Dec 08, 2016 at 12:59:05PM -0800, Stefano Stabellini wrote: >> >> >> > + } else { >> > + req = p9_client_get_req(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize); >> > + if (IS_ERR(req)) { >> > + *err = PTR_ERR(req); >> > + break; >> > + } >> > + req->rsize = iov_iter_get_pages_alloc(to, &req->pagevec, >> > + (size_t)rsize, &req->offset); >> > + req->kiocb = iocb; >> > + for (i = 0; i < req->rsize; i += PAGE_SIZE) >> > + page_cache_get_speculative(req->pagevec[i/PAGE_SIZE]); >> > + req->callback = p9_client_read_complete; >> > + >> > + *err = clnt->trans_mod->request(clnt, req); >> > + if (*err < 0) { >> > + clnt->status = Disconnected; >> > + release_pages(req->pagevec, >> > + (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE, >> > + true); >> > + kvfree(req->pagevec); >> > + p9_free_req(clnt, req); >> > + break; >> > + } >> > + >> > + *err = -EIOCBQUEUED; >> >> IDGI. AFAICS, your code will result in shitloads of short reads - every >> time when you give it a multi-iovec array, only the first one will be >> issued and the rest won't be even looked at. Sure, it is technically >> legal, but I very much doubt that aio users will be happy with that. >> >> What am I missing here? > > There is a problem with short reads, but I don't think it is the one > you described, unless I made a mistake somewhere. > > The code above will issue one request as big as possible (size is > limited by clnt->msize, which is the size of the channel). No matter how > many segs in the iov_iter, the request will cover the maximum amount of > bytes allowed by the channel, typically 4K but can be larger. So > multi-iovec arrays should be handled correctly up to msize. Please let > me know if I am wrong, I am not an expert on this. I tried, but couldn't > actually get any multi-iovec arrays in my tests. > > > That said, reading the code again, there are indeed two possible causes > of short reads. The first one are responses which have a smaller byte > count than requested. Currently this case is not handled, forwarding > the short read up to the user. But I wrote and tested successfully a > patch that issues another follow-up request from the completion > function. Example: > p9_client_read, issue request, size 4K > p9_client_read_complete, receive response, count 2K > p9_client_read_complete, issue request size 4K-2K = 2K > p9_client_read_complete, receive response size 2K > p9_client_read_complete, call ki_complete > > > The second possible cause of short reads are requests which are bigger > than msize. For example a 2MB read over a 4K channel. In this patch we > simply issue one request as big as msize, and eventually return to the > caller a smaller byte count. One option is to simply fall back to sync > IO in these cases. Another solution is to use the same technique > described above: we issue the first request as big as msize, then, from > the callback function, we issue a follow-up request, and again, and > again, until we fully complete the large read. This way, although we are > not issuing any simultaneous requests for a specific large read, at > least we can issue other aio requests in parallel for other reads or > writes. It should still be more performant than sync IO. > > I am thinking of implementing this second option in the next version of > the series. ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, SlashDot.org! http://sdm.link/slashdot
diff --git a/net/9p/client.c b/net/9p/client.c index eb589ef..f9f09db 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -28,6 +28,7 @@ #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/pagemap.h> #include <linux/poll.h> #include <linux/idr.h> #include <linux/mutex.h> @@ -1554,13 +1555,68 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) } EXPORT_SYMBOL(p9_client_unlinkat); +static void +p9_client_read_complete(struct p9_client *clnt, struct p9_req_t *req, int status) +{ + int err, count, n, i, total = 0; + char *dataptr, *to; + + if (req->status == REQ_STATUS_ERROR) { + p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); + err = req->t_err; + goto out; + } + err = p9_check_errors(clnt, req); + if (err) + goto out; + + err = p9pdu_readf(req->rc, clnt->proto_version, + "D", &count, &dataptr); + if (err) { + trace_9p_protocol_dump(clnt, req->rc); + goto out; + } + if (!count) { + p9_debug(P9_DEBUG_ERROR, "count=%d\n", count); + err = 0; + goto out; + } + + p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); + if (count > req->rsize) + count = req->rsize; + + for (i = 0; i < ((req->rsize + PAGE_SIZE - 1) / PAGE_SIZE); i++) { + to = kmap(req->pagevec[i]); + to += req->offset; + n = PAGE_SIZE - req->offset; + if (n > count) + n = count; + memcpy(to, dataptr, n); + kunmap(req->pagevec[i]); + req->offset = 0; + count -= n; + total += n; + } + + err = total; + req->kiocb->ki_pos += total; + +out: + req->kiocb->ki_complete(req->kiocb, err, 0); + + release_pages(req->pagevec, (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE, false); + kvfree(req->pagevec); + p9_free_req(clnt, req); +} + int p9_client_read(struct p9_fid *fid, struct kiocb *iocb, u64 offset, struct iov_iter *to, int *err) { struct p9_client *clnt = fid->clnt; struct p9_req_t *req; - int total = 0; + int total = 0, i; *err = 0; p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", @@ -1587,10 +1643,38 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize, 0, 11, "dqd", fid->fid, offset, rsize); - } else { + /* sync request */ + } else if(iocb == NULL || is_sync_kiocb(iocb)) { non_zc = 1; req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize); + /* async request */ + } else { + req = p9_client_get_req(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize); + if (IS_ERR(req)) { + *err = PTR_ERR(req); + break; + } + req->rsize = iov_iter_get_pages_alloc(to, &req->pagevec, + (size_t)rsize, &req->offset); + req->kiocb = iocb; + for (i = 0; i < req->rsize; i += PAGE_SIZE) + page_cache_get_speculative(req->pagevec[i/PAGE_SIZE]); + req->callback = p9_client_read_complete; + + *err = clnt->trans_mod->request(clnt, req); + if (*err < 0) { + clnt->status = Disconnected; + release_pages(req->pagevec, + (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE, + true); + kvfree(req->pagevec); + p9_free_req(clnt, req); + break; + } + + *err = -EIOCBQUEUED; + break; } if (IS_ERR(req)) { *err = PTR_ERR(req);
If the read is an async operation, send a 9p request and return EIOCBQUEUED. Do not wait for completion. Complete the read operation from a callback instead. Signed-off-by: Stefano Stabellini <sstabellini@kernel.org> --- net/9p/client.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 2 deletions(-)