| Message ID | 20250221205146.1210952-2-dw@davidwei.uk (mailing list archive) |
|---|---|
| State | New |
| Series | io_uring zc rx fixed len recvzc |
Just a few minor drive-by nits.

> @@ -1250,6 +1251,12 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  	zc->ifq = req->ctx->ifq;
>  	if (!zc->ifq)
>  		return -EINVAL;
> +	zc->len = READ_ONCE(sqe->len);
> +	if (zc->len == UINT_MAX)
> +		return -EINVAL;
> +	/* UINT_MAX means no limit on readlen */
> +	if (!zc->len)
> +		zc->len = UINT_MAX;

Why not just make UINT_MAX allowed, meaning no limit? Would avoid two
branches here, and as far as I can tell not really change anything in
terms of API niceness.

> @@ -1269,6 +1276,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>  {
>  	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
> +	bool limit = zc->len != UINT_MAX;
>  	struct socket *sock;
>  	int ret;

Doesn't seem to be used?

> @@ -1296,6 +1304,13 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>  		return IOU_OK;
>  	}
>
> +	if (zc->len == 0) {
> +		io_req_set_res(req, 0, 0);
> +
> +		if (issue_flags & IO_URING_F_MULTISHOT)
> +			return IOU_STOP_MULTISHOT;
> +		return IOU_OK;
> +	}
>  	if (issue_flags & IO_URING_F_MULTISHOT)
>  		return IOU_ISSUE_SKIP_COMPLETE;
>  	return -EAGAIN;

Might be cleaner as:

	ret = -EAGAIN;
	if (!zc->len) {
		io_req_set_res(req, 0, 0);
		ret = -IOU_OK;
	}
	if (issue_flags & IO_URING_F_MULTISHOT)
		return IOU_ISSUE_SKIP_COMPLETE;
	return ret;

rather than duplicate the flag checking.

>  static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>  			       struct sock *sk, int flags,
> -			       unsigned issue_flags)
> +			       unsigned issue_flags, unsigned int *outlen)
>  {
> +	unsigned int len = *outlen;
> +	bool limit = len != UINT_MAX;
>  	struct io_zcrx_args args = {
>  		.req = req,
>  		.ifq = ifq,
>  		.sock = sk->sk_socket,
>  	};
>  	read_descriptor_t rd_desc = {
> -		.count = 1,
> +		.count = len,
>  		.arg.data = &args,
>  	};
>  	int ret;
>
>  	lock_sock(sk);
>  	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
> +	if (limit && ret)
> +		*outlen = len - ret;
>  	if (ret <= 0) {
>  		if (ret < 0 || sock_flag(sk, SOCK_DONE))
>  			goto out;
> @@ -930,7 +937,7 @@ static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>  		ret = IOU_REQUEUE;
>  	} else if (sock_flag(sk, SOCK_DONE)) {
>  		/* Make it to retry until it finally gets 0. */
> -		if (issue_flags & IO_URING_F_MULTISHOT)
> +		if (!limit && (issue_flags & IO_URING_F_MULTISHOT))
>  			ret = IOU_REQUEUE;
>  		else
>  			ret = -EAGAIN;

In the two above hunks, the limit checking feels a bit hackish. For
example, why is it related to multishot or not? I think the patch would
benefit from being split into separate patches for singleshot and length
support. Eg one that adds singleshot support, and then one that adds
length capping on top. That would make it much easier to reason about
hunks like the above one.

> @@ -942,7 +949,7 @@ static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>
>  int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>  		 struct socket *sock, unsigned int flags,
> -		 unsigned issue_flags)
> +		 unsigned issue_flags, unsigned int *len)
>  {
>  	struct sock *sk = sock->sk;
>  	const struct proto *prot = READ_ONCE(sk->sk_prot);

Shouldn't recv support it too?
On 2/21/25 20:51, David Wei wrote:
> Currently only multishot recvzc requests are supported, but sometimes
> there is a need to do a single recv e.g. peeking at some data in the
> socket. Add single shot recvzc requests where IORING_RECV_MULTISHOT is
> _not_ set and the sqe->len field is set to the number of bytes to read
> N.

There is no oneshot, we need to change the message.

> There could be multiple completions containing data, like the multishot
> case, since N bytes could be split across multiple frags. This is
> followed by a final completion with res and cflags both set to 0 that
> indicate the completion of the request, or a -res that indicate an
> error.
>
> Signed-off-by: David Wei <dw@davidwei.uk>
> ---
>  io_uring/net.c  | 19 +++++++++++++++++--
>  io_uring/zcrx.c | 17 ++++++++++++-----
>  io_uring/zcrx.h |  2 +-
>  3 files changed, 30 insertions(+), 8 deletions(-)
>
> diff --git a/io_uring/net.c b/io_uring/net.c
> index 000dc70d08d0..cae34a24266c 100644
> --- a/io_uring/net.c
> +++ b/io_uring/net.c
> @@ -94,6 +94,7 @@ struct io_recvzc {
>  	struct file		*file;
>  	unsigned		msg_flags;
>  	u16			flags;
> +	u32			len;
>  	struct io_zcrx_ifq	*ifq;
>  };
>
> @@ -1241,7 +1242,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  	unsigned ifq_idx;
>
>  	if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr ||
> -		     sqe->len || sqe->addr3))
> +		     sqe->addr3))
>  		return -EINVAL;
>
>  	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
> @@ -1250,6 +1251,12 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  	zc->ifq = req->ctx->ifq;
>  	if (!zc->ifq)
>  		return -EINVAL;
> +	zc->len = READ_ONCE(sqe->len);
> +	if (zc->len == UINT_MAX)
> +		return -EINVAL;

The uapi gives u32, if we're using a special value it should
match the type. ~(u32)0

> +	/* UINT_MAX means no limit on readlen */
> +	if (!zc->len)
> +		zc->len = UINT_MAX;
>
>  	zc->flags = READ_ONCE(sqe->ioprio);
>  	zc->msg_flags = READ_ONCE(sqe->msg_flags);
> @@ -1269,6 +1276,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>  {
>  	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
> +	bool limit = zc->len != UINT_MAX;
>  	struct socket *sock;
>  	int ret;
>
> @@ -1281,7 +1289,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>  		return -ENOTSOCK;
>
>  	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
> -			   issue_flags);
> +			   issue_flags, &zc->len);
>  	if (unlikely(ret <= 0) && ret != -EAGAIN) {
>  		if (ret == -ERESTARTSYS)
>  			ret = -EINTR;
> @@ -1296,6 +1304,13 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>  		return IOU_OK;
>  	}
>
> +	if (zc->len == 0) {

If len hits zero we should always complete it, regardless of errors
the stack might have returned, so might be cleaner if you do that
check right after io_zcrx_recv().

> +		io_req_set_res(req, 0, 0);
> +
> +		if (issue_flags & IO_URING_F_MULTISHOT)
> +			return IOU_STOP_MULTISHOT;
> +		return IOU_OK;
> +	}
>  	if (issue_flags & IO_URING_F_MULTISHOT)
>  		return IOU_ISSUE_SKIP_COMPLETE;
>  	return -EAGAIN;
> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> index f2d326e18e67..74bca4e471bc 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
> @@ -817,6 +817,7 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
>  	int i, copy, end, off;
>  	int ret = 0;
>
> +	len = min_t(size_t, len, desc->count);
>  	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
>  		return -EAGAIN;
>
> @@ -894,26 +895,32 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
>  out:
>  	if (offset == start_off)
>  		return ret;
> +	if (desc->count != UINT_MAX)
> +		desc->count -= (offset - start_off);

I'd say just set desc->count to it's max value (size_t), and never
care about checking for limits after.

>  	return offset - start_off;
>  }
>
>  static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>  			       struct sock *sk, int flags,
> -			       unsigned issue_flags)
> +			       unsigned issue_flags, unsigned int *outlen)
>  {
> +	unsigned int len = *outlen;
> +	bool limit = len != UINT_MAX;
>  	struct io_zcrx_args args = {
>  		.req = req,
>  		.ifq = ifq,
>  		.sock = sk->sk_socket,
>  	};
>  	read_descriptor_t rd_desc = {
> -		.count = 1,
> +		.count = len,
>  		.arg.data = &args,
>  	};
>  	int ret;
>
>  	lock_sock(sk);
>  	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
> +	if (limit && ret)
> +		*outlen = len - ret;
>  	if (ret <= 0) {
>  		if (ret < 0 || sock_flag(sk, SOCK_DONE))
>  			goto out;
> @@ -930,7 +937,7 @@ static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>  		ret = IOU_REQUEUE;
>  	} else if (sock_flag(sk, SOCK_DONE)) {
>  		/* Make it to retry until it finally gets 0. */
> -		if (issue_flags & IO_URING_F_MULTISHOT)
> +		if (!limit && (issue_flags & IO_URING_F_MULTISHOT))
>  			ret = IOU_REQUEUE;

And with earlier len check in net.c you don't need this change, which
feels wrong, as it's only here to circumvent some handling in net.c,
I assume
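To make the "do that check right after io_zcrx_recv()" suggestion concrete, a
rough sketch of how io_recvzc() could complete the request as soon as the byte
budget is exhausted is shown below. This is an illustrative rearrangement of
the code quoted above, not something posted in the thread:

	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
			   issue_flags, &zc->len);

	/* Byte budget fully consumed: always post the terminating 0/0
	 * completion, regardless of what the stack returned this time. */
	if (!zc->len) {
		io_req_set_res(req, 0, 0);
		if (issue_flags & IO_URING_F_MULTISHOT)
			return IOU_STOP_MULTISHOT;
		return IOU_OK;
	}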
>> @@ -1250,6 +1251,12 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>  	zc->ifq = req->ctx->ifq;
>>  	if (!zc->ifq)
>>  		return -EINVAL;
>> +	zc->len = READ_ONCE(sqe->len);
>> +	if (zc->len == UINT_MAX)
>> +		return -EINVAL;
>
> The uapi gives u32, if we're using a special value it should
> match the type. ~(u32)0

Any syscall in Linux is capped at 2G anyway, so I think all of this
special meaning of ->len just needs to go away. Just ask for whatever
bytes you want, but yes more than 2G will not be supported anyway.
On 2/21/25 20:51, David Wei wrote:
> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> index f2d326e18e67..74bca4e471bc 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
...
>  static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>  			       struct sock *sk, int flags,
> -			       unsigned issue_flags)
> +			       unsigned issue_flags, unsigned int *outlen)
>  {
> +	unsigned int len = *outlen;
> +	bool limit = len != UINT_MAX;
>  	struct io_zcrx_args args = {
>  		.req = req,
>  		.ifq = ifq,
>  		.sock = sk->sk_socket,
>  	};
>  	read_descriptor_t rd_desc = {
> -		.count = 1,
> +		.count = len,
>  		.arg.data = &args,
>  	};
>  	int ret;
>
>  	lock_sock(sk);
>  	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
> +	if (limit && ret)
> +		*outlen = len - ret;

ret can be negative, the check will pass and the calculations will
turn it into something weird.
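As an illustration of the guard being asked for, the accounting could be keyed
on a positive return value, assuming the rest of the hunk stays as posted (a
sketch only, not the eventual fix):

	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
	/* tcp_read_sock() returns bytes consumed or a negative error;
	 * only charge the byte budget when bytes were actually consumed. */
	if (limit && ret > 0)
		*outlen = len - ret;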
On 2/22/25 00:08, Jens Axboe wrote:
> Just a few minor drive-by nits.
>
>> @@ -1250,6 +1251,12 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>  	zc->ifq = req->ctx->ifq;
>>  	if (!zc->ifq)
>>  		return -EINVAL;
>> +	zc->len = READ_ONCE(sqe->len);
>> +	if (zc->len == UINT_MAX)
>> +		return -EINVAL;
>> +	/* UINT_MAX means no limit on readlen */
>> +	if (!zc->len)
>> +		zc->len = UINT_MAX;
>
> Why not just make UINT_MAX allowed, meaning no limit? Would avoid two
> branches here, and as far as I can tell not really change anything in
> terms of API niceness.

I think 0 goes better as a special uapi value. It doesn't alter the
uapi, and commonly understood as "no limits", which is the opposite
to the other option, especially since UINT_MAX is not a max value for
an unlimited request, I'd easily expect it to drive more than 4GB.

>> @@ -1269,6 +1276,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>  int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>>  {
>>  	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
>> +	bool limit = zc->len != UINT_MAX;
>>  	struct socket *sock;
>>  	int ret;
>
> Doesn't seem to be used?
>
>> @@ -1296,6 +1304,13 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>>  		return IOU_OK;
>>  	}
>>
>> +	if (zc->len == 0) {
>> +		io_req_set_res(req, 0, 0);
>> +
>> +		if (issue_flags & IO_URING_F_MULTISHOT)
>> +			return IOU_STOP_MULTISHOT;
>> +		return IOU_OK;
>> +	}
>>  	if (issue_flags & IO_URING_F_MULTISHOT)
>>  		return IOU_ISSUE_SKIP_COMPLETE;
>>  	return -EAGAIN;
>
> Might be cleaner as:
>
> 	ret = -EAGAIN;
> 	if (!zc->len) {
> 		io_req_set_res(req, 0, 0);
> 		ret = -IOU_OK;
> 	}
> 	if (issue_flags & IO_URING_F_MULTISHOT)
> 		return IOU_ISSUE_SKIP_COMPLETE;
> 	return ret;
>
> rather than duplicate the flag checking.

You missed IOU_STOP_MULTISHOT, but even without it separate error
codes for polling is already an utter mess to try be smart about it.
I'll try to clean it up, but that's orthogonal.

...

>> @@ -930,7 +937,7 @@ static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>>  		ret = IOU_REQUEUE;
>>  	} else if (sock_flag(sk, SOCK_DONE)) {
>>  		/* Make it to retry until it finally gets 0. */
>> -		if (issue_flags & IO_URING_F_MULTISHOT)
>> +		if (!limit && (issue_flags & IO_URING_F_MULTISHOT))
>>  			ret = IOU_REQUEUE;
>>  		else
>>  			ret = -EAGAIN;
>
> In the two above hunks, the limit checking feels a bit hackish. For
> example, why is it related to multishot or not? I think the patch would
> benefit from being split into separate patches for singleshot and length

I agree with the statement, but fwiw there are no single shots and
can't be that, the message is misleading.

> support. Eg one that adds singleshot support, and then one that adds
> length capping on top. That would make it much easier to reason about
> hunks like the above one.
>
>> @@ -942,7 +949,7 @@ static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>>
>>  int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>>  		 struct socket *sock, unsigned int flags,
>> -		 unsigned issue_flags)
>> +		 unsigned issue_flags, unsigned int *len)
>>  {
>>  	struct sock *sk = sock->sk;
>>  	const struct proto *prot = READ_ONCE(sk->sk_prot);
>
> Shouldn't recv support it too?

There is no "msg" variant of the request, if that's what you mean.
On 2/22/25 00:52, Jens Axboe wrote:
>>> @@ -1250,6 +1251,12 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>  	zc->ifq = req->ctx->ifq;
>>>  	if (!zc->ifq)
>>>  		return -EINVAL;
>>> +	zc->len = READ_ONCE(sqe->len);
>>> +	if (zc->len == UINT_MAX)
>>> +		return -EINVAL;
>>
>> The uapi gives u32, if we're using a special value it should
>> match the type. ~(u32)0
>
> Any syscall in Linux is capped at 2G anyway, so I think all of this

I don't see how it related, you don't have to have a weird
00111111b as a special value.

> special meaning of ->len just needs to go away. Just ask for whatever
> bytes you want, but yes more than 2G will not be supported anyway.

That's not the case here, the request does support more than 2G,
it's just spread across multiple CQEs, and the limit accounts
for multiple CQEs.
On 2/21/25 6:01 PM, Pavel Begunkov wrote:
> On 2/22/25 00:08, Jens Axboe wrote:
>> Just a few minor drive-by nits.
>>
>>> @@ -1250,6 +1251,12 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>  	zc->ifq = req->ctx->ifq;
>>>  	if (!zc->ifq)
>>>  		return -EINVAL;
>>> +	zc->len = READ_ONCE(sqe->len);
>>> +	if (zc->len == UINT_MAX)
>>> +		return -EINVAL;
>>> +	/* UINT_MAX means no limit on readlen */
>>> +	if (!zc->len)
>>> +		zc->len = UINT_MAX;
>>
>> Why not just make UINT_MAX allowed, meaning no limit? Would avoid two
>> branches here, and as far as I can tell not really change anything in
>> terms of API niceness.
>
> I think 0 goes better as a special uapi value. It doesn't alter the
> uapi, and commonly understood as "no limits", which is the opposite
> to the other option, especially since UINT_MAX is not a max value for
> an unlimited request, I'd easily expect it to drive more than 4GB.

Yeah that's certainly better, and as you say also has the same (forced)
semantics for multishot.
On 2/21/25 6:06 PM, Pavel Begunkov wrote:
> On 2/22/25 00:52, Jens Axboe wrote:
>>>> @@ -1250,6 +1251,12 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>  	zc->ifq = req->ctx->ifq;
>>>>  	if (!zc->ifq)
>>>>  		return -EINVAL;
>>>> +	zc->len = READ_ONCE(sqe->len);
>>>> +	if (zc->len == UINT_MAX)
>>>> +		return -EINVAL;
>>>
>>> The uapi gives u32, if we're using a special value it should
>>> match the type. ~(u32)0
>>
>> Any syscall in Linux is capped at 2G anyway, so I think all of this
>
> I don't see how it related, you don't have to have a weird
> 00111111b as a special value.
>
>> special meaning of ->len just needs to go away. Just ask for whatever
>> bytes you want, but yes more than 2G will not be supported anyway.
>
> That's not the case here, the request does support more than 2G,
> it's just spread across multiple CQEs, and the limit accounts
> for multiple CQEs.

All pretty moot if we just go with 0 as the "transfer whatever length
you want", as obviously each individual transfer will be limited anyway.
Which is the better choice than having some odd 4G value.
On 2/22/25 01:06, Pavel Begunkov wrote:
> On 2/22/25 00:52, Jens Axboe wrote:
>>>> @@ -1250,6 +1251,12 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>  	zc->ifq = req->ctx->ifq;
>>>>  	if (!zc->ifq)
>>>>  		return -EINVAL;
>>>> +	zc->len = READ_ONCE(sqe->len);
>>>> +	if (zc->len == UINT_MAX)
>>>> +		return -EINVAL;
>>>
>>> The uapi gives u32, if we're using a special value it should
>>> match the type. ~(u32)0
>>
>> Any syscall in Linux is capped at 2G anyway, so I think all of this
>
> I don't see how it related, you don't have to have a weird
> 00111111b as a special value.
>
>> special meaning of ->len just needs to go away. Just ask for whatever

And there will need to be some special value for that. Setting the
maximum limit to 4GB is reasonable, but when it's crunching data
without limits 4GB is nothing. IMHO not worth growing the uapi field
u32 -> u64.

>> bytes you want, but yes more than 2G will not be supported anyway.
>
> That's not the case here, the request does support more than 2G,
> it's just spread across multiple CQEs, and the limit accounts
> for multiple CQEs.
>
On 2/22/25 01:09, Jens Axboe wrote:
> On 2/21/25 6:06 PM, Pavel Begunkov wrote:
>> On 2/22/25 00:52, Jens Axboe wrote:
>>>>> @@ -1250,6 +1251,12 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>  	zc->ifq = req->ctx->ifq;
>>>>>  	if (!zc->ifq)
>>>>>  		return -EINVAL;
>>>>> +	zc->len = READ_ONCE(sqe->len);
>>>>> +	if (zc->len == UINT_MAX)
>>>>> +		return -EINVAL;
>>>>
>>>> The uapi gives u32, if we're using a special value it should
>>>> match the type. ~(u32)0
>>>
>>> Any syscall in Linux is capped at 2G anyway, so I think all of this
>>
>> I don't see how it related, you don't have to have a weird
>> 00111111b as a special value.
>>
>>> special meaning of ->len just needs to go away. Just ask for whatever
>>> bytes you want, but yes more than 2G will not be supported anyway.
>>
>> That's not the case here, the request does support more than 2G,
>> it's just spread across multiple CQEs, and the limit accounts
>> for multiple CQEs.
>
> All pretty moot if we just go with 0 as the "transfer whatever length
> you want", as obviously each individual transfer will be limited anyway.
> Which is the better choice than having some odd 4G value.

I think so as well, and we'd need to check for 0 otherwise. And it's
probably fine to reserve a U32_MAX as above if it has a chance to make
(even if in the future) handling easier, not like prep is performance
sensitive.
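Putting the thread's conclusions together, the prep-side check might end up
looking roughly like the sketch below, with 0 in the uapi meaning "no length
cap" and U32_MAX reserved; this is an assumption about the eventual shape,
not the final version of the patch:

	zc->len = READ_ONCE(sqe->len);
	if (zc->len == U32_MAX)		/* reserved special value */
		return -EINVAL;
	if (!zc->len)			/* 0: no cap on the read length */
		zc->len = UINT_MAX;	/* internal "unlimited" marker */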
Hi,

> -----Original Message-----
> From: David Wei <dw@davidwei.uk>
> Sent: Saturday, February 22, 2025 4:52 AM
> To: io-uring@vger.kernel.org
> Cc: Jens Axboe <axboe@kernel.dk>; Pavel Begunkov <asml.silence@gmail.com>
> Subject: [PATCH v2 1/2] io_uring/zcrx: add single shot recvzc
>
> Currently only multishot recvzc requests are supported, but sometimes
> there is a need to do a single recv e.g. peeking at some data in the
> socket. Add single shot recvzc requests where IORING_RECV_MULTISHOT is
> _not_ set and the sqe->len field is set to the number of bytes to read N.
>
> There could be multiple completions containing data, like the multishot
> case, since N bytes could be split across multiple frags. This is
> followed by a final completion with res and cflags both set to 0 that
> indicate the completion of the request, or a -res that indicate an
> error.
>
> Signed-off-by: David Wei <dw@davidwei.uk>
> ---
>  io_uring/net.c  | 19 +++++++++++++++++--
>  io_uring/zcrx.c | 17 ++++++++++++-----
>  io_uring/zcrx.h |  2 +-
>  3 files changed, 30 insertions(+), 8 deletions(-)
>
> diff --git a/io_uring/net.c b/io_uring/net.c
> index 000dc70d08d0..cae34a24266c 100644
> --- a/io_uring/net.c
> +++ b/io_uring/net.c
> @@ -94,6 +94,7 @@ struct io_recvzc {
>  	struct file		*file;
>  	unsigned		msg_flags;
>  	u16			flags;
> +	u32			len;
>  	struct io_zcrx_ifq	*ifq;
>  };
>
> @@ -1241,7 +1242,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  	unsigned ifq_idx;
>
>  	if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr ||
> -		     sqe->len || sqe->addr3))
> +		     sqe->addr3))
>  		return -EINVAL;
>
>  	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
> @@ -1250,6 +1251,12 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  	zc->ifq = req->ctx->ifq;
>  	if (!zc->ifq)
>  		return -EINVAL;
> +	zc->len = READ_ONCE(sqe->len);
> +	if (zc->len == UINT_MAX)
> +		return -EINVAL;
> +	/* UINT_MAX means no limit on readlen */

What is the difference between unlimited read length and
IO_URING_F_MULTISHOT? In what specific scenarios would unlimited read
length be used?

> +	if (!zc->len)
> +		zc->len = UINT_MAX;
>
>  	zc->flags = READ_ONCE(sqe->ioprio);
>  	zc->msg_flags = READ_ONCE(sqe->msg_flags);
> @@ -1269,6 +1276,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>  {
>  	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
> +	bool limit = zc->len != UINT_MAX;

It is not used.

>  	struct socket *sock;
>  	int ret;
>
> @@ -1281,7 +1289,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>  		return -ENOTSOCK;
>
>  	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
> -			   issue_flags);
> +			   issue_flags, &zc->len);
>  	if (unlikely(ret <= 0) && ret != -EAGAIN) {
>  		if (ret == -ERESTARTSYS)
>  			ret = -EINTR;
> @@ -1296,6 +1304,13 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
>  		return IOU_OK;
>  	}
>
> +	if (zc->len == 0) {
> +		io_req_set_res(req, 0, 0);
> +
> +		if (issue_flags & IO_URING_F_MULTISHOT)
> +			return IOU_STOP_MULTISHOT;
> +		return IOU_OK;
> +	}
>  	if (issue_flags & IO_URING_F_MULTISHOT)
>  		return IOU_ISSUE_SKIP_COMPLETE;
>  	return -EAGAIN;
> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> index f2d326e18e67..74bca4e471bc 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
> @@ -817,6 +817,7 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
>  	int i, copy, end, off;
>  	int ret = 0;
>
> +	len = min_t(size_t, len, desc->count);
>  	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
>  		return -EAGAIN;
>
> @@ -894,26 +895,32 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
>  out:
>  	if (offset == start_off)
>  		return ret;
> +	if (desc->count != UINT_MAX)
> +		desc->count -= (offset - start_off);
>  	return offset - start_off;
>  }
>
>  static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>  			       struct sock *sk, int flags,
> -			       unsigned issue_flags)
> +			       unsigned issue_flags, unsigned int *outlen)
>  {
> +	unsigned int len = *outlen;
> +	bool limit = len != UINT_MAX;
>  	struct io_zcrx_args args = {
>  		.req = req,
>  		.ifq = ifq,
>  		.sock = sk->sk_socket,
>  	};
>  	read_descriptor_t rd_desc = {
> -		.count = 1,
> +		.count = len,
>  		.arg.data = &args,
>  	};
>  	int ret;
>
>  	lock_sock(sk);
>  	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
> +	if (limit && ret)
> +		*outlen = len - ret;
>  	if (ret <= 0) {
>  		if (ret < 0 || sock_flag(sk, SOCK_DONE))
>  			goto out;
> @@ -930,7 +937,7 @@ static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>  		ret = IOU_REQUEUE;
>  	} else if (sock_flag(sk, SOCK_DONE)) {
>  		/* Make it to retry until it finally gets 0. */
> -		if (issue_flags & IO_URING_F_MULTISHOT)
> +		if (!limit && (issue_flags & IO_URING_F_MULTISHOT))
>  			ret = IOU_REQUEUE;
>  		else
>  			ret = -EAGAIN;
> @@ -942,7 +949,7 @@ static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>
>  int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>  		 struct socket *sock, unsigned int flags,
> -		 unsigned issue_flags)
> +		 unsigned issue_flags, unsigned int *len)
>  {
>  	struct sock *sk = sock->sk;
>  	const struct proto *prot = READ_ONCE(sk->sk_prot);
> @@ -951,5 +958,5 @@ int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>  		return -EPROTONOSUPPORT;
>
>  	sock_rps_record_flow(sk);
> -	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags);
> +	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
>  }
> diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
> index a16bdd921f03..1b4042dc48e2 100644
> --- a/io_uring/zcrx.h
> +++ b/io_uring/zcrx.h
> @@ -46,7 +46,7 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
>  void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
>  int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
>  		 struct socket *sock, unsigned int flags,
> -		 unsigned issue_flags);
> +		 unsigned issue_flags, unsigned int *len);
>  #else
>  static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
>  				       struct io_uring_zcrx_ifq_reg __user *arg)
> --
> 2.43.5
>

---
Li Zetao
Currently only multishot recvzc requests are supported, but sometimes
there is a need to do a single recv, e.g. peeking at some data in the
socket. Add single shot recvzc requests where IORING_RECV_MULTISHOT is
_not_ set and the sqe->len field is set to the number of bytes to read,
N.

There could be multiple completions containing data, like the multishot
case, since N bytes could be split across multiple frags. This is
followed by a final completion with res and cflags both set to 0 that
indicates the completion of the request, or a negative res that
indicates an error.

Signed-off-by: David Wei <dw@davidwei.uk>
---
 io_uring/net.c  | 19 +++++++++++++++++--
 io_uring/zcrx.c | 17 ++++++++++++-----
 io_uring/zcrx.h |  2 +-
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index 000dc70d08d0..cae34a24266c 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -94,6 +94,7 @@ struct io_recvzc {
 	struct file		*file;
 	unsigned		msg_flags;
 	u16			flags;
+	u32			len;
 	struct io_zcrx_ifq	*ifq;
 };
 
@@ -1241,7 +1242,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	unsigned ifq_idx;
 
 	if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr ||
-		     sqe->len || sqe->addr3))
+		     sqe->addr3))
 		return -EINVAL;
 
 	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
@@ -1250,6 +1251,12 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	zc->ifq = req->ctx->ifq;
 	if (!zc->ifq)
 		return -EINVAL;
+	zc->len = READ_ONCE(sqe->len);
+	if (zc->len == UINT_MAX)
+		return -EINVAL;
+	/* UINT_MAX means no limit on readlen */
+	if (!zc->len)
+		zc->len = UINT_MAX;
 
 	zc->flags = READ_ONCE(sqe->ioprio);
 	zc->msg_flags = READ_ONCE(sqe->msg_flags);
@@ -1269,6 +1276,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+	bool limit = zc->len != UINT_MAX;
 	struct socket *sock;
 	int ret;
 
@@ -1281,7 +1289,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
 		return -ENOTSOCK;
 
 	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
-			   issue_flags);
+			   issue_flags, &zc->len);
 	if (unlikely(ret <= 0) && ret != -EAGAIN) {
 		if (ret == -ERESTARTSYS)
 			ret = -EINTR;
@@ -1296,6 +1304,13 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
 		return IOU_OK;
 	}
 
+	if (zc->len == 0) {
+		io_req_set_res(req, 0, 0);
+
+		if (issue_flags & IO_URING_F_MULTISHOT)
+			return IOU_STOP_MULTISHOT;
+		return IOU_OK;
+	}
 	if (issue_flags & IO_URING_F_MULTISHOT)
 		return IOU_ISSUE_SKIP_COMPLETE;
 	return -EAGAIN;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index f2d326e18e67..74bca4e471bc 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -817,6 +817,7 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 	int i, copy, end, off;
 	int ret = 0;
 
+	len = min_t(size_t, len, desc->count);
 	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
 		return -EAGAIN;
 
@@ -894,26 +895,32 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 out:
 	if (offset == start_off)
 		return ret;
+	if (desc->count != UINT_MAX)
+		desc->count -= (offset - start_off);
 	return offset - start_off;
 }
 
 static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 			       struct sock *sk, int flags,
-			       unsigned issue_flags)
+			       unsigned issue_flags, unsigned int *outlen)
 {
+	unsigned int len = *outlen;
+	bool limit = len != UINT_MAX;
 	struct io_zcrx_args args = {
 		.req = req,
 		.ifq = ifq,
 		.sock = sk->sk_socket,
 	};
 	read_descriptor_t rd_desc = {
-		.count = 1,
+		.count = len,
 		.arg.data = &args,
 	};
 	int ret;
 
 	lock_sock(sk);
 	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
+	if (limit && ret)
+		*outlen = len - ret;
 	if (ret <= 0) {
 		if (ret < 0 || sock_flag(sk, SOCK_DONE))
 			goto out;
@@ -930,7 +937,7 @@ static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 		ret = IOU_REQUEUE;
 	} else if (sock_flag(sk, SOCK_DONE)) {
 		/* Make it to retry until it finally gets 0. */
-		if (issue_flags & IO_URING_F_MULTISHOT)
+		if (!limit && (issue_flags & IO_URING_F_MULTISHOT))
 			ret = IOU_REQUEUE;
 		else
 			ret = -EAGAIN;
@@ -942,7 +949,7 @@ static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 
 int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 		 struct socket *sock, unsigned int flags,
-		 unsigned issue_flags)
+		 unsigned issue_flags, unsigned int *len)
 {
 	struct sock *sk = sock->sk;
 	const struct proto *prot = READ_ONCE(sk->sk_prot);
@@ -951,5 +958,5 @@ int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 		return -EPROTONOSUPPORT;
 
 	sock_rps_record_flow(sk);
-	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags);
+	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
 }
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index a16bdd921f03..1b4042dc48e2 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -46,7 +46,7 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
 void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
 int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 		 struct socket *sock, unsigned int flags,
-		 unsigned issue_flags);
+		 unsigned issue_flags, unsigned int *len);
 #else
 static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 				       struct io_uring_zcrx_ifq_reg __user *arg)
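For reference, the completion scheme described in the commit message (data
CQEs followed by a terminating CQE with res and cflags both 0, or a negative
res on error) could be consumed from userspace roughly as in the sketch below.
It assumes a ring and a zcrx interface queue are already registered and a
fixed-length recvzc request has been submitted; the liburing calls shown are
standard, but treat the overall flow as an illustration rather than a
documented recipe.

	#include <liburing.h>
	#include <stdio.h>

	/* Drain completions for one fixed-length recvzc request.
	 * Assumes "ring" is initialised and the request was submitted with
	 * sqe->len = N (N > 0) and IORING_RECV_MULTISHOT not set.
	 */
	static int drain_recvzc(struct io_uring *ring)
	{
		struct io_uring_cqe *cqe;
		unsigned long long total = 0;
		int ret;

		for (;;) {
			ret = io_uring_wait_cqe(ring, &cqe);
			if (ret < 0)
				return ret;	/* wait failure, not a CQE */

			if (cqe->res < 0) {
				/* request failed */
				ret = cqe->res;
				io_uring_cqe_seen(ring, cqe);
				return ret;
			}
			if (cqe->res == 0 && cqe->flags == 0) {
				/* terminating completion: the requested bytes
				 * (or EOF) were delivered across the preceding
				 * data CQEs */
				io_uring_cqe_seen(ring, cqe);
				break;
			}

			/* data CQE: cqe->res bytes landed in the registered
			 * zcrx area; the buffer offset is carried in the
			 * extended CQE payload (omitted here for brevity) */
			total += cqe->res;
			io_uring_cqe_seen(ring, cqe);
		}

		printf("received %llu bytes\n", total);
		return 0;
	}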