Message ID | acad81e4-c2ef-59cc-5f0c-33b99082d270@samba.org (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Series | IORING_SEND_NOTIF_REPORT_USAGE (was Re: IORING_CQE_F_COPIED) |

Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Not a local patch |
On 10/20/22 11:04, Stefan Metzmacher wrote: > Hi Pavel, [...] > > So far I came up with a IORING_SEND_NOTIF_REPORT_USAGE opt-in flag > and the reporting is done in cqe.res with IORING_NOTIF_USAGE_ZC_USED (0x00000001) > and/or IORING_NOTIF_USAGE_ZC_COPIED (0x8000000). So the caller is also > able to notice that some parts were able to use zero copy, while other > fragments were copied. Are we really interested in multihoming and probably some very edge cases? I'd argue we're not and it should be a single bool hint indicating whether zc is viable or not. It can do more complex calculations _if_ needed, e.g. looking inside skb's and figure out how many bytes were copied but as for me it should better be turned into a single bool in the end. Could also be the number of bytes copied, but I don't think we can't have the accuracy for that (e.g. what we're going to return if some protocol duplicates an skb and sends to 2 different devices or is processing it in a pipeline?) So the question is what is the use case for having 2 flags? btw, now we've got another example why the report flag is a good idea, we can't use cqe.res unconditionally because we want to have a "one CQE per request" mode, but it's fine if we make it and the report flag mutually exclusive. > I haven't tested it yet, but I want to post it early... > > What do you think? Keeping in mind potential backporting let's make it as simple and short as possible first and then do optimisations on top. > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index ab7458033ee3..751fc4eff8d1 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -296,10 +296,28 @@ enum io_uring_op { > * > * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in > * the buf_index field. > + * > + * IORING_SEND_NOTIF_REPORT_USAGE > + * If SEND[MSG]_ZC should report > + * the zerocopy usage in cqe.res > + * for the IORING_CQE_F_NOTIF cqe. > + * IORING_NOTIF_USAGE_ZC_USED if zero copy was used > + * (at least partially). > + * IORING_NOTIF_USAGE_ZC_COPIED if data was copied > + * (at least partially). > */ > #define IORING_RECVSEND_POLL_FIRST (1U << 0) > #define IORING_RECV_MULTISHOT (1U << 1) > #define IORING_RECVSEND_FIXED_BUF (1U << 2) > +#define IORING_SEND_NOTIF_REPORT_USAGE (1U << 3) > + > +/* > + * cqe.res for IORING_CQE_F_NOTIF if > + * IORING_SEND_NOTIF_REPORT_USAGE was requested > + */ > +#define IORING_NOTIF_USAGE_ZC_USED (1U << 0) > +#define IORING_NOTIF_USAGE_ZC_COPIED (1U << 31) > + > > /* > * accept flags stored in sqe->ioprio > diff --git a/io_uring/net.c b/io_uring/net.c > index 735eec545115..a79d7d349e19 100644 > --- a/io_uring/net.c > +++ b/io_uring/net.c > @@ -946,9 +946,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > > zc->flags = READ_ONCE(sqe->ioprio); > if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | > - IORING_RECVSEND_FIXED_BUF)) > + IORING_RECVSEND_FIXED_BUF | > + IORING_SEND_NOTIF_REPORT_USAGE)) > return -EINVAL; > - notif = zc->notif = io_alloc_notif(ctx); > + notif = zc->notif = io_alloc_notif(ctx, > + zc->flags & IORING_SEND_NOTIF_REPORT_USAGE); > if (!notif) > return -ENOMEM; > notif->cqe.user_data = req->cqe.user_data; > diff --git a/io_uring/notif.c b/io_uring/notif.c > index e37c6569d82e..3844e3c8ad7e 100644 > --- a/io_uring/notif.c > +++ b/io_uring/notif.c > @@ -3,13 +3,14 @@ > #include <linux/file.h> > #include <linux/slab.h> > #include <linux/net.h> > +#include <linux/errqueue.h> Is it needed? 
> #include <linux/io_uring.h> > > #include "io_uring.h" > #include "notif.h" > #include "rsrc.h" > > -static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) > +static inline void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) Let's remove this hunk with inlining and do it later > { > struct io_notif_data *nd = io_notif_to_data(notif); > struct io_ring_ctx *ctx = notif->ctx; > @@ -21,20 +22,46 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) > io_req_task_complete(notif, locked); > } > > -static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, > - struct ubuf_info *uarg, > - bool success) > +static inline void io_uring_tx_zerocopy_callback(struct sk_buff *skb, > + struct ubuf_info *uarg, > + bool success) This one as well. > { > struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); > struct io_kiocb *notif = cmd_to_io_kiocb(nd); > > if (refcount_dec_and_test(&uarg->refcnt)) { > - notif->io_task_work.func = __io_notif_complete_tw; > io_req_task_work_add(notif); > } > } > > -struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) > +static void __io_notif_complete_tw_report_usage(struct io_kiocb *notif, bool *locked) Just shove all that into __io_notif_complete_tw(). > +{ > + struct io_notif_data *nd = io_notif_to_data(notif); > + > + if (likely(nd->zc_used)) > + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_USED; > + > + if (unlikely(nd->zc_copied)) > + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; > + > + __io_notif_complete_tw(notif, locked); > +} > + > +static void io_uring_tx_zerocopy_callback_report_usage(struct sk_buff *skb, > + struct ubuf_info *uarg, > + bool success) > +{ > + struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); > + > + if (success && !nd->zc_used && skb) > + nd->zc_used = true; > + else if (unlikely(!success && !nd->zc_copied)) > + nd->zc_copied = true; It's fine but racy, so let's WRITE_ONCE() to indicate it. > + > + io_uring_tx_zerocopy_callback(skb, uarg, success); > +} > + > +struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, bool report_usage) > __must_hold(&ctx->uring_lock) And it's better to kill this argument and init zc_used/copied unconditionally. 
> { > struct io_kiocb *notif; > @@ -54,7 +81,14 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) > nd = io_notif_to_data(notif); > nd->account_pages = 0; > nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; > - nd->uarg.callback = io_uring_tx_zerocopy_callback; > + if (report_usage) { > + nd->zc_used = nd->zc_copied = false; > + nd->uarg.callback = io_uring_tx_zerocopy_callback_report_usage; > + notif->io_task_work.func = __io_notif_complete_tw_report_usage; > + } else { > + nd->uarg.callback = io_uring_tx_zerocopy_callback; > + notif->io_task_work.func = __io_notif_complete_tw; > + } > refcount_set(&nd->uarg.refcnt, 1); > return notif; > } > @@ -66,7 +100,6 @@ void io_notif_flush(struct io_kiocb *notif) > > /* drop slot's master ref */ > if (refcount_dec_and_test(&nd->uarg.refcnt)) { > - notif->io_task_work.func = __io_notif_complete_tw; > io_req_task_work_add(notif); > } > } > diff --git a/io_uring/notif.h b/io_uring/notif.h > index 5b4d710c8ca5..5ac7a2745e52 100644 > --- a/io_uring/notif.h > +++ b/io_uring/notif.h > @@ -13,10 +13,12 @@ struct io_notif_data { > struct file *file; > struct ubuf_info uarg; > unsigned long account_pages; > + bool zc_used; > + bool zc_copied; IIRC io_notif_data is fully packed in 6.1, so placing zc_{used,copied} there might complicate backporting (if any). We can place them in io_kiocb directly and move in 6.2. Alternatively account_pages doesn't have to be long. > }; > > void io_notif_flush(struct io_kiocb *notif); > -struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); > +struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, bool report_usage); > > static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) > { >
Hi Pavel, >> So far I came up with a IORING_SEND_NOTIF_REPORT_USAGE opt-in flag >> and the reporting is done in cqe.res with IORING_NOTIF_USAGE_ZC_USED (0x00000001) >> and/or IORING_NOTIF_USAGE_ZC_COPIED (0x8000000). So the caller is also >> able to notice that some parts were able to use zero copy, while other >> fragments were copied. > > Are we really interested in multihoming and probably some very edge cases? > I'd argue we're not and it should be a single bool hint indicating whether > zc is viable or not. It can do more complex calculations _if_ needed, e.g. > looking inside skb's and figure out how many bytes were copied but as for me > it should better be turned into a single bool in the end. Could also be the > number of bytes copied, but I don't think we can't have the accuracy for > that (e.g. what we're going to return if some protocol duplicates an skb > and sends to 2 different devices or is processing it in a pipeline?) > > So the question is what is the use case for having 2 flags? It's mostly for debugging. > btw, now we've got another example why the report flag is a good idea, I don't understand that line... > we can't use cqe.res unconditionally because we want to have a "one CQE > per request" mode, but it's fine if we make it and the report flag > mutually exclusive. You mean we can add an optimized case where SEND[MSG]_ZC would not generate F_MORE and skips F_NOTIF, because we copied or the transmission path was really fast? Then I'd move to IORING_CQE_F_COPIED again... >> I haven't tested it yet, but I want to post it early... >> >> What do you think? > > Keeping in mind potential backporting let's make it as simple and > short as possible first and then do optimisations on top. ok. >> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >> index ab7458033ee3..751fc4eff8d1 100644 >> --- a/include/uapi/linux/io_uring.h >> +++ b/include/uapi/linux/io_uring.h >> @@ -296,10 +296,28 @@ enum io_uring_op { >> * >> * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in >> * the buf_index field. >> + * >> + * IORING_SEND_NOTIF_REPORT_USAGE >> + * If SEND[MSG]_ZC should report >> + * the zerocopy usage in cqe.res >> + * for the IORING_CQE_F_NOTIF cqe. >> + * IORING_NOTIF_USAGE_ZC_USED if zero copy was used >> + * (at least partially). >> + * IORING_NOTIF_USAGE_ZC_COPIED if data was copied >> + * (at least partially). 
>> */ >> #define IORING_RECVSEND_POLL_FIRST (1U << 0) >> #define IORING_RECV_MULTISHOT (1U << 1) >> #define IORING_RECVSEND_FIXED_BUF (1U << 2) >> +#define IORING_SEND_NOTIF_REPORT_USAGE (1U << 3) >> + >> +/* >> + * cqe.res for IORING_CQE_F_NOTIF if >> + * IORING_SEND_NOTIF_REPORT_USAGE was requested >> + */ >> +#define IORING_NOTIF_USAGE_ZC_USED (1U << 0) >> +#define IORING_NOTIF_USAGE_ZC_COPIED (1U << 31) >> + >> >> /* >> * accept flags stored in sqe->ioprio >> diff --git a/io_uring/net.c b/io_uring/net.c >> index 735eec545115..a79d7d349e19 100644 >> --- a/io_uring/net.c >> +++ b/io_uring/net.c >> @@ -946,9 +946,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) >> >> zc->flags = READ_ONCE(sqe->ioprio); >> if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | >> - IORING_RECVSEND_FIXED_BUF)) >> + IORING_RECVSEND_FIXED_BUF | >> + IORING_SEND_NOTIF_REPORT_USAGE)) >> return -EINVAL; >> - notif = zc->notif = io_alloc_notif(ctx); >> + notif = zc->notif = io_alloc_notif(ctx, >> + zc->flags & IORING_SEND_NOTIF_REPORT_USAGE); >> if (!notif) >> return -ENOMEM; >> notif->cqe.user_data = req->cqe.user_data; >> diff --git a/io_uring/notif.c b/io_uring/notif.c >> index e37c6569d82e..3844e3c8ad7e 100644 >> --- a/io_uring/notif.c >> +++ b/io_uring/notif.c >> @@ -3,13 +3,14 @@ >> #include <linux/file.h> >> #include <linux/slab.h> >> #include <linux/net.h> >> +#include <linux/errqueue.h> > > Is it needed? No >> #include <linux/io_uring.h> >> >> #include "io_uring.h" >> #include "notif.h" >> #include "rsrc.h" >> >> -static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) >> +static inline void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) > > Let's remove this hunk with inlining and do it later > >> { >> struct io_notif_data *nd = io_notif_to_data(notif); >> struct io_ring_ctx *ctx = notif->ctx; >> @@ -21,20 +22,46 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) >> io_req_task_complete(notif, locked); >> } >> >> -static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, >> - struct ubuf_info *uarg, >> - bool success) >> +static inline void io_uring_tx_zerocopy_callback(struct sk_buff *skb, >> + struct ubuf_info *uarg, >> + bool success) > > This one as well. > > >> { >> struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); >> struct io_kiocb *notif = cmd_to_io_kiocb(nd); >> >> if (refcount_dec_and_test(&uarg->refcnt)) { >> - notif->io_task_work.func = __io_notif_complete_tw; >> io_req_task_work_add(notif); >> } >> } >> >> -struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) >> +static void __io_notif_complete_tw_report_usage(struct io_kiocb *notif, bool *locked) > > Just shove all that into __io_notif_complete_tw(). Ok, and then optimze later? Otherwise we could have IORING_CQE_F_COPIED by default without opt-in flag... >> +static void io_uring_tx_zerocopy_callback_report_usage(struct sk_buff *skb, >> + struct ubuf_info *uarg, >> + bool success) >> +{ >> + struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); >> + >> + if (success && !nd->zc_used && skb) >> + nd->zc_used = true; >> + else if (unlikely(!success && !nd->zc_copied)) >> + nd->zc_copied = true; > > It's fine but racy, so let's WRITE_ONCE() to indicate it. I don't see how this could be a problem, but I can add it. 
>> diff --git a/io_uring/notif.h b/io_uring/notif.h >> index 5b4d710c8ca5..5ac7a2745e52 100644 >> --- a/io_uring/notif.h >> +++ b/io_uring/notif.h >> @@ -13,10 +13,12 @@ struct io_notif_data { >> struct file *file; >> struct ubuf_info uarg; >> unsigned long account_pages; >> + bool zc_used; >> + bool zc_copied; > > IIRC io_notif_data is fully packed in 6.1, so placing zc_{used,copied} > there might complicate backporting (if any). We can place them in io_kiocb > directly and move in 6.2. Alternatively account_pages doesn't have to be > long. As far as I can see kernel-dk-block/io_uring-6.1 alread has your shrink patches included...
On 10/20/22 15:51, Stefan Metzmacher wrote: > Hi Pavel, > >>> So far I came up with a IORING_SEND_NOTIF_REPORT_USAGE opt-in flag >>> and the reporting is done in cqe.res with IORING_NOTIF_USAGE_ZC_USED (0x00000001) >>> and/or IORING_NOTIF_USAGE_ZC_COPIED (0x8000000). So the caller is also >>> able to notice that some parts were able to use zero copy, while other >>> fragments were copied. >> >> Are we really interested in multihoming and probably some very edge cases? >> I'd argue we're not and it should be a single bool hint indicating whether >> zc is viable or not. It can do more complex calculations _if_ needed, e.g. >> looking inside skb's and figure out how many bytes were copied but as for me >> it should better be turned into a single bool in the end. Could also be the >> number of bytes copied, but I don't think we can't have the accuracy for >> that (e.g. what we're going to return if some protocol duplicates an skb >> and sends to 2 different devices or is processing it in a pipeline?) >> >> So the question is what is the use case for having 2 flags? > > It's mostly for debugging. Ok, than it sounds like we don't need it. >> btw, now we've got another example why the report flag is a good idea, > > I don't understand that line... I'm just telling that IORING_SEND_NOTIF_* instead of unconditional reporting is more flexible and extendible from the uapi perspective. >> we can't use cqe.res unconditionally because we want to have a "one CQE >> per request" mode, but it's fine if we make it and the report flag >> mutually exclusive. > > You mean we can add an optimized case where SEND[MSG]_ZC would not > generate F_MORE and skips F_NOTIF, because we copied or the transmission > path was really fast? It is rather about optionally omitting the first (aka completion) cqe and posting only the notification cqe, which makes a lot of sense for UDP and some TCP use cases. > Then I'd move to IORING_CQE_F_COPIED again... [...] >>> -struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) >>> +static void __io_notif_complete_tw_report_usage(struct io_kiocb *notif, bool *locked) >> >> Just shove all that into __io_notif_complete_tw(). > > Ok, and then optimze later? Right, I'm just tired of back porting patches by hand :) > Otherwise we could have IORING_CQE_F_COPIED by default without opt-in > flag... > >>> +static void io_uring_tx_zerocopy_callback_report_usage(struct sk_buff *skb, >>> + struct ubuf_info *uarg, >>> + bool success) >>> +{ >>> + struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); >>> + >>> + if (success && !nd->zc_used && skb) >>> + nd->zc_used = true; >>> + else if (unlikely(!success && !nd->zc_copied)) >>> + nd->zc_copied = true; >> >> It's fine but racy, so let's WRITE_ONCE() to indicate it. > > I don't see how this could be a problem, but I can add it. It's not a problem, but better to be a little be more explicit about parallel writes. >>> diff --git a/io_uring/notif.h b/io_uring/notif.h >>> index 5b4d710c8ca5..5ac7a2745e52 100644 >>> --- a/io_uring/notif.h >>> +++ b/io_uring/notif.h >>> @@ -13,10 +13,12 @@ struct io_notif_data { >>> struct file *file; >>> struct ubuf_info uarg; >>> unsigned long account_pages; >>> + bool zc_used; >>> + bool zc_copied; >> >> IIRC io_notif_data is fully packed in 6.1, so placing zc_{used,copied} >> there might complicate backporting (if any). We can place them in io_kiocb >> directly and move in 6.2. Alternatively account_pages doesn't have to be >> long. 
>
> As far as I can see kernel-dk-block/io_uring-6.1 alread has your
> shrink patches included...

Sorry, I mean 6.0
Hi Pavel, >>>> So far I came up with a IORING_SEND_NOTIF_REPORT_USAGE opt-in flag >>>> and the reporting is done in cqe.res with IORING_NOTIF_USAGE_ZC_USED (0x00000001) >>>> and/or IORING_NOTIF_USAGE_ZC_COPIED (0x8000000). So the caller is also >>>> able to notice that some parts were able to use zero copy, while other >>>> fragments were copied. >>> >>> Are we really interested in multihoming and probably some very edge cases? >>> I'd argue we're not and it should be a single bool hint indicating whether >>> zc is viable or not. It can do more complex calculations _if_ needed, e.g. >>> looking inside skb's and figure out how many bytes were copied but as for me >>> it should better be turned into a single bool in the end. Could also be the >>> number of bytes copied, but I don't think we can't have the accuracy for >>> that (e.g. what we're going to return if some protocol duplicates an skb >>> and sends to 2 different devices or is processing it in a pipeline?) >>> >>> So the question is what is the use case for having 2 flags? >> >> It's mostly for debugging. > > Ok, than it sounds like we don't need it. Maybe I could add some trace points to the callback? >>> btw, now we've got another example why the report flag is a good idea, >> >> I don't understand that line... > > I'm just telling that IORING_SEND_NOTIF_* instead of unconditional reporting > is more flexible and extendible from the uapi perspective. ok >>> we can't use cqe.res unconditionally because we want to have a "one CQE >>> per request" mode, but it's fine if we make it and the report flag >>> mutually exclusive. >> >> You mean we can add an optimized case where SEND[MSG]_ZC would not >> generate F_MORE and skips F_NOTIF, because we copied or the transmission >> path was really fast? > > It is rather about optionally omitting the first (aka completion) cqe and > posting only the notification cqe, which makes a lot of sense for UDP and > some TCP use cases. OK. >> Then I'd move to IORING_CQE_F_COPIED again... > [...] >>>> -struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) >>>> +static void __io_notif_complete_tw_report_usage(struct io_kiocb *notif, bool *locked) >>> >>> Just shove all that into __io_notif_complete_tw(). >> >> Ok, and then optimze later? > > Right, I'm just tired of back porting patches by hand :) ok, I just assumed it would be 6.1 only. >> Otherwise we could have IORING_CQE_F_COPIED by default without opt-in >> flag... Do you still want an opt-in flag to get IORING_CQE_F_COPIED? If so what name do you want it to be? >>>> +static void io_uring_tx_zerocopy_callback_report_usage(struct sk_buff *skb, >>>> + struct ubuf_info *uarg, >>>> + bool success) >>>> +{ >>>> + struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); >>>> + >>>> + if (success && !nd->zc_used && skb) >>>> + nd->zc_used = true; >>>> + else if (unlikely(!success && !nd->zc_copied)) >>>> + nd->zc_copied = true; >>> >>> It's fine but racy, so let's WRITE_ONCE() to indicate it. >> >> I don't see how this could be a problem, but I can add it. > > It's not a problem, but better to be a little be more explicit > about parallel writes. ok. 
>>>> diff --git a/io_uring/notif.h b/io_uring/notif.h >>>> index 5b4d710c8ca5..5ac7a2745e52 100644 >>>> --- a/io_uring/notif.h >>>> +++ b/io_uring/notif.h >>>> @@ -13,10 +13,12 @@ struct io_notif_data { >>>> struct file *file; >>>> struct ubuf_info uarg; >>>> unsigned long account_pages; >>>> + bool zc_used; >>>> + bool zc_copied; >>> >>> IIRC io_notif_data is fully packed in 6.1, so placing zc_{used,copied} >>> there might complicate backporting (if any). We can place them in io_kiocb >>> directly and move in 6.2. Alternatively account_pages doesn't have to be >>> long. >> >> As far as I can see kernel-dk-block/io_uring-6.1 alread has your >> shrink patches included... > > Sorry, I mean 6.0 So you want to backport to 6.0? Find the current version below, sizeof(struct io_kiocb) will grow from 3*64 + 24 to 3*64 + 32 (on x64_64) to it stays within 4 cache lines. I tried this first: union { u8 iopoll_completed; struct { u8 zc_used:1; u8 zc_copied:1; }; }; But then WRITE_ONCE() complains about a bitfield write. So let me now about the opt-in flag and I'll prepare real commits including a patch that moves from struct io_kiocb to struct io_notif_data on top. metze diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index f5b687a787a3..189152ad78d6 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -515,6 +515,9 @@ struct io_kiocb { u8 opcode; /* polled IO has completed */ u8 iopoll_completed; + /* these will be moved to struct io_notif_data in 6.1 */ + bool zc_used; + bool zc_copied; /* * Can be either a fixed buffer index, or used with provided buffers. * For the latter, before issue it points to the buffer group ID, diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ab7458033ee3..738d6234d1d9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -350,6 +350,7 @@ struct io_uring_cqe { #define IORING_CQE_F_MORE (1U << 1) #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) #define IORING_CQE_F_NOTIF (1U << 3) +#define IORING_CQE_F_COPIED (1U << 4) enum { IORING_CQE_BUFFER_SHIFT = 16, diff --git a/io_uring/notif.c b/io_uring/notif.c index e37c6569d82e..033aca064b10 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -18,6 +18,10 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; } + + if (notif->zc_copied || !notif->zc_used) + notif->cqe.flags |= IORING_CQE_F_COPIED; + io_req_task_complete(notif, locked); } @@ -28,6 +32,11 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); struct io_kiocb *notif = cmd_to_io_kiocb(nd); + if (success && !notif->zc_used && skb) + WRITE_ONCE(notif->zc_used, true); + else if (!success && !notif->zc_copied) + WRITE_ONCE(notif->zc_copied, true); + if (refcount_dec_and_test(&uarg->refcnt)) { notif->io_task_work.func = __io_notif_complete_tw; io_req_task_work_add(notif); @@ -55,6 +64,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) nd->account_pages = 0; nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; nd->uarg.callback = io_uring_tx_zerocopy_callback; + notif->zc_used = notif->zc_copied = false; refcount_set(&nd->uarg.refcnt, 1); return notif; }
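As an aside for readers following the WRITE_ONCE() discussion above: below is a minimal sketch, with made-up names that are not from any of the patches, of the pattern that ends up being used. Plain bool flags work where the bitfield attempt does not, because WRITE_ONCE() takes the address of its argument and a bitfield has no address; the stores may race between callbacks, but they only ever flip a flag from false to true.

/*
 * Illustrative only -- struct and function names are not from the patch.
 * The callback can run concurrently for different skbs of the same
 * notification; every writer only ever sets a flag to true, so the
 * outcome is order-independent, and WRITE_ONCE() documents the
 * intentional racy store (and rules out store tearing).
 */
#include <linux/compiler.h>
#include <linux/types.h>

struct zc_usage {
	bool zc_used;	/* at least one fragment went out zero-copy */
	bool zc_copied;	/* at least one fragment was copied */
};

static void zc_usage_record(struct zc_usage *u, bool success)
{
	if (success && !u->zc_used)
		WRITE_ONCE(u->zc_used, true);
	else if (!success && !u->zc_copied)
		WRITE_ONCE(u->zc_copied, true);
}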
On 10/21/22 10:36, Stefan Metzmacher wrote: > Hi Pavel, [...] >> Right, I'm just tired of back porting patches by hand :) > > ok, I just assumed it would be 6.1 only. I'm fine with 6.1 only, it'd make things easier. I thought from your first postings you wanted it 6.0. Then we don't need to care about the placing of the copied/used flags. >>> Otherwise we could have IORING_CQE_F_COPIED by default without opt-in >>> flag... > > Do you still want an opt-in flag to get IORING_CQE_F_COPIED? > If so what name do you want it to be? Ala a IORING_SEND_* flag? Yes please. *_REPORT_USAGE was fine but I'd make it IORING_SEND_ZC_REPORT_USAGE. And can be extended if there is more info needed in the future. And I don't mind using a bit in cqe->res, makes cflags less polluted. >>>>> +static void io_uring_tx_zerocopy_callback_report_usage(struct sk_buff *skb, >>>>> + struct ubuf_info *uarg, >>>>> + bool success) >>>>> +{ >>>>> + struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); >>>>> + >>>>> + if (success && !nd->zc_used && skb) >>>>> + nd->zc_used = true; >>>>> + else if (unlikely(!success && !nd->zc_copied)) >>>>> + nd->zc_copied = true; >>>> >>>> It's fine but racy, so let's WRITE_ONCE() to indicate it. >>> >>> I don't see how this could be a problem, but I can add it. >> >> It's not a problem, but better to be a little be more explicit >> about parallel writes. > > ok. > >>>>> diff --git a/io_uring/notif.h b/io_uring/notif.h >>>>> index 5b4d710c8ca5..5ac7a2745e52 100644 >>>>> --- a/io_uring/notif.h >>>>> +++ b/io_uring/notif.h >>>>> @@ -13,10 +13,12 @@ struct io_notif_data { >>>>> struct file *file; >>>>> struct ubuf_info uarg; >>>>> unsigned long account_pages; >>>>> + bool zc_used; >>>>> + bool zc_copied; >>>> >>>> IIRC io_notif_data is fully packed in 6.1, so placing zc_{used,copied} >>>> there might complicate backporting (if any). We can place them in io_kiocb >>>> directly and move in 6.2. Alternatively account_pages doesn't have to be >>>> long. >>> >>> As far as I can see kernel-dk-block/io_uring-6.1 alread has your >>> shrink patches included... >> >> Sorry, I mean 6.0 > > So you want to backport to 6.0? > > Find the current version below, sizeof(struct io_kiocb) will grow from > 3*64 + 24 to 3*64 + 32 (on x64_64) to it stays within 4 cache lines. > > I tried this first: > > union { > u8 iopoll_completed; > struct { > u8 zc_used:1; > u8 zc_copied:1; > }; > }; > > But then WRITE_ONCE() complains about a bitfield write. rightfully so, it can't be a bitfield as it would be racy and not only in theory this time. > So let me now about the opt-in flag and I'll prepare real commits > including a patch that moves from struct io_kiocb to struct io_notif_data > on top. Yeah, better to be opt-in, but apart from it and comments above looks good. > diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h > index f5b687a787a3..189152ad78d6 100644 > --- a/include/linux/io_uring_types.h > +++ b/include/linux/io_uring_types.h > @@ -515,6 +515,9 @@ struct io_kiocb { > u8 opcode; > /* polled IO has completed */ > u8 iopoll_completed; > + /* these will be moved to struct io_notif_data in 6.1 */ > + bool zc_used; > + bool zc_copied; > /* > * Can be either a fixed buffer index, or used with provided buffers. 
> * For the latter, before issue it points to the buffer group ID, > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index ab7458033ee3..738d6234d1d9 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -350,6 +350,7 @@ struct io_uring_cqe { > #define IORING_CQE_F_MORE (1U << 1) > #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) > #define IORING_CQE_F_NOTIF (1U << 3) > +#define IORING_CQE_F_COPIED (1U << 4) > > enum { > IORING_CQE_BUFFER_SHIFT = 16, > diff --git a/io_uring/notif.c b/io_uring/notif.c > index e37c6569d82e..033aca064b10 100644 > --- a/io_uring/notif.c > +++ b/io_uring/notif.c > @@ -18,6 +18,10 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) > __io_unaccount_mem(ctx->user, nd->account_pages); > nd->account_pages = 0; > } > + > + if (notif->zc_copied || !notif->zc_used) > + notif->cqe.flags |= IORING_CQE_F_COPIED; > + As discussed above, should depend on IORING_SEND_ZC_REPORT_USAGE > io_req_task_complete(notif, locked); > } > > @@ -28,6 +32,11 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, > struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); > struct io_kiocb *notif = cmd_to_io_kiocb(nd); > > + if (success && !notif->zc_used && skb) > + WRITE_ONCE(notif->zc_used, true); > + else if (!success && !notif->zc_copied) > + WRITE_ONCE(notif->zc_copied, true); > + > if (refcount_dec_and_test(&uarg->refcnt)) { > notif->io_task_work.func = __io_notif_complete_tw; > io_req_task_work_add(notif); > @@ -55,6 +64,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) > nd->account_pages = 0; > nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; > nd->uarg.callback = io_uring_tx_zerocopy_callback; > + notif->zc_used = notif->zc_copied = false; > refcount_set(&nd->uarg.refcnt, 1); > return notif; > } >
Am 21.10.22 um 13:09 schrieb Pavel Begunkov: > On 10/21/22 10:36, Stefan Metzmacher wrote: >> Hi Pavel, > [...] >>> Right, I'm just tired of back porting patches by hand :) >> >> ok, I just assumed it would be 6.1 only. > > I'm fine with 6.1 only, it'd make things easier. I thought from > your first postings you wanted it 6.0. Then we don't need to care > about the placing of the copied/used flags. > >>>> Otherwise we could have IORING_CQE_F_COPIED by default without opt-in >>>> flag... >> >> Do you still want an opt-in flag to get IORING_CQE_F_COPIED? >> If so what name do you want it to be? > > Ala a IORING_SEND_* flag? Yes please. > > *_REPORT_USAGE was fine but I'd make it IORING_SEND_ZC_REPORT_USAGE. > And can be extended if there is more info needed in the future. > > And I don't mind using a bit in cqe->res, makes cflags less polluted. So no worries about the delayed/skip sendmsg completion anymore? Should I define it like this, ok? #define IORING_NOTIF_USAGE_ZC_COPIED (1U << 31) See the full patch below... metze diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index d69ae7eba773..32e1f2a55b70 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -296,10 +296,24 @@ enum io_uring_op { * * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in * the buf_index field. + + * IORING_SEND_NOTIF_REPORT_USAGE + * If SEND[MSG]_ZC should report + * the zerocopy usage in cqe.res + * for the IORING_CQE_F_NOTIF cqe. + * IORING_NOTIF_USAGE_ZC_COPIED if data was copied + * (at least partially). */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) #define IORING_RECVSEND_FIXED_BUF (1U << 2) +#define IORING_SEND_ZC_REPORT_USAGE (1U << 3) + +/* + * cqe.res for IORING_CQE_F_NOTIF if + * IORING_SEND_ZC_REPORT_USAGE was requested + */ +#define IORING_NOTIF_USAGE_ZC_COPIED (1U << 31) /* * accept flags stored in sqe->ioprio diff --git a/io_uring/net.c b/io_uring/net.c index 56078f47efe7..1aa3b50b3e82 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -939,7 +939,8 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->flags = READ_ONCE(sqe->ioprio); if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | - IORING_RECVSEND_FIXED_BUF)) + IORING_RECVSEND_FIXED_BUF | + IORING_SEND_ZC_REPORT_USAGE)) return -EINVAL; notif = zc->notif = io_alloc_notif(ctx); if (!notif) @@ -957,6 +958,9 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu = READ_ONCE(ctx->user_bufs[idx]); io_req_set_rsrc_node(notif, ctx, 0); } + if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { + io_notif_to_data(notif)->zc_report = true; + } if (req->opcode == IORING_OP_SEND_ZC) { if (READ_ONCE(sqe->__pad3[0])) diff --git a/io_uring/notif.c b/io_uring/notif.c index e37c6569d82e..4bfef10161fa 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -18,6 +18,10 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; } + + if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + io_req_task_complete(notif, locked); } @@ -28,6 +32,13 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); struct io_kiocb *notif = cmd_to_io_kiocb(nd); + if (nd->zc_report) { + if (success && !nd->zc_used && skb) + WRITE_ONCE(nd->zc_used, true); + else if (!success && !nd->zc_copied) + 
WRITE_ONCE(nd->zc_copied, true); + } + if (refcount_dec_and_test(&uarg->refcnt)) { notif->io_task_work.func = __io_notif_complete_tw; io_req_task_work_add(notif); @@ -55,6 +66,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) nd->account_pages = 0; nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; nd->uarg.callback = io_uring_tx_zerocopy_callback; + nd->zc_report = nd->zc_used = nd->zc_copied = false; refcount_set(&nd->uarg.refcnt, 1); return notif; } diff --git a/io_uring/notif.h b/io_uring/notif.h index e4fbcae0f3fd..6be2e5ae8581 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -15,6 +15,9 @@ struct io_notif_data { struct file *file; struct ubuf_info uarg; unsigned long account_pages; + bool zc_report; + bool zc_used; + bool zc_copied; }; void io_notif_flush(struct io_kiocb *notif);
Hi Pavel,

>> Ala a IORING_SEND_* flag? Yes please.
>>
>> *_REPORT_USAGE was fine but I'd make it IORING_SEND_ZC_REPORT_USAGE.
>> And can be extended if there is more info needed in the future.
>>
>> And I don't mind using a bit in cqe->res, makes cflags less polluted.
>
> So no worries about the delayed/skip sendmsg completion anymore?
>
> Should I define it like this, ok?
>
> #define IORING_NOTIF_USAGE_ZC_COPIED (1U << 31)
>
> See the full patch below...

Apart from still having IORING_SEND_NOTIF_REPORT_USAGE in the comment...
(which I'll fix...)

Is this now fine for you? Then I would post a real patch.

Thanks!
metze
On 10/21/22 15:03, Stefan Metzmacher wrote: > Am 21.10.22 um 13:09 schrieb Pavel Begunkov: >> On 10/21/22 10:36, Stefan Metzmacher wrote: >>> Hi Pavel, >> [...] >>>> Right, I'm just tired of back porting patches by hand :) >>> >>> ok, I just assumed it would be 6.1 only. >> >> I'm fine with 6.1 only, it'd make things easier. I thought from >> your first postings you wanted it 6.0. Then we don't need to care >> about the placing of the copied/used flags. >> >>>>> Otherwise we could have IORING_CQE_F_COPIED by default without opt-in >>>>> flag... >>> >>> Do you still want an opt-in flag to get IORING_CQE_F_COPIED? >>> If so what name do you want it to be? >> >> Ala a IORING_SEND_* flag? Yes please. >> >> *_REPORT_USAGE was fine but I'd make it IORING_SEND_ZC_REPORT_USAGE. >> And can be extended if there is more info needed in the future. >> >> And I don't mind using a bit in cqe->res, makes cflags less polluted. > > So no worries about the delayed/skip sendmsg completion anymore? I'll just make it incompatible the reporting. > Should I define it like this, ok? > > #define IORING_NOTIF_USAGE_ZC_COPIED (1U << 31) > > See the full patch below... Looks good > metze > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index d69ae7eba773..32e1f2a55b70 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -296,10 +296,24 @@ enum io_uring_op { > * > * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in > * the buf_index field. > + > + * IORING_SEND_NOTIF_REPORT_USAGE > + * If SEND[MSG]_ZC should report > + * the zerocopy usage in cqe.res > + * for the IORING_CQE_F_NOTIF cqe. > + * IORING_NOTIF_USAGE_ZC_COPIED if data was copied > + * (at least partially). > */ > #define IORING_RECVSEND_POLL_FIRST (1U << 0) > #define IORING_RECV_MULTISHOT (1U << 1) > #define IORING_RECVSEND_FIXED_BUF (1U << 2) > +#define IORING_SEND_ZC_REPORT_USAGE (1U << 3) > + > +/* > + * cqe.res for IORING_CQE_F_NOTIF if > + * IORING_SEND_ZC_REPORT_USAGE was requested > + */ > +#define IORING_NOTIF_USAGE_ZC_COPIED (1U << 31) > > /* > * accept flags stored in sqe->ioprio > diff --git a/io_uring/net.c b/io_uring/net.c > index 56078f47efe7..1aa3b50b3e82 100644 > --- a/io_uring/net.c > +++ b/io_uring/net.c > @@ -939,7 +939,8 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > > zc->flags = READ_ONCE(sqe->ioprio); > if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | > - IORING_RECVSEND_FIXED_BUF)) > + IORING_RECVSEND_FIXED_BUF | > + IORING_SEND_ZC_REPORT_USAGE)) > return -EINVAL; > notif = zc->notif = io_alloc_notif(ctx); > if (!notif) > @@ -957,6 +958,9 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > req->imu = READ_ONCE(ctx->user_bufs[idx]); > io_req_set_rsrc_node(notif, ctx, 0); > } > + if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { > + io_notif_to_data(notif)->zc_report = true; > + } > > if (req->opcode == IORING_OP_SEND_ZC) { > if (READ_ONCE(sqe->__pad3[0])) > diff --git a/io_uring/notif.c b/io_uring/notif.c > index e37c6569d82e..4bfef10161fa 100644 > --- a/io_uring/notif.c > +++ b/io_uring/notif.c > @@ -18,6 +18,10 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) > __io_unaccount_mem(ctx->user, nd->account_pages); > nd->account_pages = 0; > } > + > + if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) > + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; > + > io_req_task_complete(notif, locked); > } > > @@ -28,6 +32,13 @@ static void 
io_uring_tx_zerocopy_callback(struct sk_buff *skb, > struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); > struct io_kiocb *notif = cmd_to_io_kiocb(nd); > > + if (nd->zc_report) { > + if (success && !nd->zc_used && skb) > + WRITE_ONCE(nd->zc_used, true); > + else if (!success && !nd->zc_copied) > + WRITE_ONCE(nd->zc_copied, true); > + } > + > if (refcount_dec_and_test(&uarg->refcnt)) { > notif->io_task_work.func = __io_notif_complete_tw; > io_req_task_work_add(notif); > @@ -55,6 +66,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) > nd->account_pages = 0; > nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; > nd->uarg.callback = io_uring_tx_zerocopy_callback; > + nd->zc_report = nd->zc_used = nd->zc_copied = false; > refcount_set(&nd->uarg.refcnt, 1); > return notif; > } > diff --git a/io_uring/notif.h b/io_uring/notif.h > index e4fbcae0f3fd..6be2e5ae8581 100644 > --- a/io_uring/notif.h > +++ b/io_uring/notif.h > @@ -15,6 +15,9 @@ struct io_notif_data { > struct file *file; > struct ubuf_info uarg; > unsigned long account_pages; > + bool zc_report; > + bool zc_used; > + bool zc_copied; > }; > > void io_notif_flush(struct io_kiocb *notif); >
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ab7458033ee3..751fc4eff8d1 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -296,10 +296,28 @@ enum io_uring_op {
  *
  * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
  *				the buf_index field.
+ *
+ * IORING_SEND_NOTIF_REPORT_USAGE
+ *				If SEND[MSG]_ZC should report
+ *				the zerocopy usage in cqe.res
+ *				for the IORING_CQE_F_NOTIF cqe.
+ *				IORING_NOTIF_USAGE_ZC_USED if zero copy was used
+ *				(at least partially).
+ *				IORING_NOTIF_USAGE_ZC_COPIED if data was copied
+ *				(at least partially).
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 #define IORING_RECV_MULTISHOT		(1U << 1)
 #define IORING_RECVSEND_FIXED_BUF	(1U << 2)
+#define IORING_SEND_NOTIF_REPORT_USAGE	(1U << 3)
+
+/*
+ * cqe.res for IORING_CQE_F_NOTIF if
+ * IORING_SEND_NOTIF_REPORT_USAGE was requested
+ */
+#define IORING_NOTIF_USAGE_ZC_USED	(1U << 0)
+#define IORING_NOTIF_USAGE_ZC_COPIED	(1U << 31)
+
 
 /*
  * accept flags stored in sqe->ioprio
diff --git a/io_uring/net.c b/io_uring/net.c
index 735eec545115..a79d7d349e19 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -946,9 +946,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	zc->flags = READ_ONCE(sqe->ioprio);
 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
-			  IORING_RECVSEND_FIXED_BUF))
+			  IORING_RECVSEND_FIXED_BUF |
+			  IORING_SEND_NOTIF_REPORT_USAGE))
 		return -EINVAL;
-	notif = zc->notif = io_alloc_notif(ctx);
+	notif = zc->notif = io_alloc_notif(ctx,
+			zc->flags & IORING_SEND_NOTIF_REPORT_USAGE);
 	if (!notif)
 		return -ENOMEM;
 	notif->cqe.user_data = req->cqe.user_data;
diff --git a/io_uring/notif.c b/io_uring/notif.c
index e37c6569d82e..3844e3c8ad7e 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -3,13 +3,14 @@
 #include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/net.h>
+#include <linux/errqueue.h>
 #include <linux/io_uring.h>
 
 #include "io_uring.h"
 #include "notif.h"
 #include "rsrc.h"
 
-static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
+static inline void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
 {
 	struct io_notif_data *nd = io_notif_to_data(notif);
 	struct io_ring_ctx *ctx = notif->ctx;
@@ -21,20 +22,46 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
 	io_req_task_complete(notif, locked);
 }
 
-static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
-					  struct ubuf_info *uarg,
-					  bool success)
+static inline void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
+						 struct ubuf_info *uarg,
+						 bool success)
 {
 	struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
 	struct io_kiocb *notif = cmd_to_io_kiocb(nd);
 
 	if (refcount_dec_and_test(&uarg->refcnt)) {
-		notif->io_task_work.func = __io_notif_complete_tw;
 		io_req_task_work_add(notif);
 	}
 }
 
-struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
+static void __io_notif_complete_tw_report_usage(struct io_kiocb *notif, bool *locked)
+{
+	struct io_notif_data *nd = io_notif_to_data(notif);
+
+	if (likely(nd->zc_used))
+		notif->cqe.res |= IORING_NOTIF_USAGE_ZC_USED;
+
+	if (unlikely(nd->zc_copied))
+		notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED;
+
+	__io_notif_complete_tw(notif, locked);
+}
+
+static void io_uring_tx_zerocopy_callback_report_usage(struct sk_buff *skb,
+							struct ubuf_info *uarg,
+							bool success)
+{
+	struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
+
+	if (success && !nd->zc_used && skb)
+		nd->zc_used = true;
+	else if (unlikely(!success && !nd->zc_copied))
+		nd->zc_copied = true;
+
+	io_uring_tx_zerocopy_callback(skb, uarg, success);
+}
+
+struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, bool report_usage)
 	__must_hold(&ctx->uring_lock)
 {
 	struct io_kiocb *notif;
@@ -54,7 +81,14 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
 	nd = io_notif_to_data(notif);
 	nd->account_pages = 0;
 	nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
-	nd->uarg.callback = io_uring_tx_zerocopy_callback;
+	if (report_usage) {
+		nd->zc_used = nd->zc_copied = false;
+		nd->uarg.callback = io_uring_tx_zerocopy_callback_report_usage;
+		notif->io_task_work.func = __io_notif_complete_tw_report_usage;
+	} else {
+		nd->uarg.callback = io_uring_tx_zerocopy_callback;
+		notif->io_task_work.func = __io_notif_complete_tw;
+	}
 	refcount_set(&nd->uarg.refcnt, 1);
 	return notif;
 }
@@ -66,7 +100,6 @@ void io_notif_flush(struct io_kiocb *notif)
 
 	/* drop slot's master ref */
 	if (refcount_dec_and_test(&nd->uarg.refcnt)) {
-		notif->io_task_work.func = __io_notif_complete_tw;
 		io_req_task_work_add(notif);
 	}
 }
diff --git a/io_uring/notif.h b/io_uring/notif.h
index 5b4d710c8ca5..5ac7a2745e52 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -13,10 +13,12 @@ struct io_notif_data {
 	struct file		*file;
 	struct ubuf_info	uarg;
 	unsigned long		account_pages;
+	bool			zc_used;
+	bool			zc_copied;
 };
 
 void io_notif_flush(struct io_kiocb *notif);
-struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx);
+struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, bool report_usage);
 
 static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif)
 {
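For readers who want to try this from userspace, here is a rough sketch of how the opt-in could be consumed. It is not part of the series: it uses the IORING_SEND_ZC_REPORT_USAGE / IORING_NOTIF_USAGE_ZC_COPIED names from the later revision in this thread rather than the original flags above, and it assumes a liburing build that already provides io_uring_prep_send_zc().

/*
 * Sketch only -- not from the thread. Submit one SEND_ZC with usage
 * reporting requested and inspect cqe->res of the notification CQE.
 */
#include <liburing.h>
#include <errno.h>
#include <stdio.h>

#ifndef IORING_SEND_ZC_REPORT_USAGE
#define IORING_SEND_ZC_REPORT_USAGE	(1U << 3)
#endif
#ifndef IORING_NOTIF_USAGE_ZC_COPIED
#define IORING_NOTIF_USAGE_ZC_COPIED	(1U << 31)
#endif

static int send_zc_with_report(struct io_uring *ring, int sockfd,
			       const void *buf, size_t len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int i, ret;

	if (!sqe)
		return -EBUSY;

	/* the zc_flags argument ends up in sqe->ioprio, where io_send_zc_prep() reads it */
	io_uring_prep_send_zc(sqe, sockfd, buf, len, 0,
			      IORING_SEND_ZC_REPORT_USAGE);

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;

	/* expect two CQEs: the send completion (F_MORE) and the notification (F_NOTIF) */
	for (i = 0; i < 2; i++) {
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0)
			return ret;
		if ((cqe->flags & IORING_CQE_F_NOTIF) &&
		    (cqe->res & IORING_NOTIF_USAGE_ZC_COPIED))
			fprintf(stderr, "zerocopy send fell back to copying\n");
		io_uring_cqe_seen(ring, cqe);
	}
	return 0;
}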