IORING_SEND_NOTIF_REPORT_USAGE (was Re: IORING_CQE_F_COPIED)

Message ID acad81e4-c2ef-59cc-5f0c-33b99082d270@samba.org (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series IORING_SEND_NOTIF_REPORT_USAGE (was Re: IORING_CQE_F_COPIED)

Commit Message

Stefan Metzmacher Oct. 20, 2022, 10:04 a.m. UTC
Hi Pavel,

> Yep, sth like that should do, but let's guard against
> spurious net_zcopy_put() just in case.
> 
> used = false;
> copied = false;
> 
> callback(skb, success, ubuf) {
>      if (skb)
>          used = true;
>      if (!success)
>          copied = true;
> }
> complete() {
>      if (!used || copied)
>          set_flag(IORING_CQE_F_COPIED);
> }
> 
>> And __io_notif_complete_tw still needs:
>>
>>          if (!nd->zero_copied)
>>                  notif->cqe.flags |= IORING_CQE_F_COPIED;
> 
> Which can be shoved in a custom callback
> 

Ok, got the idea.

> I'm more concerned about future changes around it, but there won't
> be extra ifs.
> 
> #define COMMON_FLAGS (RECVSEND_FIRST_POLL|...)
> #define ALL_FLAGS (COMMON_FLAGS|RECVSEND_PROBE)
> 
> if (flags & ~COMMON_FLAGS) {
>      if (flags & ~ALL_FLAGS)
>          return err;
>      if (flags & RECVSEND_PROBE)
>          set_callback(notif);
> }

So far I came up with an IORING_SEND_NOTIF_REPORT_USAGE opt-in flag
and the reporting is done in cqe.res with IORING_NOTIF_USAGE_ZC_USED (0x00000001)
and/or IORING_NOTIF_USAGE_ZC_COPIED (0x80000000). So the caller is also
able to notice that some parts were able to use zero copy, while other
fragments were copied.
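
For illustration, a consumer of the notif cqe would then do something
like this (just a sketch against the proposed uapi, untested; assumes
<liburing.h> for the cqe definitions):

#include <stdio.h>
#include <liburing.h>

/* sketch: inspect the IORING_CQE_F_NOTIF completion */
static void handle_notif_cqe(const struct io_uring_cqe *cqe)
{
	if (!(cqe->flags & IORING_CQE_F_NOTIF))
		return;

	if (cqe->res & IORING_NOTIF_USAGE_ZC_USED)
		fprintf(stderr, "zero copy was used (at least partially)\n");
	if (cqe->res & IORING_NOTIF_USAGE_ZC_COPIED)
		fprintf(stderr, "data was copied (at least partially)\n");
}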

I haven't tested it yet, but I want to post it early...

What do you think?

metze

Comments

Pavel Begunkov Oct. 20, 2022, 1:46 p.m. UTC | #1
On 10/20/22 11:04, Stefan Metzmacher wrote:
> Hi Pavel,
[...]
> 
> So far I came up with an IORING_SEND_NOTIF_REPORT_USAGE opt-in flag
> and the reporting is done in cqe.res with IORING_NOTIF_USAGE_ZC_USED (0x00000001)
> and/or IORING_NOTIF_USAGE_ZC_COPIED (0x80000000). So the caller is also
> able to notice that some parts were able to use zero copy, while other
> fragments were copied.

Are we really interested in multihoming and probably some very edge cases?
I'd argue we're not and it should be a single bool hint indicating whether
zc is viable or not. It can do more complex calculations _if_ needed, e.g.
looking inside skb's and figuring out how many bytes were copied, but as for
me it should better be turned into a single bool in the end. Could also be
the number of bytes copied, but I don't think we can have the accuracy for
that (e.g. what are we going to return if some protocol duplicates an skb
and sends it to 2 different devices or is processing it in a pipeline?)

So the question is what is the use case for having 2 flags?

btw, now we've got another example why the report flag is a good idea,
we can't use cqe.res unconditionally because we want to have a "one CQE
per request" mode, but it's fine if we make it and the report flag
mutually exclusive.


> I haven't tested it yet, but I want to post it early...
> 
> What do you think?

Keeping in mind potential backporting let's make it as simple and
short as possible first and then do optimisations on top.

> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index ab7458033ee3..751fc4eff8d1 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -296,10 +296,28 @@ enum io_uring_op {
>    *
>    * IORING_RECVSEND_FIXED_BUF    Use registered buffers, the index is stored in
>    *                the buf_index field.
> + *
> + * IORING_SEND_NOTIF_REPORT_USAGE
> + *                If SEND[MSG]_ZC should report
> + *                the zerocopy usage in cqe.res
> + *                for the IORING_CQE_F_NOTIF cqe.
> + *                IORING_NOTIF_USAGE_ZC_USED if zero copy was used
> + *                (at least partially).
> + *                IORING_NOTIF_USAGE_ZC_COPIED if data was copied
> + *                (at least partially).
>    */
>   #define IORING_RECVSEND_POLL_FIRST    (1U << 0)
>   #define IORING_RECV_MULTISHOT        (1U << 1)
>   #define IORING_RECVSEND_FIXED_BUF    (1U << 2)
> +#define IORING_SEND_NOTIF_REPORT_USAGE    (1U << 3)
> +
> +/*
> + * cqe.res for IORING_CQE_F_NOTIF if
> + * IORING_SEND_NOTIF_REPORT_USAGE was requested
> + */
> +#define IORING_NOTIF_USAGE_ZC_USED    (1U << 0)
> +#define IORING_NOTIF_USAGE_ZC_COPIED    (1U << 31)
> +
> 
>   /*
>    * accept flags stored in sqe->ioprio
> diff --git a/io_uring/net.c b/io_uring/net.c
> index 735eec545115..a79d7d349e19 100644
> --- a/io_uring/net.c
> +++ b/io_uring/net.c
> @@ -946,9 +946,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> 
>       zc->flags = READ_ONCE(sqe->ioprio);
>       if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
> -              IORING_RECVSEND_FIXED_BUF))
> +              IORING_RECVSEND_FIXED_BUF |
> +              IORING_SEND_NOTIF_REPORT_USAGE))
>           return -EINVAL;
> -    notif = zc->notif = io_alloc_notif(ctx);
> +    notif = zc->notif = io_alloc_notif(ctx,
> +                       zc->flags & IORING_SEND_NOTIF_REPORT_USAGE);
>       if (!notif)
>           return -ENOMEM;
>       notif->cqe.user_data = req->cqe.user_data;
> diff --git a/io_uring/notif.c b/io_uring/notif.c
> index e37c6569d82e..3844e3c8ad7e 100644
> --- a/io_uring/notif.c
> +++ b/io_uring/notif.c
> @@ -3,13 +3,14 @@
>   #include <linux/file.h>
>   #include <linux/slab.h>
>   #include <linux/net.h>
> +#include <linux/errqueue.h>

Is it needed?

>   #include <linux/io_uring.h>
> 
>   #include "io_uring.h"
>   #include "notif.h"
>   #include "rsrc.h"
> 
> -static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
> +static inline void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)

Let's remove this hunk with inlining and do it later

>   {
>       struct io_notif_data *nd = io_notif_to_data(notif);
>       struct io_ring_ctx *ctx = notif->ctx;
> @@ -21,20 +22,46 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
>       io_req_task_complete(notif, locked);
>   }
> 
> -static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
> -                      struct ubuf_info *uarg,
> -                      bool success)
> +static inline void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
> +                         struct ubuf_info *uarg,
> +                         bool success)

This one as well.


>   {
>       struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
>       struct io_kiocb *notif = cmd_to_io_kiocb(nd);
> 
>       if (refcount_dec_and_test(&uarg->refcnt)) {
> -        notif->io_task_work.func = __io_notif_complete_tw;
>           io_req_task_work_add(notif);
>       }
>   }
> 
> -struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
> +static void __io_notif_complete_tw_report_usage(struct io_kiocb *notif, bool *locked)

Just shove all that into __io_notif_complete_tw().

> +{
> +    struct io_notif_data *nd = io_notif_to_data(notif);
> +
> +    if (likely(nd->zc_used))
> +        notif->cqe.res |= IORING_NOTIF_USAGE_ZC_USED;
> +
> +    if (unlikely(nd->zc_copied))
> +        notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED;
> +
> +    __io_notif_complete_tw(notif, locked);
> +}
> +
> +static void io_uring_tx_zerocopy_callback_report_usage(struct sk_buff *skb,
> +                            struct ubuf_info *uarg,
> +                            bool success)
> +{
> +    struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
> +
> +    if (success && !nd->zc_used && skb)
> +        nd->zc_used = true;
> +    else if (unlikely(!success && !nd->zc_copied))
> +        nd->zc_copied = true;

It's fine but racy, so let's WRITE_ONCE() to indicate it.

> +
> +    io_uring_tx_zerocopy_callback(skb, uarg, success);
> +}
> +
> +struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, bool report_usage)
>       __must_hold(&ctx->uring_lock)

And it's better to kill this argument and init zc_used/copied
unconditionally.

>   {
>       struct io_kiocb *notif;
> @@ -54,7 +81,14 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
>       nd = io_notif_to_data(notif);
>       nd->account_pages = 0;
>       nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
> -    nd->uarg.callback = io_uring_tx_zerocopy_callback;
> +    if (report_usage) {
> +        nd->zc_used = nd->zc_copied = false;
> +        nd->uarg.callback = io_uring_tx_zerocopy_callback_report_usage;
> +        notif->io_task_work.func = __io_notif_complete_tw_report_usage;
> +    } else {
> +        nd->uarg.callback = io_uring_tx_zerocopy_callback;
> +        notif->io_task_work.func = __io_notif_complete_tw;
> +    }
>       refcount_set(&nd->uarg.refcnt, 1);
>       return notif;
>   }
> @@ -66,7 +100,6 @@ void io_notif_flush(struct io_kiocb *notif)
> 
>       /* drop slot's master ref */
>       if (refcount_dec_and_test(&nd->uarg.refcnt)) {
> -        notif->io_task_work.func = __io_notif_complete_tw;
>           io_req_task_work_add(notif);
>       }
>   }
> diff --git a/io_uring/notif.h b/io_uring/notif.h
> index 5b4d710c8ca5..5ac7a2745e52 100644
> --- a/io_uring/notif.h
> +++ b/io_uring/notif.h
> @@ -13,10 +13,12 @@ struct io_notif_data {
>       struct file        *file;
>       struct ubuf_info    uarg;
>       unsigned long        account_pages;
> +    bool            zc_used;
> +    bool            zc_copied;

IIRC io_notif_data is fully packed in 6.1, so placing zc_{used,copied}
there might complicate backporting (if any). We can place them in io_kiocb
directly and move in 6.2. Alternatively account_pages doesn't have to be
long.
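
E.g. something like this (only a sketch of that alternative, assuming
the usual LP64 layout; not necessarily what should be applied):

struct io_notif_data {
	struct file		*file;
	struct ubuf_info	uarg;
	/* unsigned instead of unsigned long frees up 4 bytes on 64-bit,
	 * enough for the two bools without growing the struct */
	unsigned		account_pages;
	bool			zc_used;
	bool			zc_copied;
};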

>   };
> 
>   void io_notif_flush(struct io_kiocb *notif);
> -struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx);
> +struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, bool report_usage);
> 
>   static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif)
>   {
>
Stefan Metzmacher Oct. 20, 2022, 2:51 p.m. UTC | #2
Hi Pavel,

>> So far I came up with an IORING_SEND_NOTIF_REPORT_USAGE opt-in flag
>> and the reporting is done in cqe.res with IORING_NOTIF_USAGE_ZC_USED (0x00000001)
>> and/or IORING_NOTIF_USAGE_ZC_COPIED (0x80000000). So the caller is also
>> able to notice that some parts were able to use zero copy, while other
>> fragments were copied.
> 
> Are we really interested in multihoming and probably some very edge cases?
> I'd argue we're not and it should be a single bool hint indicating whether
> zc is viable or not. It can do more complex calculations _if_ needed, e.g.
> looking inside skb's and figuring out how many bytes were copied, but as for
> me it should better be turned into a single bool in the end. Could also be
> the number of bytes copied, but I don't think we can have the accuracy for
> that (e.g. what are we going to return if some protocol duplicates an skb
> and sends it to 2 different devices or is processing it in a pipeline?)
> 
> So the question is what is the use case for having 2 flags?

It's mostly for debugging.

> btw, now we've got another example why the report flag is a good idea,

I don't understand that line...

> we can't use cqe.res unconditionally because we want to have a "one CQE
> per request" mode, but it's fine if we make it and the report flag
> mutually exclusive.

You mean we can add an optimized case where SEND[MSG]_ZC would not
generate F_MORE and would skip F_NOTIF, because we copied or the
transmission path was really fast?

Then I'd move to IORING_CQE_F_COPIED again...

>> I haven't tested it yet, but I want to post it early...
>>
>> What do you think?
> 
> Keeping in mind potential backporting let's make it as simple and
> short as possible first and then do optimisations on top.

ok.

>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>> index ab7458033ee3..751fc4eff8d1 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -296,10 +296,28 @@ enum io_uring_op {
>>    *
>>    * IORING_RECVSEND_FIXED_BUF    Use registered buffers, the index is stored in
>>    *                the buf_index field.
>> + *
>> + * IORING_SEND_NOTIF_REPORT_USAGE
>> + *                If SEND[MSG]_ZC should report
>> + *                the zerocopy usage in cqe.res
>> + *                for the IORING_CQE_F_NOTIF cqe.
>> + *                IORING_NOTIF_USAGE_ZC_USED if zero copy was used
>> + *                (at least partially).
>> + *                IORING_NOTIF_USAGE_ZC_COPIED if data was copied
>> + *                (at least partially).
>>    */
>>   #define IORING_RECVSEND_POLL_FIRST    (1U << 0)
>>   #define IORING_RECV_MULTISHOT        (1U << 1)
>>   #define IORING_RECVSEND_FIXED_BUF    (1U << 2)
>> +#define IORING_SEND_NOTIF_REPORT_USAGE    (1U << 3)
>> +
>> +/*
>> + * cqe.res for IORING_CQE_F_NOTIF if
>> + * IORING_SEND_NOTIF_REPORT_USAGE was requested
>> + */
>> +#define IORING_NOTIF_USAGE_ZC_USED    (1U << 0)
>> +#define IORING_NOTIF_USAGE_ZC_COPIED    (1U << 31)
>> +
>>
>>   /*
>>    * accept flags stored in sqe->ioprio
>> diff --git a/io_uring/net.c b/io_uring/net.c
>> index 735eec545115..a79d7d349e19 100644
>> --- a/io_uring/net.c
>> +++ b/io_uring/net.c
>> @@ -946,9 +946,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>
>>       zc->flags = READ_ONCE(sqe->ioprio);
>>       if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
>> -              IORING_RECVSEND_FIXED_BUF))
>> +              IORING_RECVSEND_FIXED_BUF |
>> +              IORING_SEND_NOTIF_REPORT_USAGE))
>>           return -EINVAL;
>> -    notif = zc->notif = io_alloc_notif(ctx);
>> +    notif = zc->notif = io_alloc_notif(ctx,
>> +                       zc->flags & IORING_SEND_NOTIF_REPORT_USAGE);
>>       if (!notif)
>>           return -ENOMEM;
>>       notif->cqe.user_data = req->cqe.user_data;
>> diff --git a/io_uring/notif.c b/io_uring/notif.c
>> index e37c6569d82e..3844e3c8ad7e 100644
>> --- a/io_uring/notif.c
>> +++ b/io_uring/notif.c
>> @@ -3,13 +3,14 @@
>>   #include <linux/file.h>
>>   #include <linux/slab.h>
>>   #include <linux/net.h>
>> +#include <linux/errqueue.h>
> 
> Is it needed?

No

>>   #include <linux/io_uring.h>
>>
>>   #include "io_uring.h"
>>   #include "notif.h"
>>   #include "rsrc.h"
>>
>> -static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
>> +static inline void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
> 
> Let's remove this hunk with inlining and do it later
> 
>>   {
>>       struct io_notif_data *nd = io_notif_to_data(notif);
>>       struct io_ring_ctx *ctx = notif->ctx;
>> @@ -21,20 +22,46 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
>>       io_req_task_complete(notif, locked);
>>   }
>>
>> -static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
>> -                      struct ubuf_info *uarg,
>> -                      bool success)
>> +static inline void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
>> +                         struct ubuf_info *uarg,
>> +                         bool success)
> 
> This one as well.
> 
> 
>>   {
>>       struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
>>       struct io_kiocb *notif = cmd_to_io_kiocb(nd);
>>
>>       if (refcount_dec_and_test(&uarg->refcnt)) {
>> -        notif->io_task_work.func = __io_notif_complete_tw;
>>           io_req_task_work_add(notif);
>>       }
>>   }
>>
>> -struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
>> +static void __io_notif_complete_tw_report_usage(struct io_kiocb *notif, bool *locked)
> 
> Just shove all that into __io_notif_complete_tw().

Ok, and then optimize later?

Otherwise we could have IORING_CQE_F_COPIED by default without opt-in
flag...

>> +static void io_uring_tx_zerocopy_callback_report_usage(struct sk_buff *skb,
>> +                            struct ubuf_info *uarg,
>> +                            bool success)
>> +{
>> +    struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
>> +
>> +    if (success && !nd->zc_used && skb)
>> +        nd->zc_used = true;
>> +    else if (unlikely(!success && !nd->zc_copied))
>> +        nd->zc_copied = true;
> 
> It's fine but racy, so let's WRITE_ONCE() to indicate it.

I don't see how this could be a problem, but I can add it.

>> diff --git a/io_uring/notif.h b/io_uring/notif.h
>> index 5b4d710c8ca5..5ac7a2745e52 100644
>> --- a/io_uring/notif.h
>> +++ b/io_uring/notif.h
>> @@ -13,10 +13,12 @@ struct io_notif_data {
>>       struct file        *file;
>>       struct ubuf_info    uarg;
>>       unsigned long        account_pages;
>> +    bool            zc_used;
>> +    bool            zc_copied;
> 
> IIRC io_notif_data is fully packed in 6.1, so placing zc_{used,copied}
> there might complicate backporting (if any). We can place them in io_kiocb
> directly and move in 6.2. Alternatively account_pages doesn't have to be
> long.

As far as I can see kernel-dk-block/io_uring-6.1 already has your
shrink patches included...
Pavel Begunkov Oct. 20, 2022, 3:31 p.m. UTC | #3
On 10/20/22 15:51, Stefan Metzmacher wrote:
> Hi Pavel,
> 
>>> So far I came up with an IORING_SEND_NOTIF_REPORT_USAGE opt-in flag
>>> and the reporting is done in cqe.res with IORING_NOTIF_USAGE_ZC_USED (0x00000001)
>>> and/or IORING_NOTIF_USAGE_ZC_COPIED (0x80000000). So the caller is also
>>> able to notice that some parts were able to use zero copy, while other
>>> fragments were copied.
>>
>> Are we really interested in multihoming and probably some very edge cases?
>> I'd argue we're not and it should be a single bool hint indicating whether
>> zc is viable or not. It can do more complex calculations _if_ needed, e.g.
>> looking inside skb's and figuring out how many bytes were copied, but as for
>> me it should better be turned into a single bool in the end. Could also be
>> the number of bytes copied, but I don't think we can have the accuracy for
>> that (e.g. what are we going to return if some protocol duplicates an skb
>> and sends it to 2 different devices or is processing it in a pipeline?)
>>
>> So the question is what is the use case for having 2 flags?
> 
> It's mostly for debugging.

Ok, then it sounds like we don't need it.


>> btw, now we've got another example why the report flag is a good idea,
> 
> I don't understand that line...

I'm just saying that IORING_SEND_NOTIF_* instead of unconditional reporting
is more flexible and extensible from the uapi perspective.


>> we can't use cqe.res unconditionally because we want to have a "one CQE
>> per request" mode, but it's fine if we make it and the report flag
>> mutually exclusive.
> 
> You mean we can add an optimized case where SEND[MSG]_ZC would not
> generate F_MORE and would skip F_NOTIF, because we copied or the
> transmission path was really fast?

It is rather about optionally omitting the first (aka completion) cqe and
posting only the notification cqe, which makes a lot of sense for UDP and
some TCP use cases.


> Then I'd move to IORING_CQE_F_COPIED again...
[...]
>>> -struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
>>> +static void __io_notif_complete_tw_report_usage(struct io_kiocb *notif, bool *locked)
>>
>> Just shove all that into __io_notif_complete_tw().
> 
> Ok, and then optimize later?

Right, I'm just tired of backporting patches by hand :)


> Otherwise we could have IORING_CQE_F_COPIED by default without opt-in
> flag...
> 
>>> +static void io_uring_tx_zerocopy_callback_report_usage(struct sk_buff *skb,
>>> +                            struct ubuf_info *uarg,
>>> +                            bool success)
>>> +{
>>> +    struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
>>> +
>>> +    if (success && !nd->zc_used && skb)
>>> +        nd->zc_used = true;
>>> +    else if (unlikely(!success && !nd->zc_copied))
>>> +        nd->zc_copied = true;
>>
>> It's fine but racy, so let's WRITE_ONCE() to indicate it.
> 
> I don't see how this could be a problem, but I can add it.

It's not a problem, but better to be a little bit more explicit
about parallel writes.
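
For reference, WRITE_ONCE() boils down to a single volatile store
(simplified from include/asm-generic/rwonce.h), which keeps the
compiler from tearing, fusing or eliding the store and documents the
intentional data race:

/* simplified sketch of the kernel's WRITE_ONCE() */
#define WRITE_ONCE(x, val)					\
do {								\
	*(volatile typeof(x) *)&(x) = (val);			\
} while (0)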


>>> diff --git a/io_uring/notif.h b/io_uring/notif.h
>>> index 5b4d710c8ca5..5ac7a2745e52 100644
>>> --- a/io_uring/notif.h
>>> +++ b/io_uring/notif.h
>>> @@ -13,10 +13,12 @@ struct io_notif_data {
>>>       struct file        *file;
>>>       struct ubuf_info    uarg;
>>>       unsigned long        account_pages;
>>> +    bool            zc_used;
>>> +    bool            zc_copied;
>>
>> IIRC io_notif_data is fully packed in 6.1, so placing zc_{used,copied}
>> there might complicate backporting (if any). We can place them in io_kiocb
>> directly and move in 6.2. Alternatively account_pages doesn't have to be
>> long.
> 
> As far as I can see kernel-dk-block/io_uring-6.1 already has your
> shrink patches included...

Sorry, I mean 6.0
Stefan Metzmacher Oct. 21, 2022, 9:36 a.m. UTC | #4
Hi Pavel,

>>>> So far I came up with an IORING_SEND_NOTIF_REPORT_USAGE opt-in flag
>>>> and the reporting is done in cqe.res with IORING_NOTIF_USAGE_ZC_USED (0x00000001)
>>>> and/or IORING_NOTIF_USAGE_ZC_COPIED (0x80000000). So the caller is also
>>>> able to notice that some parts were able to use zero copy, while other
>>>> fragments were copied.
>>>
>>> Are we really interested in multihoming and probably some very edge cases?
>>> I'd argue we're not and it should be a single bool hint indicating whether
>>> zc is viable or not. It can do more complex calculations _if_ needed, e.g.
>>> looking inside skb's and figuring out how many bytes were copied, but as for
>>> me it should better be turned into a single bool in the end. Could also be
>>> the number of bytes copied, but I don't think we can have the accuracy for
>>> that (e.g. what are we going to return if some protocol duplicates an skb
>>> and sends it to 2 different devices or is processing it in a pipeline?)
>>>
>>> So the question is what is the use case for having 2 flags?
>>
>> It's mostly for debugging.
> 
> Ok, then it sounds like we don't need it.

Maybe I could add some trace points to the callback?

>>> btw, now we've got another example why the report flag is a good idea,
>>
>> I don't understand that line...
> 
> I'm just saying that IORING_SEND_NOTIF_* instead of unconditional reporting
> is more flexible and extensible from the uapi perspective.

ok

>>> we can't use cqe.res unconditionally because we want to have a "one CQE
>>> per request" mode, but it's fine if we make it and the report flag
>>> mutually exclusive.
>>
>> You mean we can add an optimized case where SEND[MSG]_ZC would not
>> generate F_MORE and would skip F_NOTIF, because we copied or the
>> transmission path was really fast?
> 
> It is rather about optionally omitting the first (aka completion) cqe and
> posting only the notification cqe, which makes a lot of sense for UDP and
> some TCP use cases.

OK.

>> Then I'd move to IORING_CQE_F_COPIED again...
> [...]
>>>> -struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
>>>> +static void __io_notif_complete_tw_report_usage(struct io_kiocb *notif, bool *locked)
>>>
>>> Just shove all that into __io_notif_complete_tw().
>>
>> Ok, and then optimize later?
> 
> Right, I'm just tired of backporting patches by hand :)

ok, I just assumed it would be 6.1 only.

>> Otherwise we could have IORING_CQE_F_COPIED by default without opt-in
>> flag...

Do you still want an opt-in flag to get IORING_CQE_F_COPIED?
If so what name do you want it to be?

>>>> +static void io_uring_tx_zerocopy_callback_report_usage(struct sk_buff *skb,
>>>> +                            struct ubuf_info *uarg,
>>>> +                            bool success)
>>>> +{
>>>> +    struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
>>>> +
>>>> +    if (success && !nd->zc_used && skb)
>>>> +        nd->zc_used = true;
>>>> +    else if (unlikely(!success && !nd->zc_copied))
>>>> +        nd->zc_copied = true;
>>>
>>> It's fine but racy, so let's WRITE_ONCE() to indicate it.
>>
>> I don't see how this could be a problem, but I can add it.
> 
> It's not a problem, but better to be a little bit more explicit
> about parallel writes.

ok.

>>>> diff --git a/io_uring/notif.h b/io_uring/notif.h
>>>> index 5b4d710c8ca5..5ac7a2745e52 100644
>>>> --- a/io_uring/notif.h
>>>> +++ b/io_uring/notif.h
>>>> @@ -13,10 +13,12 @@ struct io_notif_data {
>>>>       struct file        *file;
>>>>       struct ubuf_info    uarg;
>>>>       unsigned long        account_pages;
>>>> +    bool            zc_used;
>>>> +    bool            zc_copied;
>>>
>>> IIRC io_notif_data is fully packed in 6.1, so placing zc_{used,copied}
>>> there might complicate backporting (if any). We can place them in io_kiocb
>>> directly and move in 6.2. Alternatively account_pages doesn't have to be
>>> long.
>>
>> As far as I can see kernel-dk-block/io_uring-6.1 alread has your
>> shrink patches included...
> 
> Sorry, I mean 6.0

So you want to backport to 6.0?

Find the current version below, sizeof(struct io_kiocb) will grow from
3*64 + 24 to 3*64 + 32 (on x86_64) so it stays within 4 cache lines.
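
(If we want to keep an eye on that, a build-time check along these
lines would do; just a sketch, not part of the patch:)

/* sketch: catch accidental growth of io_kiocb past 4 cache lines */
static_assert(sizeof(struct io_kiocb) <= 4 * 64,
	      "struct io_kiocb no longer fits into 4 cache lines");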

I tried this first:

union {
   u8 iopoll_completed;
   struct {
     u8 zc_used:1;
     u8 zc_copied:1;
   };
};

But then WRITE_ONCE() complains about a bitfield write.

So let me know about the opt-in flag and I'll prepare real commits
including a patch that moves from struct io_kiocb to struct io_notif_data
on top.

metze

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f5b687a787a3..189152ad78d6 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -515,6 +515,9 @@ struct io_kiocb {
  	u8				opcode;
  	/* polled IO has completed */
  	u8				iopoll_completed;
+	/* these will be moved to struct io_notif_data in 6.1 */
+	bool				zc_used;
+	bool				zc_copied;
  	/*
  	 * Can be either a fixed buffer index, or used with provided buffers.
  	 * For the latter, before issue it points to the buffer group ID,
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ab7458033ee3..738d6234d1d9 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -350,6 +350,7 @@ struct io_uring_cqe {
  #define IORING_CQE_F_MORE		(1U << 1)
  #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
  #define IORING_CQE_F_NOTIF		(1U << 3)
+#define IORING_CQE_F_COPIED		(1U << 4)

  enum {
  	IORING_CQE_BUFFER_SHIFT		= 16,
diff --git a/io_uring/notif.c b/io_uring/notif.c
index e37c6569d82e..033aca064b10 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -18,6 +18,10 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
  		__io_unaccount_mem(ctx->user, nd->account_pages);
  		nd->account_pages = 0;
  	}
+
+	if (notif->zc_copied || !notif->zc_used)
+		notif->cqe.flags |= IORING_CQE_F_COPIED;
+
  	io_req_task_complete(notif, locked);
  }

@@ -28,6 +32,11 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
  	struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
  	struct io_kiocb *notif = cmd_to_io_kiocb(nd);

+	if (success && !notif->zc_used && skb)
+		WRITE_ONCE(notif->zc_used, true);
+	else if (!success && !notif->zc_copied)
+		WRITE_ONCE(notif->zc_copied, true);
+
  	if (refcount_dec_and_test(&uarg->refcnt)) {
  		notif->io_task_work.func = __io_notif_complete_tw;
  		io_req_task_work_add(notif);
@@ -55,6 +64,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
  	nd->account_pages = 0;
  	nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
  	nd->uarg.callback = io_uring_tx_zerocopy_callback;
+	notif->zc_used = notif->zc_copied = false;
  	refcount_set(&nd->uarg.refcnt, 1);
  	return notif;
  }
Pavel Begunkov Oct. 21, 2022, 11:09 a.m. UTC | #5
On 10/21/22 10:36, Stefan Metzmacher wrote:
> Hi Pavel,
[...]
>>> Right, I'm just tired of backporting patches by hand :)
> 
> ok, I just assumed it would be 6.1 only.

I'm fine with 6.1 only, it'd make things easier. I thought from
your first postings you wanted it in 6.0. Then we don't need to care
about the placing of the copied/used flags.

>>> Otherwise we could have IORING_CQE_F_COPIED by default without opt-in
>>> flag...
> 
> Do you still want an opt-in flag to get IORING_CQE_F_COPIED?
> If so what name do you want it to be?

Ala a IORING_SEND_* flag? Yes please.

*_REPORT_USAGE was fine but I'd make it IORING_SEND_ZC_REPORT_USAGE.
And can be extended if there is more info needed in the future.

And I don't mind using a bit in cqe->res, makes cflags less polluted.


>>>>> +static void io_uring_tx_zerocopy_callback_report_usage(struct sk_buff *skb,
>>>>> +                            struct ubuf_info *uarg,
>>>>> +                            bool success)
>>>>> +{
>>>>> +    struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
>>>>> +
>>>>> +    if (success && !nd->zc_used && skb)
>>>>> +        nd->zc_used = true;
>>>>> +    else if (unlikely(!success && !nd->zc_copied))
>>>>> +        nd->zc_copied = true;
>>>>
>>>> It's fine but racy, so let's WRITE_ONCE() to indicate it.
>>>
>>> I don't see how this could be a problem, but I can add it.
>>
>> It's not a problem, but better to be a little bit more explicit
>> about parallel writes.
> 
> ok.
> 
>>>>> diff --git a/io_uring/notif.h b/io_uring/notif.h
>>>>> index 5b4d710c8ca5..5ac7a2745e52 100644
>>>>> --- a/io_uring/notif.h
>>>>> +++ b/io_uring/notif.h
>>>>> @@ -13,10 +13,12 @@ struct io_notif_data {
>>>>>       struct file        *file;
>>>>>       struct ubuf_info    uarg;
>>>>>       unsigned long        account_pages;
>>>>> +    bool            zc_used;
>>>>> +    bool            zc_copied;
>>>>
>>>> IIRC io_notif_data is fully packed in 6.1, so placing zc_{used,copied}
>>>> there might complicate backporting (if any). We can place them in io_kiocb
>>>> directly and move in 6.2. Alternatively account_pages doesn't have to be
>>>> long.
>>>
>>> As far as I can see kernel-dk-block/io_uring-6.1 already has your
>>> shrink patches included...
>>
>> Sorry, I mean 6.0
> 
> So you want to backport to 6.0?
> 
> Find the current version below, sizeof(struct io_kiocb) will grow from
> 3*64 + 24 to 3*64 + 32 (on x86_64) so it stays within 4 cache lines.
> 
> I tried this first:
> 
> union {
>    u8 iopoll_completed;
>    struct {
>      u8 zc_used:1;
>      u8 zc_copied:1;
>    };
> };
> 
> But then WRITE_ONCE() complains about a bitfield write.

rightfully so, it can't be a bitfield as it would be racy, and not only
in theory this time: two callbacks updating bits in the same byte would
read-modify-write it concurrently and could lose one of the updates.


> So let me know about the opt-in flag and I'll prepare real commits
> including a patch that moves from struct io_kiocb to struct io_notif_data
> on top.

Yeah, better to be opt-in, but apart from it and comments above
looks good.


> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index f5b687a787a3..189152ad78d6 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -515,6 +515,9 @@ struct io_kiocb {
>       u8                opcode;
>       /* polled IO has completed */
>       u8                iopoll_completed;
> +    /* these will be moved to struct io_notif_data in 6.1 */
> +    bool                zc_used;
> +    bool                zc_copied;
>       /*
>        * Can be either a fixed buffer index, or used with provided buffers.
>        * For the latter, before issue it points to the buffer group ID,
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index ab7458033ee3..738d6234d1d9 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -350,6 +350,7 @@ struct io_uring_cqe {
>   #define IORING_CQE_F_MORE        (1U << 1)
>   #define IORING_CQE_F_SOCK_NONEMPTY    (1U << 2)
>   #define IORING_CQE_F_NOTIF        (1U << 3)
> +#define IORING_CQE_F_COPIED        (1U << 4)
> 
>   enum {
>       IORING_CQE_BUFFER_SHIFT        = 16,
> diff --git a/io_uring/notif.c b/io_uring/notif.c
> index e37c6569d82e..033aca064b10 100644
> --- a/io_uring/notif.c
> +++ b/io_uring/notif.c
> @@ -18,6 +18,10 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
>           __io_unaccount_mem(ctx->user, nd->account_pages);
>           nd->account_pages = 0;
>       }
> +
> +    if (notif->zc_copied || !notif->zc_used)
> +        notif->cqe.flags |= IORING_CQE_F_COPIED;
> +

As discussed above, should depend on IORING_SEND_ZC_REPORT_USAGE

>       io_req_task_complete(notif, locked);
>   }
> 
> @@ -28,6 +32,11 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
>       struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
>       struct io_kiocb *notif = cmd_to_io_kiocb(nd);
> 
> +    if (success && !notif->zc_used && skb)
> +        WRITE_ONCE(notif->zc_used, true);
> +    else if (!success && !notif->zc_copied)
> +        WRITE_ONCE(notif->zc_copied, true);
> +
>       if (refcount_dec_and_test(&uarg->refcnt)) {
>           notif->io_task_work.func = __io_notif_complete_tw;
>           io_req_task_work_add(notif);
> @@ -55,6 +64,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
>       nd->account_pages = 0;
>       nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
>       nd->uarg.callback = io_uring_tx_zerocopy_callback;
> +    notif->zc_used = notif->zc_copied = false;
>       refcount_set(&nd->uarg.refcnt, 1);
>       return notif;
>   }
>
Stefan Metzmacher Oct. 21, 2022, 2:03 p.m. UTC | #6
Am 21.10.22 um 13:09 schrieb Pavel Begunkov:
> On 10/21/22 10:36, Stefan Metzmacher wrote:
>> Hi Pavel,
> [...]
>>> Right, I'm just tired of backporting patches by hand :)
>>
>> ok, I just assumed it would be 6.1 only.
> 
> I'm fine with 6.1 only, it'd make things easier. I thought from
> your first postings you wanted it in 6.0. Then we don't need to care
> about the placing of the copied/used flags.
> 
>>>> Otherwise we could have IORING_CQE_F_COPIED by default without opt-in
>>>> flag...
>>
>> Do you still want an opt-in flag to get IORING_CQE_F_COPIED?
>> If so what name do you want it to be?
> 
> Ala a IORING_SEND_* flag? Yes please.
> 
> *_REPORT_USAGE was fine but I'd make it IORING_SEND_ZC_REPORT_USAGE.
> And can be extended if there is more info needed in the future.
> 
> And I don't mind using a bit in cqe->res, makes cflags less polluted.

So no worries about the delayed/skip sendmsg completion anymore?

Should I define it like this, ok?

#define IORING_NOTIF_USAGE_ZC_COPIED    (1U << 31)

See the full patch below...

metze

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d69ae7eba773..32e1f2a55b70 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -296,10 +296,24 @@ enum io_uring_op {
   *
   * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
   *				the buf_index field.
+ *
+ * IORING_SEND_NOTIF_REPORT_USAGE
+ *				If SEND[MSG]_ZC should report
+ *				the zerocopy usage in cqe.res
+ *				for the IORING_CQE_F_NOTIF cqe.
+ *				IORING_NOTIF_USAGE_ZC_COPIED if data was copied
+ *				(at least partially).
   */
  #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
  #define IORING_RECV_MULTISHOT		(1U << 1)
  #define IORING_RECVSEND_FIXED_BUF	(1U << 2)
+#define IORING_SEND_ZC_REPORT_USAGE	(1U << 3)
+
+/*
+ * cqe.res for IORING_CQE_F_NOTIF if
+ * IORING_SEND_ZC_REPORT_USAGE was requested
+ */
+#define IORING_NOTIF_USAGE_ZC_COPIED    (1U << 31)

  /*
   * accept flags stored in sqe->ioprio
diff --git a/io_uring/net.c b/io_uring/net.c
index 56078f47efe7..1aa3b50b3e82 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -939,7 +939,8 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)

  	zc->flags = READ_ONCE(sqe->ioprio);
  	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
-			  IORING_RECVSEND_FIXED_BUF))
+			  IORING_RECVSEND_FIXED_BUF |
+			  IORING_SEND_ZC_REPORT_USAGE))
  		return -EINVAL;
  	notif = zc->notif = io_alloc_notif(ctx);
  	if (!notif)
@@ -957,6 +958,9 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  		req->imu = READ_ONCE(ctx->user_bufs[idx]);
  		io_req_set_rsrc_node(notif, ctx, 0);
  	}
+	if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
+		io_notif_to_data(notif)->zc_report = true;
+	}

  	if (req->opcode == IORING_OP_SEND_ZC) {
  		if (READ_ONCE(sqe->__pad3[0]))
diff --git a/io_uring/notif.c b/io_uring/notif.c
index e37c6569d82e..4bfef10161fa 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -18,6 +18,10 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
  		__io_unaccount_mem(ctx->user, nd->account_pages);
  		nd->account_pages = 0;
  	}
+
+	if (nd->zc_report && (nd->zc_copied || !nd->zc_used))
+		notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED;
+
  	io_req_task_complete(notif, locked);
  }

@@ -28,6 +32,13 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
  	struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
  	struct io_kiocb *notif = cmd_to_io_kiocb(nd);

+	if (nd->zc_report) {
+		if (success && !nd->zc_used && skb)
+			WRITE_ONCE(nd->zc_used, true);
+		else if (!success && !nd->zc_copied)
+			WRITE_ONCE(nd->zc_copied, true);
+	}
+
  	if (refcount_dec_and_test(&uarg->refcnt)) {
  		notif->io_task_work.func = __io_notif_complete_tw;
  		io_req_task_work_add(notif);
@@ -55,6 +66,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
  	nd->account_pages = 0;
  	nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
  	nd->uarg.callback = io_uring_tx_zerocopy_callback;
+	nd->zc_report = nd->zc_used = nd->zc_copied = false;
  	refcount_set(&nd->uarg.refcnt, 1);
  	return notif;
  }
diff --git a/io_uring/notif.h b/io_uring/notif.h
index e4fbcae0f3fd..6be2e5ae8581 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -15,6 +15,9 @@ struct io_notif_data {
  	struct file		*file;
  	struct ubuf_info	uarg;
  	unsigned long		account_pages;
+	bool			zc_report;
+	bool			zc_used;
+	bool			zc_copied;
  };

  void io_notif_flush(struct io_kiocb *notif);
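
For completeness, userspace consumption of this would look roughly like
the following (a liburing sketch, untested; assumes liburing's
io_uring_prep_send_zc() helper and that the new defines are visible):

#include <stdio.h>
#include <liburing.h>

/* sketch: one zerocopy send, then check the notif cqe for the
 * copied hint; error handling mostly elided (a send that fails at
 * submission posts only one cqe without IORING_CQE_F_MORE) */
static int send_zc_and_check(struct io_uring *ring, int sockfd,
			     const void *buf, size_t len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int i, ret;

	if (!sqe)
		return -EBUSY;

	io_uring_prep_send_zc(sqe, sockfd, buf, len, 0,
			      IORING_SEND_ZC_REPORT_USAGE);
	io_uring_submit(ring);

	/* first cqe: the send result (IORING_CQE_F_MORE set),
	 * second cqe: the IORING_CQE_F_NOTIF notification */
	for (i = 0; i < 2; i++) {
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0)
			return ret;
		if ((cqe->flags & IORING_CQE_F_NOTIF) &&
		    (cqe->res & IORING_NOTIF_USAGE_ZC_COPIED))
			fprintf(stderr, "data was copied (at least partially)\n");
		io_uring_cqe_seen(ring, cqe);
	}
	return 0;
}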
Stefan Metzmacher Oct. 27, 2022, 8:47 a.m. UTC | #7
Hi Pavel,

>> Ala a IORING_SEND_* flag? Yes please.
>>
>> *_REPORT_USAGE was fine but I'd make it IORING_SEND_ZC_REPORT_USAGE.
>> And can be extended if there is more info needed in the future.
>>
>> And I don't mind using a bit in cqe->res, makes cflags less polluted.
> 
> So no worries about the delayed/skip sendmsg completion anymore?
> 
> Should I define it like this, ok?
> 
> #define IORING_NOTIF_USAGE_ZC_COPIED    (1U << 31)
> 
> See the full patch below...

Apart from still having IORING_SEND_NOTIF_REPORT_USAGE
in the comment... (which I'll fix...)

Is this now fine for you? Then I would post a real patch.

Thanks!
metze
Pavel Begunkov Oct. 27, 2022, 10:51 a.m. UTC | #8
On 10/21/22 15:03, Stefan Metzmacher wrote:
> Am 21.10.22 um 13:09 schrieb Pavel Begunkov:
>> On 10/21/22 10:36, Stefan Metzmacher wrote:
>>> Hi Pavel,
>> [...]
>>>> Right, I'm just tired of back porting patches by hand :)
>>>
>>> ok, I just assumed it would be 6.1 only.
>>
>> I'm fine with 6.1 only, it'd make things easier. I thought from
>> your first postings you wanted it 6.0. Then we don't need to care
>> about the placing of the copied/used flags.
>>
>>>>> Otherwise we could have IORING_CQE_F_COPIED by default without opt-in
>>>>> flag...
>>>
>>> Do you still want an opt-in flag to get IORING_CQE_F_COPIED?
>>> If so what name do you want it to be?
>>
>> Ala a IORING_SEND_* flag? Yes please.
>>
>> *_REPORT_USAGE was fine but I'd make it IORING_SEND_ZC_REPORT_USAGE.
>> And can be extended if there is more info needed in the future.
>>
>> And I don't mind using a bit in cqe->res, makes cflags less polluted.
> 
> So no worries about the delayed/skip sendmsg completion anymore?

I'll just make it incompatible with the reporting.

> Should I define it like this, ok?
> 
> #define IORING_NOTIF_USAGE_ZC_COPIED    (1U << 31)
> 
> See the full patch below...

Looks good


> metze
> 
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index d69ae7eba773..32e1f2a55b70 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -296,10 +296,24 @@ enum io_uring_op {
>    *
>    * IORING_RECVSEND_FIXED_BUF    Use registered buffers, the index is stored in
>    *                the buf_index field.
> +
> + * IORING_SEND_NOTIF_REPORT_USAGE
> + *                If SEND[MSG]_ZC should report
> + *                the zerocopy usage in cqe.res
> + *                for the IORING_CQE_F_NOTIF cqe.
> + *                IORING_NOTIF_USAGE_ZC_COPIED if data was copied
> + *                (at least partially).
>    */
>   #define IORING_RECVSEND_POLL_FIRST    (1U << 0)
>   #define IORING_RECV_MULTISHOT        (1U << 1)
>   #define IORING_RECVSEND_FIXED_BUF    (1U << 2)
> +#define IORING_SEND_ZC_REPORT_USAGE    (1U << 3)
> +
> +/*
> + * cqe.res for IORING_CQE_F_NOTIF if
> + * IORING_SEND_ZC_REPORT_USAGE was requested
> + */
> +#define IORING_NOTIF_USAGE_ZC_COPIED    (1U << 31)
> 
>   /*
>    * accept flags stored in sqe->ioprio
> diff --git a/io_uring/net.c b/io_uring/net.c
> index 56078f47efe7..1aa3b50b3e82 100644
> --- a/io_uring/net.c
> +++ b/io_uring/net.c
> @@ -939,7 +939,8 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> 
>       zc->flags = READ_ONCE(sqe->ioprio);
>       if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
> -              IORING_RECVSEND_FIXED_BUF))
> +              IORING_RECVSEND_FIXED_BUF |
> +              IORING_SEND_ZC_REPORT_USAGE))
>           return -EINVAL;
>       notif = zc->notif = io_alloc_notif(ctx);
>       if (!notif)
> @@ -957,6 +958,9 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>           req->imu = READ_ONCE(ctx->user_bufs[idx]);
>           io_req_set_rsrc_node(notif, ctx, 0);
>       }
> +    if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
> +        io_notif_to_data(notif)->zc_report = true;
> +    }
> 
>       if (req->opcode == IORING_OP_SEND_ZC) {
>           if (READ_ONCE(sqe->__pad3[0]))
> diff --git a/io_uring/notif.c b/io_uring/notif.c
> index e37c6569d82e..4bfef10161fa 100644
> --- a/io_uring/notif.c
> +++ b/io_uring/notif.c
> @@ -18,6 +18,10 @@ static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
>           __io_unaccount_mem(ctx->user, nd->account_pages);
>           nd->account_pages = 0;
>       }
> +
> +    if (nd->zc_report && (nd->zc_copied || !nd->zc_used))
> +        notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED;
> +
>       io_req_task_complete(notif, locked);
>   }
> 
> @@ -28,6 +32,13 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
>       struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
>       struct io_kiocb *notif = cmd_to_io_kiocb(nd);
> 
> +    if (nd->zc_report) {
> +        if (success && !nd->zc_used && skb)
> +            WRITE_ONCE(nd->zc_used, true);
> +        else if (!success && !nd->zc_copied)
> +            WRITE_ONCE(nd->zc_copied, true);
> +    }
> +
>       if (refcount_dec_and_test(&uarg->refcnt)) {
>           notif->io_task_work.func = __io_notif_complete_tw;
>           io_req_task_work_add(notif);
> @@ -55,6 +66,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
>       nd->account_pages = 0;
>       nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
>       nd->uarg.callback = io_uring_tx_zerocopy_callback;
> +    nd->zc_report = nd->zc_used = nd->zc_copied = false;
>       refcount_set(&nd->uarg.refcnt, 1);
>       return notif;
>   }
> diff --git a/io_uring/notif.h b/io_uring/notif.h
> index e4fbcae0f3fd..6be2e5ae8581 100644
> --- a/io_uring/notif.h
> +++ b/io_uring/notif.h
> @@ -15,6 +15,9 @@ struct io_notif_data {
>       struct file        *file;
>       struct ubuf_info    uarg;
>       unsigned long        account_pages;
> +    bool            zc_report;
> +    bool            zc_used;
> +    bool            zc_copied;
>   };
> 
>   void io_notif_flush(struct io_kiocb *notif);
>

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ab7458033ee3..751fc4eff8d1 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -296,10 +296,28 @@  enum io_uring_op {
   *
   * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
   *				the buf_index field.
+ *
+ * IORING_SEND_NOTIF_REPORT_USAGE
+ *				If SEND[MSG]_ZC should report
+ *				the zerocopy usage in cqe.res
+ *				for the IORING_CQE_F_NOTIF cqe.
+ *				IORING_NOTIF_USAGE_ZC_USED if zero copy was used
+ *				(at least partially).
+ *				IORING_NOTIF_USAGE_ZC_COPIED if data was copied
+ *				(at least partially).
   */
  #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
  #define IORING_RECV_MULTISHOT		(1U << 1)
  #define IORING_RECVSEND_FIXED_BUF	(1U << 2)
+#define IORING_SEND_NOTIF_REPORT_USAGE	(1U << 3)
+
+/*
+ * cqe.res for IORING_CQE_F_NOTIF if
+ * IORING_SEND_NOTIF_REPORT_USAGE was requested
+ */
+#define IORING_NOTIF_USAGE_ZC_USED	(1U << 0)
+#define IORING_NOTIF_USAGE_ZC_COPIED	(1U << 31)
+

  /*
   * accept flags stored in sqe->ioprio
diff --git a/io_uring/net.c b/io_uring/net.c
index 735eec545115..a79d7d349e19 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -946,9 +946,11 @@  int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)

  	zc->flags = READ_ONCE(sqe->ioprio);
  	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
-			  IORING_RECVSEND_FIXED_BUF))
+			  IORING_RECVSEND_FIXED_BUF |
+			  IORING_SEND_NOTIF_REPORT_USAGE))
  		return -EINVAL;
-	notif = zc->notif = io_alloc_notif(ctx);
+	notif = zc->notif = io_alloc_notif(ctx,
+					   zc->flags & IORING_SEND_NOTIF_REPORT_USAGE);
  	if (!notif)
  		return -ENOMEM;
  	notif->cqe.user_data = req->cqe.user_data;
diff --git a/io_uring/notif.c b/io_uring/notif.c
index e37c6569d82e..3844e3c8ad7e 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -3,13 +3,14 @@ 
  #include <linux/file.h>
  #include <linux/slab.h>
  #include <linux/net.h>
+#include <linux/errqueue.h>
  #include <linux/io_uring.h>

  #include "io_uring.h"
  #include "notif.h"
  #include "rsrc.h"

-static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
+static inline void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
  {
  	struct io_notif_data *nd = io_notif_to_data(notif);
  	struct io_ring_ctx *ctx = notif->ctx;
@@ -21,20 +22,46 @@  static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
  	io_req_task_complete(notif, locked);
  }

-static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
-					  struct ubuf_info *uarg,
-					  bool success)
+static inline void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
+						 struct ubuf_info *uarg,
+						 bool success)
  {
  	struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
  	struct io_kiocb *notif = cmd_to_io_kiocb(nd);

  	if (refcount_dec_and_test(&uarg->refcnt)) {
-		notif->io_task_work.func = __io_notif_complete_tw;
  		io_req_task_work_add(notif);
  	}
  }

-struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
+static void __io_notif_complete_tw_report_usage(struct io_kiocb *notif, bool *locked)
+{
+	struct io_notif_data *nd = io_notif_to_data(notif);
+
+	if (likely(nd->zc_used))
+		notif->cqe.res |= IORING_NOTIF_USAGE_ZC_USED;
+
+	if (unlikely(nd->zc_copied))
+		notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED;
+
+	__io_notif_complete_tw(notif, locked);
+}
+
+static void io_uring_tx_zerocopy_callback_report_usage(struct sk_buff *skb,
+							struct ubuf_info *uarg,
+							bool success)
+{
+	struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
+
+	if (success && !nd->zc_used && skb)
+		nd->zc_used = true;
+	else if (unlikely(!success && !nd->zc_copied))
+		nd->zc_copied = true;
+
+	io_uring_tx_zerocopy_callback(skb, uarg, success);
+}
+
+struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, bool report_usage)
  	__must_hold(&ctx->uring_lock)
  {
  	struct io_kiocb *notif;
@@ -54,7 +81,14 @@  struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
  	nd = io_notif_to_data(notif);
  	nd->account_pages = 0;
  	nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
-	nd->uarg.callback = io_uring_tx_zerocopy_callback;
+	if (report_usage) {
+		nd->zc_used = nd->zc_copied = false;
+		nd->uarg.callback = io_uring_tx_zerocopy_callback_report_usage;
+		notif->io_task_work.func = __io_notif_complete_tw_report_usage;
+	} else {
+		nd->uarg.callback = io_uring_tx_zerocopy_callback;
+		notif->io_task_work.func = __io_notif_complete_tw;
+	}
  	refcount_set(&nd->uarg.refcnt, 1);
  	return notif;
  }
@@ -66,7 +100,6 @@  void io_notif_flush(struct io_kiocb *notif)

  	/* drop slot's master ref */
  	if (refcount_dec_and_test(&nd->uarg.refcnt)) {
-		notif->io_task_work.func = __io_notif_complete_tw;
  		io_req_task_work_add(notif);
  	}
  }
diff --git a/io_uring/notif.h b/io_uring/notif.h
index 5b4d710c8ca5..5ac7a2745e52 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -13,10 +13,12 @@  struct io_notif_data {
  	struct file		*file;
  	struct ubuf_info	uarg;
  	unsigned long		account_pages;
+	bool			zc_used;
+	bool			zc_copied;
  };

  void io_notif_flush(struct io_kiocb *notif);
-struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx);
+struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, bool report_usage);

  static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif)
  {