
[v4,6/6] io_uring: add support for zone-append

Message ID 1595605762-17010-7-git-send-email-joshi.k@samsung.com (mailing list archive)
State New, archived
Series [v4,1/6] fs: introduce FMODE_ZONE_APPEND and IOCB_ZONE_APPEND

Commit Message

Kanchan Joshi July 24, 2020, 3:49 p.m. UTC
From: SelvaKumar S <selvakuma.s1@samsung.com>

Repurpose [cqe->res, cqe->flags] into a signed cqe->res64 to report the
64-bit written offset for zone append. An appending write that requires
the written location to be reported (conveyed by the IOCB_ZONE_APPEND
flag) is guaranteed not to be a short write; this avoids the need to
report the number of bytes copied.
The append offset is returned by the lower layer to io_uring via the
ret2 argument of the ki_complete interface. Collect it and send it to
user space via cqe->res64.
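
For illustration, a minimal submission-side sketch of how an append
write would be issued under this scheme, assuming liburing and, per
patch 1/6, that an RWF_APPEND write on a file with FMODE_ZONE_APPEND
takes the zone-append path; the ZA_TAG user_data convention is a
hypothetical application choice, not part of the series:

#include <errno.h>
#include <linux/fs.h>
#include <liburing.h>

/* App-chosen user_data bit marking zone-append SQEs (hypothetical). */
#define ZA_TAG (1ULL << 63)

static int submit_zone_append(struct io_uring *ring, int fd,
			      const void *buf, unsigned len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EAGAIN;
	io_uring_prep_write(sqe, fd, buf, len, 0);
	sqe->rw_flags = RWF_APPEND;	/* zone-append trigger per patch 1/6 */
	sqe->user_data = ZA_TAG;	/* lets the app recognize this CQE */
	return io_uring_submit(ring);
}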

Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
---
 fs/io_uring.c                 | 49 ++++++++++++++++++++++++++++++++++++-------
 include/uapi/linux/io_uring.h |  9 ++++++--
 2 files changed, 48 insertions(+), 10 deletions(-)

Comments

Jens Axboe July 24, 2020, 4:29 p.m. UTC | #1
On 7/24/20 9:49 AM, Kanchan Joshi wrote:
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 7809ab2..6510cf5 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>  	cqe = io_get_cqring(ctx);
>  	if (likely(cqe)) {
>  		WRITE_ONCE(cqe->user_data, req->user_data);
> -		WRITE_ONCE(cqe->res, res);
> -		WRITE_ONCE(cqe->flags, cflags);
> +		if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
> +			if (likely(res > 0))
> +				WRITE_ONCE(cqe->res64, req->rw.append_offset);
> +			else
> +				WRITE_ONCE(cqe->res64, res);
> +		} else {
> +			WRITE_ONCE(cqe->res, res);
> +			WRITE_ONCE(cqe->flags, cflags);
> +		}

This would be nice to keep out of the fast path, if possible.

> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 92c2269..2580d93 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -156,8 +156,13 @@ enum {
>   */
>  struct io_uring_cqe {
>  	__u64	user_data;	/* sqe->data submission passed back */
> -	__s32	res;		/* result code for this event */
> -	__u32	flags;
> +	union {
> +		struct {
> +			__s32	res;	/* result code for this event */
> +			__u32	flags;
> +		};
> +		__s64	res64;	/* appending offset for zone append */
> +	};
>  };

Is this a compatible change, both for now but also going forward? You
could randomly have IORING_CQE_F_BUFFER set, or any other future flags.
Layout would also be different between big and little endian, so not
even that easy to set aside a flag for this. But even if that was done,
we'd still have this weird API where liburing or the app would need to
distinguish this cqe from all others based on... the user_data? Hence
liburing can't do it, only the app would be able to.

Just seems like a hack to me.
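
To make that concrete, here is a minimal completion-side sketch under
the patched uapi header, reusing the hypothetical ZA_TAG user_data
convention from the sketch in the commit message; this dispatch on
user_data is exactly the per-application contract being objected to:

#include <stdio.h>
#include <liburing.h>

#define ZA_TAG (1ULL << 63)	/* same hypothetical tag as above */

static int reap_one(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;
	int ret = io_uring_wait_cqe(ring, &cqe);

	if (ret)
		return ret;
	if (cqe->user_data & ZA_TAG) {
		/* zone append: res64 is the written offset, or a
		 * negative error; there is no cqe->flags to inspect */
		if (cqe->res64 >= 0)
			printf("appended at %lld\n", (long long)cqe->res64);
		else
			ret = (int)cqe->res64;
	} else {
		ret = cqe->res;	/* ordinary completion: res + flags */
	}
	io_uring_cqe_seen(ring, cqe);
	return ret;
}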
Kanchan Joshi July 27, 2020, 7:16 p.m. UTC | #2
On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 7/24/20 9:49 AM, Kanchan Joshi wrote:
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index 7809ab2..6510cf5 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
> >       cqe = io_get_cqring(ctx);
> >       if (likely(cqe)) {
> >               WRITE_ONCE(cqe->user_data, req->user_data);
> > -             WRITE_ONCE(cqe->res, res);
> > -             WRITE_ONCE(cqe->flags, cflags);
> > +             if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
> > +                     if (likely(res > 0))
> > +                             WRITE_ONCE(cqe->res64, req->rw.append_offset);
> > +                     else
> > +                             WRITE_ONCE(cqe->res64, res);
> > +             } else {
> > +                     WRITE_ONCE(cqe->res, res);
> > +                     WRITE_ONCE(cqe->flags, cflags);
> > +             }
>
> This would be nice to keep out of the fast path, if possible.

I was thinking of keeping a function-pointer (in io_kiocb) during
submission. That would have avoided this check......but argument count
differs, so it did not add up.

> > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> > index 92c2269..2580d93 100644
> > --- a/include/uapi/linux/io_uring.h
> > +++ b/include/uapi/linux/io_uring.h
> > @@ -156,8 +156,13 @@ enum {
> >   */
> >  struct io_uring_cqe {
> >       __u64   user_data;      /* sqe->data submission passed back */
> > -     __s32   res;            /* result code for this event */
> > -     __u32   flags;
> > +     union {
> > +             struct {
> > +                     __s32   res;    /* result code for this event */
> > +                     __u32   flags;
> > +             };
> > +             __s64   res64;  /* appending offset for zone append */
> > +     };
> >  };
>
> Is this a compatible change, both for now but also going forward? You
> could randomly have IORING_CQE_F_BUFFER set, or any other future flags.

Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not
used/set for write currently, so it looked compatible at this point.
Yes, no room for future flags for this operation.
Do you see any other way to enable this support in io-uring?

> Layout would also be different between big and little endian, so not
> even that easy to set aside a flag for this. But even if that was done,
> we'd still have this weird API where liburing or the app would need to
> distinguish this cqe from all others based on... the user_data? Hence
> liburing can't do it, only the app would be able to.
>
> Just seems like a hack to me.

Yes, only user_data to distinguish. Do liburing helpers need to look
at cqe->res (and decide something) before returning the cqe to the
application?
I see that happening in one place, but I am not sure when it would hit
the LIBURING_UDATA_TIMEOUT condition:
static int __io_uring_peek_cqe(struct io_uring *ring,
			       struct io_uring_cqe **cqe_ptr)
{
	struct io_uring_cqe *cqe;
	unsigned head;
	int err = 0;

	do {
		io_uring_for_each_cqe(ring, head, cqe)
			break;
		if (cqe) {
			if (cqe->user_data == LIBURING_UDATA_TIMEOUT) {
				if (cqe->res < 0)
					err = cqe->res;
				io_uring_cq_advance(ring, 1);
				if (!err)
					continue;
				cqe = NULL;
			}
		}
		break;
	} while (1);

	*cqe_ptr = cqe;
	return err;
}
Jens Axboe July 27, 2020, 8:34 p.m. UTC | #3
On 7/27/20 1:16 PM, Kanchan Joshi wrote:
> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> On 7/24/20 9:49 AM, Kanchan Joshi wrote:
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index 7809ab2..6510cf5 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>>       cqe = io_get_cqring(ctx);
>>>       if (likely(cqe)) {
>>>               WRITE_ONCE(cqe->user_data, req->user_data);
>>> -             WRITE_ONCE(cqe->res, res);
>>> -             WRITE_ONCE(cqe->flags, cflags);
>>> +             if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
>>> +                     if (likely(res > 0))
>>> +                             WRITE_ONCE(cqe->res64, req->rw.append_offset);
>>> +                     else
>>> +                             WRITE_ONCE(cqe->res64, res);
>>> +             } else {
>>> +                     WRITE_ONCE(cqe->res, res);
>>> +                     WRITE_ONCE(cqe->flags, cflags);
>>> +             }
>>
>> This would be nice to keep out of the fast path, if possible.
> 
> I was thinking of keeping a function-pointer (in io_kiocb) during
> submission. That would have avoided this check......but argument count
> differs, so it did not add up.

But that'd grow the io_kiocb just for this use case, which is arguably
even worse. Unless you can keep it in the per-request private data,
but there's no more room there for the regular read/write side.

>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>> index 92c2269..2580d93 100644
>>> --- a/include/uapi/linux/io_uring.h
>>> +++ b/include/uapi/linux/io_uring.h
>>> @@ -156,8 +156,13 @@ enum {
>>>   */
>>>  struct io_uring_cqe {
>>>       __u64   user_data;      /* sqe->data submission passed back */
>>> -     __s32   res;            /* result code for this event */
>>> -     __u32   flags;
>>> +     union {
>>> +             struct {
>>> +                     __s32   res;    /* result code for this event */
>>> +                     __u32   flags;
>>> +             };
>>> +             __s64   res64;  /* appending offset for zone append */
>>> +     };
>>>  };
>>
>> Is this a compatible change, both for now but also going forward? You
>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags.
> 
> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not
> used/set for write currently, so it looked compatible at this point.

Not worried about that, since we won't ever use that for writes. But it
is a potential headache down the line for other flags, if they apply to
normal writes.

> Yes, no room for future flags for this operation.
> Do you see any other way to enable this support in io-uring?

Honestly I think the only viable option is as we discussed previously,
pass in a pointer to a 64-bit type where we can copy the additional
completion information to.
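
Concretely, the pointer-based scheme might look as follows on the
submission side. This is a sketch only: which SQE field would carry
the user pointer was not settled, and sqe->addr2 (whose storage is the
file offset, unused for an append) is borrowed purely for
illustration:

#include <errno.h>
#include <stdint.h>
#include <linux/fs.h>
#include <liburing.h>

static int submit_zone_append_indirect(struct io_uring *ring, int fd,
				       const void *buf, unsigned len,
				       __u64 *offset_out)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EAGAIN;
	io_uring_prep_write(sqe, fd, buf, len, 0);
	sqe->rw_flags = RWF_APPEND;
	/* the kernel would copy the written offset to *offset_out
	 * before posting the CQE */
	sqe->addr2 = (__u64)(uintptr_t)offset_out;
	return io_uring_submit(ring);
}

With this, cqe->res keeps its usual bytes-written meaning, cqe->flags
stays available, and the application reads *offset_out for the written
location.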

>> Layout would also be different between big and little endian, so not
>> even that easy to set aside a flag for this. But even if that was done,
>> we'd still have this weird API where liburing or the app would need to
>> distinguish this cqe from all others based on... the user_data? Hence
>> liburing can't do it, only the app would be able to.
>>
>> Just seems like a hack to me.
> 
> Yes, only user_data to distinguish. Do liburing helpers need to look
> at cqe->res (and decide something) before returning the cqe to
> application?

They generally don't, outside of the internal timeout. But it's an issue
for the API, as it forces applications to handle the CQEs a certain way.
Normally there's flexibility. This makes the append writes behave
differently than everything else, which is never a good idea.
Pavel Begunkov July 30, 2020, 3:57 p.m. UTC | #4
On 24/07/2020 18:49, Kanchan Joshi wrote:
> From: SelvaKumar S <selvakuma.s1@samsung.com>
> 
> Repurpose [cqe->res, cqe->flags] into a signed cqe->res64 to report the
> 64-bit written offset for zone append. An appending write that requires
> the written location to be reported (conveyed by the IOCB_ZONE_APPEND
> flag) is guaranteed not to be a short write; this avoids the need to
> report the number of bytes copied.
> The append offset is returned by the lower layer to io_uring via the
> ret2 argument of the ki_complete interface. Collect it and send it to
> user space via cqe->res64.
> 
> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
> ---
>  fs/io_uring.c                 | 49 ++++++++++++++++++++++++++++++++++++-------
>  include/uapi/linux/io_uring.h |  9 ++++++--
>  2 files changed, 48 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 7809ab2..6510cf5 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
...
> @@ -1244,8 +1254,15 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
>  		req->flags &= ~REQ_F_OVERFLOW;
>  		if (cqe) {
>  			WRITE_ONCE(cqe->user_data, req->user_data);
> -			WRITE_ONCE(cqe->res, req->result);
> -			WRITE_ONCE(cqe->flags, req->cflags);
> +			if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
> +				if (likely(req->result > 0))
> +					WRITE_ONCE(cqe->res64, req->rw.append_offset);
> +				else
> +					WRITE_ONCE(cqe->res64, req->result);
> +			} else {
> +				WRITE_ONCE(cqe->res, req->result);
> +				WRITE_ONCE(cqe->flags, req->cflags);
> +			}
>  		} else {
>  			WRITE_ONCE(ctx->rings->cq_overflow,
>  				atomic_inc_return(&ctx->cached_cq_overflow));
> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>  	cqe = io_get_cqring(ctx);
>  	if (likely(cqe)) {
>  		WRITE_ONCE(cqe->user_data, req->user_data);
> -		WRITE_ONCE(cqe->res, res);
> -		WRITE_ONCE(cqe->flags, cflags);
> +		if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
> +			if (likely(res > 0))
> +				WRITE_ONCE(cqe->res64, req->rw.append_offset);

1. as I mentioned before, it's not nice to ignore @cflags
2. that's not the right place for opcode specific handling
3. it doesn't work with overflowed reqs, see the final else below

For this scheme, I'd pass @append_offset as an argument. That should
also remove this extra if from the fast path, which Jens mentioned (a
rough sketch follows the quoted hunk below).

> +			else
> +				WRITE_ONCE(cqe->res64, res);
> +		} else {
> +			WRITE_ONCE(cqe->res, res);
> +			WRITE_ONCE(cqe->flags, cflags);
> +		}
>  	} else if (ctx->cq_overflow_flushed) {
>  		WRITE_ONCE(ctx->rings->cq_overflow,
>  				atomic_inc_return(&ctx->cached_cq_overflow));
> @@ -1943,7 +1967,7 @@ static inline void req_set_fail_links(struct io_kiocb *req)
>  		req->flags |= REQ_F_FAIL_LINK;
>  }
>
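
A rough sketch of the argument-passing variant suggested above, with
illustrative signatures (this is not merged code); the idea is that
zone-append completions call a separate 64-bit fill helper, so the
common path keeps its res/flags logic untouched:

static void __io_cqring_fill_event64(struct io_kiocb *req, s64 res64)
{
	struct io_uring_cqe *cqe = io_get_cqring(req->ctx);

	if (likely(cqe)) {
		WRITE_ONCE(cqe->user_data, req->user_data);
		WRITE_ONCE(cqe->res64, res64);
		return;
	}
	/* an overflow path would still have to stash the 64-bit
	 * value somewhere, per point 3 above */
}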
Pavel Begunkov July 30, 2020, 4:08 p.m. UTC | #5
On 27/07/2020 23:34, Jens Axboe wrote:
> On 7/27/20 1:16 PM, Kanchan Joshi wrote:
>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>
>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote:
>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>> index 7809ab2..6510cf5 100644
>>>> --- a/fs/io_uring.c
>>>> +++ b/fs/io_uring.c
>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>>>       cqe = io_get_cqring(ctx);
>>>>       if (likely(cqe)) {
>>>>               WRITE_ONCE(cqe->user_data, req->user_data);
>>>> -             WRITE_ONCE(cqe->res, res);
>>>> -             WRITE_ONCE(cqe->flags, cflags);
>>>> +             if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
>>>> +                     if (likely(res > 0))
>>>> +                             WRITE_ONCE(cqe->res64, req->rw.append_offset);
>>>> +                     else
>>>> +                             WRITE_ONCE(cqe->res64, res);
>>>> +             } else {
>>>> +                     WRITE_ONCE(cqe->res, res);
>>>> +                     WRITE_ONCE(cqe->flags, cflags);
>>>> +             }
>>>
>>> This would be nice to keep out of the fast path, if possible.
>>
>> I was thinking of keeping a function-pointer (in io_kiocb) during
>> submission. That would have avoided this check......but argument count
>> differs, so it did not add up.
> 
> But that'd grow the io_kiocb just for this use case, which is arguably
> even worse. Unless you can keep it in the per-request private data,
> but there's no more room there for the regular read/write side.
> 
>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>> index 92c2269..2580d93 100644
>>>> --- a/include/uapi/linux/io_uring.h
>>>> +++ b/include/uapi/linux/io_uring.h
>>>> @@ -156,8 +156,13 @@ enum {
>>>>   */
>>>>  struct io_uring_cqe {
>>>>       __u64   user_data;      /* sqe->data submission passed back */
>>>> -     __s32   res;            /* result code for this event */
>>>> -     __u32   flags;
>>>> +     union {
>>>> +             struct {
>>>> +                     __s32   res;    /* result code for this event */
>>>> +                     __u32   flags;
>>>> +             };
>>>> +             __s64   res64;  /* appending offset for zone append */
>>>> +     };
>>>>  };
>>>
>>> Is this a compatible change, both for now but also going forward? You
>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags.
>>
>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not
>> used/set for write currently, so it looked compatible at this point.
> 
> Not worried about that, since we won't ever use that for writes. But it
> is a potential headache down the line for other flags, if they apply to
> normal writes.
> 
>> Yes, no room for future flags for this operation.
>> Do you see any other way to enable this support in io-uring?
> 
> Honestly I think the only viable option is as we discussed previously,
> pass in a pointer to a 64-bit type where we can copy the additional
> completion information to.

TBH, I hate the idea of such overhead/latency at times when SSDs can
serve writes in less than 10ms. Any chance you measured how long does it
take to drag through task_work?

> 
>>> Layout would also be different between big and little endian, so not
>>> even that easy to set aside a flag for this. But even if that was done,
>>> we'd still have this weird API where liburing or the app would need to
>>> distinguish this cqe from all others based on... the user_data? Hence
>>> liburing can't do it, only the app would be able to.
>>>
>>> Just seems like a hack to me.
>>
>> Yes, only user_data to distinguish. Do liburing helpers need to look
>> at cqe->res (and decide something) before returning the cqe to
>> application?
> 
> They generally don't, outside of the internal timeout. But it's an issue
> for the API, as it forces applications to handle the CQEs a certain way.
> Normally there's flexibility. This makes the append writes behave
> differently than everything else, which is never a good idea.
>
Jens Axboe July 30, 2020, 4:13 p.m. UTC | #6
On 7/30/20 10:08 AM, Pavel Begunkov wrote:
> On 27/07/2020 23:34, Jens Axboe wrote:
>> On 7/27/20 1:16 PM, Kanchan Joshi wrote:
>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>
>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote:
>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>> index 7809ab2..6510cf5 100644
>>>>> --- a/fs/io_uring.c
>>>>> +++ b/fs/io_uring.c
>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>>>>       cqe = io_get_cqring(ctx);
>>>>>       if (likely(cqe)) {
>>>>>               WRITE_ONCE(cqe->user_data, req->user_data);
>>>>> -             WRITE_ONCE(cqe->res, res);
>>>>> -             WRITE_ONCE(cqe->flags, cflags);
>>>>> +             if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
>>>>> +                     if (likely(res > 0))
>>>>> +                             WRITE_ONCE(cqe->res64, req->rw.append_offset);
>>>>> +                     else
>>>>> +                             WRITE_ONCE(cqe->res64, res);
>>>>> +             } else {
>>>>> +                     WRITE_ONCE(cqe->res, res);
>>>>> +                     WRITE_ONCE(cqe->flags, cflags);
>>>>> +             }
>>>>
>>>> This would be nice to keep out of the fast path, if possible.
>>>
>>> I was thinking of keeping a function-pointer (in io_kiocb) during
>>> submission. That would have avoided this check......but argument count
>>> differs, so it did not add up.
>>
>> But that'd grow the io_kiocb just for this use case, which is arguably
>> even worse. Unless you can keep it in the per-request private data,
>> but there's no more room there for the regular read/write side.
>>
>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>> index 92c2269..2580d93 100644
>>>>> --- a/include/uapi/linux/io_uring.h
>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>> @@ -156,8 +156,13 @@ enum {
>>>>>   */
>>>>>  struct io_uring_cqe {
>>>>>       __u64   user_data;      /* sqe->data submission passed back */
>>>>> -     __s32   res;            /* result code for this event */
>>>>> -     __u32   flags;
>>>>> +     union {
>>>>> +             struct {
>>>>> +                     __s32   res;    /* result code for this event */
>>>>> +                     __u32   flags;
>>>>> +             };
>>>>> +             __s64   res64;  /* appending offset for zone append */
>>>>> +     };
>>>>>  };
>>>>
>>>> Is this a compatible change, both for now but also going forward? You
>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags.
>>>
>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not
>>> used/set for write currently, so it looked compatible at this point.
>>
>> Not worried about that, since we won't ever use that for writes. But it
>> is a potential headache down the line for other flags, if they apply to
>> normal writes.
>>
>>> Yes, no room for future flags for this operation.
>>> Do you see any other way to enable this support in io-uring?
>>
>> Honestly I think the only viable option is as we discussed previously,
>> pass in a pointer to a 64-bit type where we can copy the additional
>> completion information to.
> 
> TBH, I hate the idea of such overhead/latency at times when SSDs can
> serve writes in less than 10ms. Any chance you measured how long does it

10us? :-)

> take to drag through task_work?

A 64-bit value copy is really not a lot of overhead... But yes, we'd
need to push the completion through task_work at that point, as we can't
do it from the completion side. That's not a lot of overhead, and most
notably, it's overhead that only affects this particular type.

That's not a bad starting point, and something that can always be
optimized later if need be. But I seriously doubt it'd be anything to
worry about.
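
For context, a hypothetical kernel-side sketch of pushing the
completion through task_work; the user-pointer field name is invented
here, and this is not the code that was eventually merged:

/* Runs in task context, where copy_to_user() is safe: publish the
 * 64-bit append offset first, then post the CQE, so user space can
 * never observe the completion before the offset is visible. */
static void io_zone_append_task_work(struct callback_head *cb)
{
	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);

	if (copy_to_user(req->rw.offset_uptr,	/* hypothetical field */
			 &req->rw.append_offset, sizeof(u64)))
		req->result = -EFAULT;
	io_cqring_add_event(req, req->result);
	io_put_req(req);
}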
Pavel Begunkov July 30, 2020, 4:26 p.m. UTC | #7
On 30/07/2020 19:13, Jens Axboe wrote:
> On 7/30/20 10:08 AM, Pavel Begunkov wrote:
>> On 27/07/2020 23:34, Jens Axboe wrote:
>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote:
>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>
>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote:
>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>> index 7809ab2..6510cf5 100644
>>>>>> --- a/fs/io_uring.c
>>>>>> +++ b/fs/io_uring.c
>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>>>>>       cqe = io_get_cqring(ctx);
>>>>>>       if (likely(cqe)) {
>>>>>>               WRITE_ONCE(cqe->user_data, req->user_data);
>>>>>> -             WRITE_ONCE(cqe->res, res);
>>>>>> -             WRITE_ONCE(cqe->flags, cflags);
>>>>>> +             if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
>>>>>> +                     if (likely(res > 0))
>>>>>> +                             WRITE_ONCE(cqe->res64, req->rw.append_offset);
>>>>>> +                     else
>>>>>> +                             WRITE_ONCE(cqe->res64, res);
>>>>>> +             } else {
>>>>>> +                     WRITE_ONCE(cqe->res, res);
>>>>>> +                     WRITE_ONCE(cqe->flags, cflags);
>>>>>> +             }
>>>>>
>>>>> This would be nice to keep out of the fast path, if possible.
>>>>
>>>> I was thinking of keeping a function-pointer (in io_kiocb) during
>>>> submission. That would have avoided this check......but argument count
>>>> differs, so it did not add up.
>>>
>>> But that'd grow the io_kiocb just for this use case, which is arguably
>>> even worse. Unless you can keep it in the per-request private data,
>>> but there's no more room there for the regular read/write side.
>>>
>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>>> index 92c2269..2580d93 100644
>>>>>> --- a/include/uapi/linux/io_uring.h
>>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>>> @@ -156,8 +156,13 @@ enum {
>>>>>>   */
>>>>>>  struct io_uring_cqe {
>>>>>>       __u64   user_data;      /* sqe->data submission passed back */
>>>>>> -     __s32   res;            /* result code for this event */
>>>>>> -     __u32   flags;
>>>>>> +     union {
>>>>>> +             struct {
>>>>>> +                     __s32   res;    /* result code for this event */
>>>>>> +                     __u32   flags;
>>>>>> +             };
>>>>>> +             __s64   res64;  /* appending offset for zone append */
>>>>>> +     };
>>>>>>  };
>>>>>
>>>>> Is this a compatible change, both for now but also going forward? You
>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags.
>>>>
>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not
>>>> used/set for write currently, so it looked compatible at this point.
>>>
>>> Not worried about that, since we won't ever use that for writes. But it
>>> is a potential headache down the line for other flags, if they apply to
>>> normal writes.
>>>
>>>> Yes, no room for future flags for this operation.
>>>> Do you see any other way to enable this support in io-uring?
>>>
>>> Honestly I think the only viable option is as we discussed previously,
>>> pass in a pointer to a 64-bit type where we can copy the additional
>>> completion information to.
>>
>> TBH, I hate the idea of such overhead/latency at times when SSDs can
>> serve writes in less than 10ms. Any chance you measured how long does it
> 
> 10us? :-)

Hah, 10us indeed :)

> 
>> take to drag through task_work?
> 
> A 64-bit value copy is really not a lot of overhead... But yes, we'd
> need to push the completion through task_work at that point, as we can't
> do it from the completion side. That's not a lot of overhead, and most
> notably, it's overhead that only affects this particular type.
> 
> That's not a bad starting point, and something that can always be
> optimized later if need be. But I seriously doubt it'd be anything to
> worry about.

I probably need to look myself how it's really scheduled, but if you don't
mind, here is a quick question: if we do work_add(task) when the task is
running in the userspace, wouldn't the work execution wait until the next
syscall/allotted time ends up?
Jens Axboe July 30, 2020, 5:16 p.m. UTC | #8
On 7/30/20 10:26 AM, Pavel Begunkov wrote:
> On 30/07/2020 19:13, Jens Axboe wrote:
>> On 7/30/20 10:08 AM, Pavel Begunkov wrote:
>>> On 27/07/2020 23:34, Jens Axboe wrote:
>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote:
>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>
>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote:
>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>> index 7809ab2..6510cf5 100644
>>>>>>> --- a/fs/io_uring.c
>>>>>>> +++ b/fs/io_uring.c
>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>>>>>>       cqe = io_get_cqring(ctx);
>>>>>>>       if (likely(cqe)) {
>>>>>>>               WRITE_ONCE(cqe->user_data, req->user_data);
>>>>>>> -             WRITE_ONCE(cqe->res, res);
>>>>>>> -             WRITE_ONCE(cqe->flags, cflags);
>>>>>>> +             if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
>>>>>>> +                     if (likely(res > 0))
>>>>>>> +                             WRITE_ONCE(cqe->res64, req->rw.append_offset);
>>>>>>> +                     else
>>>>>>> +                             WRITE_ONCE(cqe->res64, res);
>>>>>>> +             } else {
>>>>>>> +                     WRITE_ONCE(cqe->res, res);
>>>>>>> +                     WRITE_ONCE(cqe->flags, cflags);
>>>>>>> +             }
>>>>>>
>>>>>> This would be nice to keep out of the fast path, if possible.
>>>>>
>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during
>>>>> submission. That would have avoided this check......but argument count
>>>>> differs, so it did not add up.
>>>>
>>>> But that'd grow the io_kiocb just for this use case, which is arguably
>>>> even worse. Unless you can keep it in the per-request private data,
>>>> but there's no more room there for the regular read/write side.
>>>>
>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>>>> index 92c2269..2580d93 100644
>>>>>>> --- a/include/uapi/linux/io_uring.h
>>>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>>>> @@ -156,8 +156,13 @@ enum {
>>>>>>>   */
>>>>>>>  struct io_uring_cqe {
>>>>>>>       __u64   user_data;      /* sqe->data submission passed back */
>>>>>>> -     __s32   res;            /* result code for this event */
>>>>>>> -     __u32   flags;
>>>>>>> +     union {
>>>>>>> +             struct {
>>>>>>> +                     __s32   res;    /* result code for this event */
>>>>>>> +                     __u32   flags;
>>>>>>> +             };
>>>>>>> +             __s64   res64;  /* appending offset for zone append */
>>>>>>> +     };
>>>>>>>  };
>>>>>>
>>>>>> Is this a compatible change, both for now but also going forward? You
>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags.
>>>>>
>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not
>>>>> used/set for write currently, so it looked compatible at this point.
>>>>
>>>> Not worried about that, since we won't ever use that for writes. But it
>>>> is a potential headache down the line for other flags, if they apply to
>>>> normal writes.
>>>>
>>>>> Yes, no room for future flags for this operation.
>>>>> Do you see any other way to enable this support in io-uring?
>>>>
>>>> Honestly I think the only viable option is as we discussed previously,
>>>> pass in a pointer to a 64-bit type where we can copy the additional
>>>> completion information to.
>>>
>>> TBH, I hate the idea of such overhead/latency at times when SSDs can
>>> serve writes in less than 10ms. Any chance you measured how long does it
>>
>> 10us? :-)
> 
> Hah, 10us indeed :)
> 
>>
>>> take to drag through task_work?
>>
>> A 64-bit value copy is really not a lot of overhead... But yes, we'd
>> need to push the completion through task_work at that point, as we can't
>> do it from the completion side. That's not a lot of overhead, and most
>> notably, it's overhead that only affects this particular type.
>>
>> That's not a bad starting point, and something that can always be
>> optimized later if need be. But I seriously doubt it'd be anything to
>> worry about.
> 
> I probably need to look myself how it's really scheduled, but if you don't
> mind, here is a quick question: if we do work_add(task) when the task is
> running in the userspace, wouldn't the work execution wait until the next
> syscall/allotted time ends up?

It'll get the task to enter the kernel, just like signal delivery. The only
tricky part is really if we have a dependency waiting in the kernel, like
the recent eventfd fix.
Pavel Begunkov July 30, 2020, 5:38 p.m. UTC | #9
On 30/07/2020 20:16, Jens Axboe wrote:
> On 7/30/20 10:26 AM, Pavel Begunkov wrote:
>> On 30/07/2020 19:13, Jens Axboe wrote:
>>> On 7/30/20 10:08 AM, Pavel Begunkov wrote:
>>>> On 27/07/2020 23:34, Jens Axboe wrote:
>>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote:
>>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>
>>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote:
>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>>> index 7809ab2..6510cf5 100644
>>>>>>>> --- a/fs/io_uring.c
>>>>>>>> +++ b/fs/io_uring.c
>>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>>>>>>>       cqe = io_get_cqring(ctx);
>>>>>>>>       if (likely(cqe)) {
>>>>>>>>               WRITE_ONCE(cqe->user_data, req->user_data);
>>>>>>>> -             WRITE_ONCE(cqe->res, res);
>>>>>>>> -             WRITE_ONCE(cqe->flags, cflags);
>>>>>>>> +             if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
>>>>>>>> +                     if (likely(res > 0))
>>>>>>>> +                             WRITE_ONCE(cqe->res64, req->rw.append_offset);
>>>>>>>> +                     else
>>>>>>>> +                             WRITE_ONCE(cqe->res64, res);
>>>>>>>> +             } else {
>>>>>>>> +                     WRITE_ONCE(cqe->res, res);
>>>>>>>> +                     WRITE_ONCE(cqe->flags, cflags);
>>>>>>>> +             }
>>>>>>>
>>>>>>> This would be nice to keep out of the fast path, if possible.
>>>>>>
>>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during
>>>>>> submission. That would have avoided this check......but argument count
>>>>>> differs, so it did not add up.
>>>>>
>>>>> But that'd grow the io_kiocb just for this use case, which is arguably
>>>>> even worse. Unless you can keep it in the per-request private data,
>>>>> but there's no more room there for the regular read/write side.
>>>>>
>>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>>>>> index 92c2269..2580d93 100644
>>>>>>>> --- a/include/uapi/linux/io_uring.h
>>>>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>>>>> @@ -156,8 +156,13 @@ enum {
>>>>>>>>   */
>>>>>>>>  struct io_uring_cqe {
>>>>>>>>       __u64   user_data;      /* sqe->data submission passed back */
>>>>>>>> -     __s32   res;            /* result code for this event */
>>>>>>>> -     __u32   flags;
>>>>>>>> +     union {
>>>>>>>> +             struct {
>>>>>>>> +                     __s32   res;    /* result code for this event */
>>>>>>>> +                     __u32   flags;
>>>>>>>> +             };
>>>>>>>> +             __s64   res64;  /* appending offset for zone append */
>>>>>>>> +     };
>>>>>>>>  };
>>>>>>>
>>>>>>> Is this a compatible change, both for now but also going forward? You
>>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags.
>>>>>>
>>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not
>>>>>> used/set for write currently, so it looked compatible at this point.
>>>>>
>>>>> Not worried about that, since we won't ever use that for writes. But it
>>>>> is a potential headache down the line for other flags, if they apply to
>>>>> normal writes.
>>>>>
>>>>>> Yes, no room for future flags for this operation.
>>>>>> Do you see any other way to enable this support in io-uring?
>>>>>
>>>>> Honestly I think the only viable option is as we discussed previously,
>>>>> pass in a pointer to a 64-bit type where we can copy the additional
>>>>> completion information to.
>>>>
>>>> TBH, I hate the idea of such overhead/latency at times when SSDs can
>>>> serve writes in less than 10ms. Any chance you measured how long does it
>>>
>>> 10us? :-)
>>
>> Hah, 10us indeed :)
>>
>>>
>>>> take to drag through task_work?
>>>
>>> A 64-bit value copy is really not a lot of overhead... But yes, we'd
>>> need to push the completion through task_work at that point, as we can't
>>> do it from the completion side. That's not a lot of overhead, and most
>>> notably, it's overhead that only affects this particular type.
>>>
>>> That's not a bad starting point, and something that can always be
>>> optimized later if need be. But I seriously doubt it'd be anything to
>>> worry about.
>>
>> I probably need to look myself how it's really scheduled, but if you don't
>> mind, here is a quick question: if we do work_add(task) when the task is
>> running in the userspace, wouldn't the work execution wait until the next
>> syscall/allotted time ends up?
> 
> It'll get the task to enter the kernel, just like signal delivery. The only
> tricky part is really if we have a dependency waiting in the kernel, like
> the recent eventfd fix.

I see, thanks for sorting this out!
Kanchan Joshi July 30, 2020, 5:51 p.m. UTC | #10
On Thu, Jul 30, 2020 at 11:10 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 30/07/2020 20:16, Jens Axboe wrote:
> > On 7/30/20 10:26 AM, Pavel Begunkov wrote:
> >> On 30/07/2020 19:13, Jens Axboe wrote:
> >>> On 7/30/20 10:08 AM, Pavel Begunkov wrote:
> >>>> On 27/07/2020 23:34, Jens Axboe wrote:
> >>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote:
> >>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>>>>>>
> >>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote:
> >>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
> >>>>>>>> index 7809ab2..6510cf5 100644
> >>>>>>>> --- a/fs/io_uring.c
> >>>>>>>> +++ b/fs/io_uring.c
> >>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
> >>>>>>>>       cqe = io_get_cqring(ctx);
> >>>>>>>>       if (likely(cqe)) {
> >>>>>>>>               WRITE_ONCE(cqe->user_data, req->user_data);
> >>>>>>>> -             WRITE_ONCE(cqe->res, res);
> >>>>>>>> -             WRITE_ONCE(cqe->flags, cflags);
> >>>>>>>> +             if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
> >>>>>>>> +                     if (likely(res > 0))
> >>>>>>>> +                             WRITE_ONCE(cqe->res64, req->rw.append_offset);
> >>>>>>>> +                     else
> >>>>>>>> +                             WRITE_ONCE(cqe->res64, res);
> >>>>>>>> +             } else {
> >>>>>>>> +                     WRITE_ONCE(cqe->res, res);
> >>>>>>>> +                     WRITE_ONCE(cqe->flags, cflags);
> >>>>>>>> +             }
> >>>>>>>
> >>>>>>> This would be nice to keep out of the fast path, if possible.
> >>>>>>
> >>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during
> >>>>>> submission. That would have avoided this check......but argument count
> >>>>>> differs, so it did not add up.
> >>>>>
> >>>>> But that'd grow the io_kiocb just for this use case, which is arguably
> >>>>> even worse. Unless you can keep it in the per-request private data,
> >>>>> but there's no more room there for the regular read/write side.
> >>>>>
> >>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> >>>>>>>> index 92c2269..2580d93 100644
> >>>>>>>> --- a/include/uapi/linux/io_uring.h
> >>>>>>>> +++ b/include/uapi/linux/io_uring.h
> >>>>>>>> @@ -156,8 +156,13 @@ enum {
> >>>>>>>>   */
> >>>>>>>>  struct io_uring_cqe {
> >>>>>>>>       __u64   user_data;      /* sqe->data submission passed back */
> >>>>>>>> -     __s32   res;            /* result code for this event */
> >>>>>>>> -     __u32   flags;
> >>>>>>>> +     union {
> >>>>>>>> +             struct {
> >>>>>>>> +                     __s32   res;    /* result code for this event */
> >>>>>>>> +                     __u32   flags;
> >>>>>>>> +             };
> >>>>>>>> +             __s64   res64;  /* appending offset for zone append */
> >>>>>>>> +     };
> >>>>>>>>  };
> >>>>>>>
> >>>>>>> Is this a compatible change, both for now but also going forward? You
> >>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags.
> >>>>>>
> >>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not
> >>>>>> used/set for write currently, so it looked compatible at this point.
> >>>>>
> >>>>> Not worried about that, since we won't ever use that for writes. But it
> >>>>> is a potential headache down the line for other flags, if they apply to
> >>>>> normal writes.
> >>>>>
> >>>>>> Yes, no room for future flags for this operation.
> >>>>>> Do you see any other way to enable this support in io-uring?
> >>>>>
> >>>>> Honestly I think the only viable option is as we discussed previously,
> >>>>> pass in a pointer to a 64-bit type where we can copy the additional
> >>>>> completion information to.
> >>>>
> >>>> TBH, I hate the idea of such overhead/latency at times when SSDs can
> >>>> serve writes in less than 10ms. Any chance you measured how long does it
> >>>
> >>> 10us? :-)
> >>
> >> Hah, 10us indeed :)
> >>
> >>>
> >>>> take to drag through task_work?
> >>>
> >>> A 64-bit value copy is really not a lot of overhead... But yes, we'd
> >>> need to push the completion through task_work at that point, as we can't
> >>> do it from the completion side. That's not a lot of overhead, and most
> >>> notably, it's overhead that only affects this particular type.
> >>>
> >>> That's not a bad starting point, and something that can always be
> >>> optimized later if need be. But I seriously doubt it'd be anything to
> >>> worry about.
> >>
> >> I probably need to look myself how it's really scheduled, but if you don't
> >> mind, here is a quick question: if we do work_add(task) when the task is
> >> running in the userspace, wouldn't the work execution wait until the next
> >> syscall/allotted time ends up?
> >
> > It'll get the task to enter the kernel, just like signal delivery. The only
> > tricky part is really if we have a dependency waiting in the kernel, like
> > the recent eventfd fix.
>
> I see, thanks for sorting this out!

Few more doubts about this (please mark me wrong if that is the case):

- Task-work makes me feel like N completions waiting to be served by
single task.
Currently completions keep arriving and CQEs would be updated with
result, but the user-space (submitter task) would not be poked.

- Completion-code will set the task-work. But post that it cannot go
immediately to its regular business of picking cqe and updating
res/flags, as we cannot afford user-space to see the cqe before the
pointer update. So it seems completion-code needs to spawn another
work which will allocate/update cqe after waiting for pointer-update
from task-work?
Jens Axboe July 30, 2020, 5:54 p.m. UTC | #11
On 7/30/20 11:51 AM, Kanchan Joshi wrote:
> On Thu, Jul 30, 2020 at 11:10 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 30/07/2020 20:16, Jens Axboe wrote:
>>> On 7/30/20 10:26 AM, Pavel Begunkov wrote:
>>>> On 30/07/2020 19:13, Jens Axboe wrote:
>>>>> On 7/30/20 10:08 AM, Pavel Begunkov wrote:
>>>>>> On 27/07/2020 23:34, Jens Axboe wrote:
>>>>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote:
>>>>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>>>
>>>>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote:
>>>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>>>>> index 7809ab2..6510cf5 100644
>>>>>>>>>> --- a/fs/io_uring.c
>>>>>>>>>> +++ b/fs/io_uring.c
>>>>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>>>>>>>>>       cqe = io_get_cqring(ctx);
>>>>>>>>>>       if (likely(cqe)) {
>>>>>>>>>>               WRITE_ONCE(cqe->user_data, req->user_data);
>>>>>>>>>> -             WRITE_ONCE(cqe->res, res);
>>>>>>>>>> -             WRITE_ONCE(cqe->flags, cflags);
>>>>>>>>>> +             if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
>>>>>>>>>> +                     if (likely(res > 0))
>>>>>>>>>> +                             WRITE_ONCE(cqe->res64, req->rw.append_offset);
>>>>>>>>>> +                     else
>>>>>>>>>> +                             WRITE_ONCE(cqe->res64, res);
>>>>>>>>>> +             } else {
>>>>>>>>>> +                     WRITE_ONCE(cqe->res, res);
>>>>>>>>>> +                     WRITE_ONCE(cqe->flags, cflags);
>>>>>>>>>> +             }
>>>>>>>>>
>>>>>>>>> This would be nice to keep out of the fast path, if possible.
>>>>>>>>
>>>>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during
>>>>>>>> submission. That would have avoided this check......but argument count
>>>>>>>> differs, so it did not add up.
>>>>>>>
>>>>>>> But that'd grow the io_kiocb just for this use case, which is arguably
>>>>>>> even worse. Unless you can keep it in the per-request private data,
>>>>>>> but there's no more room there for the regular read/write side.
>>>>>>>
>>>>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>>>>>>> index 92c2269..2580d93 100644
>>>>>>>>>> --- a/include/uapi/linux/io_uring.h
>>>>>>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>>>>>>> @@ -156,8 +156,13 @@ enum {
>>>>>>>>>>   */
>>>>>>>>>>  struct io_uring_cqe {
>>>>>>>>>>       __u64   user_data;      /* sqe->data submission passed back */
>>>>>>>>>> -     __s32   res;            /* result code for this event */
>>>>>>>>>> -     __u32   flags;
>>>>>>>>>> +     union {
>>>>>>>>>> +             struct {
>>>>>>>>>> +                     __s32   res;    /* result code for this event */
>>>>>>>>>> +                     __u32   flags;
>>>>>>>>>> +             };
>>>>>>>>>> +             __s64   res64;  /* appending offset for zone append */
>>>>>>>>>> +     };
>>>>>>>>>>  };
>>>>>>>>>
>>>>>>>>> Is this a compatible change, both for now but also going forward? You
>>>>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags.
>>>>>>>>
>>>>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not
>>>>>>>> used/set for write currently, so it looked compatible at this point.
>>>>>>>
>>>>>>> Not worried about that, since we won't ever use that for writes. But it
>>>>>>> is a potential headache down the line for other flags, if they apply to
>>>>>>> normal writes.
>>>>>>>
>>>>>>>> Yes, no room for future flags for this operation.
>>>>>>>> Do you see any other way to enable this support in io-uring?
>>>>>>>
>>>>>>> Honestly I think the only viable option is as we discussed previously,
>>>>>>> pass in a pointer to a 64-bit type where we can copy the additional
>>>>>>> completion information to.
>>>>>>
>>>>>> TBH, I hate the idea of such overhead/latency at times when SSDs can
>>>>>> serve writes in less than 10ms. Any chance you measured how long does it
>>>>>
>>>>> 10us? :-)
>>>>
>>>> Hah, 10us indeed :)
>>>>
>>>>>
>>>>>> take to drag through task_work?
>>>>>
>>>>> A 64-bit value copy is really not a lot of overhead... But yes, we'd
>>>>> need to push the completion through task_work at that point, as we can't
>>>>> do it from the completion side. That's not a lot of overhead, and most
>>>>> notably, it's overhead that only affects this particular type.
>>>>>
>>>>> That's not a bad starting point, and something that can always be
>>>>> optimized later if need be. But I seriously doubt it'd be anything to
>>>>> worry about.
>>>>
>>>> I probably need to look myself how it's really scheduled, but if you don't
>>>> mind, here is a quick question: if we do work_add(task) when the task is
>>>> running in the userspace, wouldn't the work execution wait until the next
>>>> syscall/allotted time ends up?
>>>
>>> It'll get the task to enter the kernel, just like signal delivery. The only
>>> tricky part is really if we have a dependency waiting in the kernel, like
>>> the recent eventfd fix.
>>
>> I see, thanks for sorting this out!
> 
> Few more doubts about this (please mark me wrong if that is the case):
> 
> - Task-work makes me feel like N completions waiting to be served by
> single task.
> Currently completions keep arriving and CQEs would be updated with
> result, but the user-space (submitter task) would not be poked.
> 
> - Completion-code will set the task-work. But post that it cannot go
> immediately to its regular business of picking cqe and updating
> res/flags, as we cannot afford user-space to see the cqe before the
> pointer update. So it seems completion-code needs to spawn another
> work which will allocate/update cqe after waiting for pointer-update
> from task-work?

The task work would post the completion CQE for the request after
writing the offset.
Kanchan Joshi July 30, 2020, 6:25 p.m. UTC | #12
On Thu, Jul 30, 2020 at 11:24 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 7/30/20 11:51 AM, Kanchan Joshi wrote:
> > On Thu, Jul 30, 2020 at 11:10 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
> >>
> >> On 30/07/2020 20:16, Jens Axboe wrote:
> >>> On 7/30/20 10:26 AM, Pavel Begunkov wrote:
> >>>> On 30/07/2020 19:13, Jens Axboe wrote:
> >>>>> On 7/30/20 10:08 AM, Pavel Begunkov wrote:
> >>>>>> On 27/07/2020 23:34, Jens Axboe wrote:
> >>>>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote:
> >>>>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>>>>>>>>
> >>>>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote:
> >>>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
> >>>>>>>>>> index 7809ab2..6510cf5 100644
> >>>>>>>>>> --- a/fs/io_uring.c
> >>>>>>>>>> +++ b/fs/io_uring.c
> >>>>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
> >>>>>>>>>>       cqe = io_get_cqring(ctx);
> >>>>>>>>>>       if (likely(cqe)) {
> >>>>>>>>>>               WRITE_ONCE(cqe->user_data, req->user_data);
> >>>>>>>>>> -             WRITE_ONCE(cqe->res, res);
> >>>>>>>>>> -             WRITE_ONCE(cqe->flags, cflags);
> >>>>>>>>>> +             if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
> >>>>>>>>>> +                     if (likely(res > 0))
> >>>>>>>>>> +                             WRITE_ONCE(cqe->res64, req->rw.append_offset);
> >>>>>>>>>> +                     else
> >>>>>>>>>> +                             WRITE_ONCE(cqe->res64, res);
> >>>>>>>>>> +             } else {
> >>>>>>>>>> +                     WRITE_ONCE(cqe->res, res);
> >>>>>>>>>> +                     WRITE_ONCE(cqe->flags, cflags);
> >>>>>>>>>> +             }
> >>>>>>>>>
> >>>>>>>>> This would be nice to keep out of the fast path, if possible.
> >>>>>>>>
> >>>>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during
> >>>>>>>> submission. That would have avoided this check......but argument count
> >>>>>>>> differs, so it did not add up.
> >>>>>>>
> >>>>>>> But that'd grow the io_kiocb just for this use case, which is arguably
> >>>>>>> even worse. Unless you can keep it in the per-request private data,
> >>>>>>> but there's no more room there for the regular read/write side.
> >>>>>>>
> >>>>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> >>>>>>>>>> index 92c2269..2580d93 100644
> >>>>>>>>>> --- a/include/uapi/linux/io_uring.h
> >>>>>>>>>> +++ b/include/uapi/linux/io_uring.h
> >>>>>>>>>> @@ -156,8 +156,13 @@ enum {
> >>>>>>>>>>   */
> >>>>>>>>>>  struct io_uring_cqe {
> >>>>>>>>>>       __u64   user_data;      /* sqe->data submission passed back */
> >>>>>>>>>> -     __s32   res;            /* result code for this event */
> >>>>>>>>>> -     __u32   flags;
> >>>>>>>>>> +     union {
> >>>>>>>>>> +             struct {
> >>>>>>>>>> +                     __s32   res;    /* result code for this event */
> >>>>>>>>>> +                     __u32   flags;
> >>>>>>>>>> +             };
> >>>>>>>>>> +             __s64   res64;  /* appending offset for zone append */
> >>>>>>>>>> +     };
> >>>>>>>>>>  };
> >>>>>>>>>
> >>>>>>>>> Is this a compatible change, both for now but also going forward? You
> >>>>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags.
> >>>>>>>>
> >>>>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not
> >>>>>>>> used/set for write currently, so it looked compatible at this point.
> >>>>>>>
> >>>>>>> Not worried about that, since we won't ever use that for writes. But it
> >>>>>>> is a potential headache down the line for other flags, if they apply to
> >>>>>>> normal writes.
> >>>>>>>
> >>>>>>>> Yes, no room for future flags for this operation.
> >>>>>>>> Do you see any other way to enable this support in io-uring?
> >>>>>>>
> >>>>>>> Honestly I think the only viable option is as we discussed previously,
> >>>>>>> pass in a pointer to a 64-bit type where we can copy the additional
> >>>>>>> completion information to.
> >>>>>>
> >>>>>> TBH, I hate the idea of such overhead/latency at times when SSDs can
> >>>>>> serve writes in less than 10ms. Any chance you measured how long does it
> >>>>>
> >>>>> 10us? :-)
> >>>>
> >>>> Hah, 10us indeed :)
> >>>>
> >>>>>
> >>>>>> take to drag through task_work?
> >>>>>
> >>>>> A 64-bit value copy is really not a lot of overhead... But yes, we'd
> >>>>> need to push the completion through task_work at that point, as we can't
> >>>>> do it from the completion side. That's not a lot of overhead, and most
> >>>>> notably, it's overhead that only affects this particular type.
> >>>>>
> >>>>> That's not a bad starting point, and something that can always be
> >>>>> optimized later if need be. But I seriously doubt it'd be anything to
> >>>>> worry about.
> >>>>
> >>>> I probably need to look myself how it's really scheduled, but if you don't
> >>>> mind, here is a quick question: if we do work_add(task) when the task is
> >>>> running in the userspace, wouldn't the work execution wait until the next
> >>>> syscall/allotted time ends up?
> >>>
> >>> It'll get the task to enter the kernel, just like signal delivery. The only
> >>> tricky part is really if we have a dependency waiting in the kernel, like
> >>> the recent eventfd fix.
> >>
> >> I see, thanks for sorting this out!
> >
> > Few more doubts about this (please mark me wrong if that is the case):
> >
> > - Task-work makes me feel like N completions waiting to be served by
> > single task.
> > Currently completions keep arriving and CQEs would be updated with
> > result, but the user-space (submitter task) would not be poked.
> >
> > - Completion-code will set the task-work. But post that it cannot go
> > immediately to its regular business of picking cqe and updating
> > res/flags, as we cannot afford user-space to see the cqe before the
> > pointer update. So it seems completion-code needs to spawn another
> > work which will allocate/update cqe after waiting for pointer-update
> > from task-work?
>
> The task work would post the completion CQE for the request after
> writing the offset.

Got it, thank you for making it simple.
Overall, if I try to sum up the tradeoffs of moving to indirect-offset
(compared to the current scheme):

Upside:
- cqe res/flags would be intact, avoids future-headaches as you mentioned
- short-write cases do not have to be failed in lower-layers (as
cqe->res is there to report bytes-copied)

Downside:
- We may not be able to use RWF_APPEND, and would need to expose a new
type/flag (RWF_INDIRECT_OFFSET etc.) to user-space. Not sure if this
sounds outrageous, but is it OK to have a uring-only flag which can be
combined with RWF_APPEND?
- Expensive compared to sending results in the cqe itself. But I agree
that this may not be major, and it only affects one type of write.
Damien Le Moal July 31, 2020, 6:42 a.m. UTC | #13
On 2020/07/31 3:26, Kanchan Joshi wrote:
> On Thu, Jul 30, 2020 at 11:24 PM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> On 7/30/20 11:51 AM, Kanchan Joshi wrote:
>>> On Thu, Jul 30, 2020 at 11:10 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>>>
>>>> On 30/07/2020 20:16, Jens Axboe wrote:
>>>>> On 7/30/20 10:26 AM, Pavel Begunkov wrote:
>>>>>> On 30/07/2020 19:13, Jens Axboe wrote:
>>>>>>> On 7/30/20 10:08 AM, Pavel Begunkov wrote:
>>>>>>>> On 27/07/2020 23:34, Jens Axboe wrote:
>>>>>>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote:
>>>>>>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>>>>>
>>>>>>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote:
>>>>>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>>>>>>>>>> index 7809ab2..6510cf5 100644
>>>>>>>>>>>> --- a/fs/io_uring.c
>>>>>>>>>>>> +++ b/fs/io_uring.c
>>>>>>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
>>>>>>>>>>>>       cqe = io_get_cqring(ctx);
>>>>>>>>>>>>       if (likely(cqe)) {
>>>>>>>>>>>>               WRITE_ONCE(cqe->user_data, req->user_data);
>>>>>>>>>>>> -             WRITE_ONCE(cqe->res, res);
>>>>>>>>>>>> -             WRITE_ONCE(cqe->flags, cflags);
>>>>>>>>>>>> +             if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
>>>>>>>>>>>> +                     if (likely(res > 0))
>>>>>>>>>>>> +                             WRITE_ONCE(cqe->res64, req->rw.append_offset);
>>>>>>>>>>>> +                     else
>>>>>>>>>>>> +                             WRITE_ONCE(cqe->res64, res);
>>>>>>>>>>>> +             } else {
>>>>>>>>>>>> +                     WRITE_ONCE(cqe->res, res);
>>>>>>>>>>>> +                     WRITE_ONCE(cqe->flags, cflags);
>>>>>>>>>>>> +             }
>>>>>>>>>>>
>>>>>>>>>>> This would be nice to keep out of the fast path, if possible.
>>>>>>>>>>
>>>>>>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during
>>>>>>>>>> submission. That would have avoided this check......but argument count
>>>>>>>>>> differs, so it did not add up.
>>>>>>>>>
>>>>>>>>> But that'd grow the io_kiocb just for this use case, which is arguably
>>>>>>>>> even worse. Unless you can keep it in the per-request private data,
>>>>>>>>> but there's no more room there for the regular read/write side.
>>>>>>>>>
>>>>>>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>>>>>>>>> index 92c2269..2580d93 100644
>>>>>>>>>>>> --- a/include/uapi/linux/io_uring.h
>>>>>>>>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>>>>>>>>> @@ -156,8 +156,13 @@ enum {
>>>>>>>>>>>>   */
>>>>>>>>>>>>  struct io_uring_cqe {
>>>>>>>>>>>>       __u64   user_data;      /* sqe->data submission passed back */
>>>>>>>>>>>> -     __s32   res;            /* result code for this event */
>>>>>>>>>>>> -     __u32   flags;
>>>>>>>>>>>> +     union {
>>>>>>>>>>>> +             struct {
>>>>>>>>>>>> +                     __s32   res;    /* result code for this event */
>>>>>>>>>>>> +                     __u32   flags;
>>>>>>>>>>>> +             };
>>>>>>>>>>>> +             __s64   res64;  /* appending offset for zone append */
>>>>>>>>>>>> +     };
>>>>>>>>>>>>  };
>>>>>>>>>>>
>>>>>>>>>>> Is this a compatible change, both for now but also going forward? You
>>>>>>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags.
>>>>>>>>>>
>>>>>>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not
>>>>>>>>>> used/set for write currently, so it looked compatible at this point.
>>>>>>>>>
>>>>>>>>> Not worried about that, since we won't ever use that for writes. But it
>>>>>>>>> is a potential headache down the line for other flags, if they apply to
>>>>>>>>> normal writes.
>>>>>>>>>
>>>>>>>>>> Yes, no room for future flags for this operation.
>>>>>>>>>> Do you see any other way to enable this support in io-uring?
>>>>>>>>>
>>>>>>>>> Honestly I think the only viable option is as we discussed previously,
>>>>>>>>> pass in a pointer to a 64-bit type where we can copy the additional
>>>>>>>>> completion information to.
>>>>>>>>
>>>>>>>> TBH, I hate the idea of such overhead/latency at times when SSDs can
>>>>>>>> serve writes in less than 10ms. Any chance you measured how long it
>>>>>>>
>>>>>>> 10us? :-)
>>>>>>
>>>>>> Hah, 10us indeed :)
>>>>>>
>>>>>>>
>>>>>>>> takes to drag through task_work?
>>>>>>>
>>>>>>> A 64-bit value copy is really not a lot of overhead... But yes, we'd
>>>>>>> need to push the completion through task_work at that point, as we can't
>>>>>>> do it from the completion side. That's not a lot of overhead, and most
>>>>>>> notably, it's overhead that only affects this particular type.
>>>>>>>
>>>>>>> That's not a bad starting point, and something that can always be
>>>>>>> optimized later if need be. But I seriously doubt it'd be anything to
>>>>>>> worry about.
>>>>>>
>>>>>> I probably need to look myself how it's really scheduled, but if you don't
>>>>>> mind, here is a quick question: if we do work_add(task) when the task is
>>>>>> running in the userspace, wouldn't the work execution wait until the next
>>>>>> syscall/allotted time ends up?
>>>>>
>>>>> It'll get the task to enter the kernel, just like signal delivery. The only
>>>>> tricky part is really if we have a dependency waiting in the kernel, like
>>>>> the recent eventfd fix.
>>>>
>>>> I see, thanks for sorting this out!
>>>
>>> Few more doubts about this (please mark me wrong if that is the case):
>>>
>>> - Task-work makes me feel like N completions waiting to be served by
>>> single task.
>>> Currently completions keep arriving and CQEs would be updated with
>>> result, but the user-space (submitter task) would not be poked.
>>>
>>> - Completion-code will set the task-work. But post that it cannot go
>>> immediately to its regular business of picking cqe and updating
>>> res/flags, as we cannot afford user-space to see the cqe before the
>>> pointer update. So it seems completion-code needs to spawn another
>>> work which will allocate/update cqe after waiting for pointer-update
>>> from task-work?
>>
>> The task work would post the completion CQE for the request after
>> writing the offset.
> 
> Got it, thank you for making it simple.
> Overall if I try to put the tradeoffs of moving to indirect-offset
> (compared to current scheme)–
> 
> Upside:
> - cqe res/flags would be intact, avoids future-headaches as you mentioned
> - short-write cases do not have to be failed in lower-layers (as
> cqe->res is there to report bytes-copied)

I personally think it is a super bad idea to allow short asynchronous append
writes. The interface should allow the async zone append write to proceed if
and only if it can be stuffed entirely into a single BIO, which necessarily will
be a single request on the device side. Otherwise, the application would have no
guarantees as to where a split may happen, and since this is zone append, the
next async append will not leave any hole to complete a previous short write.
This will wreck the structure of the application data.

For the sync case, this is fine. The application can just issue a new append
write with the remaining unwritten data from the previous append write. But in
the async case, if one write == one data record (e.g. a key-value tuple for an
SSTable in an LSM tree), then allowing a short write will destroy the record:
the partial write will be garbage data that will need garbage collection...

> 
> Downside:
> - We may not be able to use RWF_APPEND, and need exposing a new
> type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this
> sounds outrageous, but is it OK to have uring-only flag which can be
> combined with RWF_APPEND?

Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for
raw block device accesses. We could certainly define a meaning for these in the
context of zoned block devices.

I already commented on the need for first defining an interface (flags etc) and
its semantic (e.g. do we allow short zone append or not ? What happens for
regular files ? etc). Did you read my comment ? We really need to first agree on
something to clarify what needs to be done.


> -  Expensive compared to sending results in cqe itself. But I agree
> that this may not be major, and only for one type of write.
> 
>
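
A minimal sketch of the task_work-based completion flow discussed above, with
all names (append_offset, append_usr_ptr, the helpers) purely illustrative and
not taken from the posted series:

/*
 * Illustrative only: complete a zone-append request via task_work so
 * that copy_to_user() runs in the submitting task's context, then post
 * the CQE with cqe->res/cqe->flags left untouched.
 */
static void io_append_offset_work(struct callback_head *cb)
{
	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
	u64 offset = req->rw.append_offset;
	long res = req->result;

	/* write the 64-bit offset through the user pointer first... */
	if (res > 0 && copy_to_user(req->rw.append_usr_ptr, &offset,
				    sizeof(offset)))
		res = -EFAULT;

	/* ...and only then make the completion visible to user space */
	io_cqring_add_event(req, res);
	io_put_req(req);
}

static void io_append_complete(struct kiocb *kiocb, long res, long res2)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

	req->rw.append_offset = res2;	/* written offset via ->ki_complete */
	req->result = res;
	init_task_work(&req->task_work, io_append_offset_work);
	task_work_add(req->task, &req->task_work, true); /* error handling elided */
}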
Christoph Hellwig July 31, 2020, 6:45 a.m. UTC | #14
On Fri, Jul 31, 2020 at 06:42:10AM +0000, Damien Le Moal wrote:
> > - We may not be able to use RWF_APPEND, and need exposing a new
> > type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this
> > sounds outrageous, but is it OK to have uring-only flag which can be
> > combined with RWF_APPEND?
> 
> Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for
> raw block device accesses. We could certainly define a meaning for these in the
> context of zoned block devices.

We can't just add a meaning for O_APPEND on block devices now,
as it was previously silently ignored.  I also really don't think any
of these semantics even fit the block device to start with.  If you
want to work on raw zones use zonefs, that's what it exists for.
Damien Le Moal July 31, 2020, 6:59 a.m. UTC | #15
On 2020/07/31 15:45, hch@infradead.org wrote:
> On Fri, Jul 31, 2020 at 06:42:10AM +0000, Damien Le Moal wrote:
>>> - We may not be able to use RWF_APPEND, and need exposing a new
>>> type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this
>>> sounds outrageous, but is it OK to have uring-only flag which can be
>>> combined with RWF_APPEND?
>>
>> Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for
>> raw block device accesses. We could certainly define a meaning for these in the
>> context of zoned block devices.
> 
> We can't just add a meaning for O_APPEND on block devices now,
> as it was previously silently ignored.  I also really don't think any
> of these semantics even fit the block device to start with.  If you
> want to work on raw zones use zonefs, that's what it exists for.

Which is fine with me. Just trying to say that I think this is exactly the
discussion we need to start with. What interface do we implement...

Allowing zone append only through zonefs as the raw block device equivalent, all
the O_APPEND/RWF_APPEND semantic is defined and the "return written offset"
implementation in VFS would be common for all file systems, including regular
ones. Besides that, there is I think the question of short writes... Not sure if
short writes can currently happen with async RWF_APPEND writes to regular files.
I think not but that may depend on the FS.
Kanchan Joshi July 31, 2020, 7:08 a.m. UTC | #16
On Fri, Jul 31, 2020 at 12:12 PM Damien Le Moal <Damien.LeMoal@wdc.com> wrote:
>
> On 2020/07/31 3:26, Kanchan Joshi wrote:
> > On Thu, Jul 30, 2020 at 11:24 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>
> >> On 7/30/20 11:51 AM, Kanchan Joshi wrote:
> >>> On Thu, Jul 30, 2020 at 11:10 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
> >>>>
> >>>> On 30/07/2020 20:16, Jens Axboe wrote:
> >>>>> On 7/30/20 10:26 AM, Pavel Begunkov wrote:
> >>>>>> On 30/07/2020 19:13, Jens Axboe wrote:
> >>>>>>> On 7/30/20 10:08 AM, Pavel Begunkov wrote:
> >>>>>>>> On 27/07/2020 23:34, Jens Axboe wrote:
> >>>>>>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote:
> >>>>>>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>>>>>>>>>>
> >>>>>>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote:
> >>>>>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
> >>>>>>>>>>>> index 7809ab2..6510cf5 100644
> >>>>>>>>>>>> --- a/fs/io_uring.c
> >>>>>>>>>>>> +++ b/fs/io_uring.c
> >>>>>>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
> >>>>>>>>>>>>       cqe = io_get_cqring(ctx);
> >>>>>>>>>>>>       if (likely(cqe)) {
> >>>>>>>>>>>>               WRITE_ONCE(cqe->user_data, req->user_data);
> >>>>>>>>>>>> -             WRITE_ONCE(cqe->res, res);
> >>>>>>>>>>>> -             WRITE_ONCE(cqe->flags, cflags);
> >>>>>>>>>>>> +             if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
> >>>>>>>>>>>> +                     if (likely(res > 0))
> >>>>>>>>>>>> +                             WRITE_ONCE(cqe->res64, req->rw.append_offset);
> >>>>>>>>>>>> +                     else
> >>>>>>>>>>>> +                             WRITE_ONCE(cqe->res64, res);
> >>>>>>>>>>>> +             } else {
> >>>>>>>>>>>> +                     WRITE_ONCE(cqe->res, res);
> >>>>>>>>>>>> +                     WRITE_ONCE(cqe->flags, cflags);
> >>>>>>>>>>>> +             }
> >>>>>>>>>>>
> >>>>>>>>>>> This would be nice to keep out of the fast path, if possible.
> >>>>>>>>>>
> >>>>>>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during
> >>>>>>>>>> submission. That would have avoided this check......but argument count
> >>>>>>>>>> differs, so it did not add up.
> >>>>>>>>>
> >>>>>>>>> But that'd grow the io_kiocb just for this use case, which is arguably
> >>>>>>>>> even worse. Unless you can keep it in the per-request private data,
> >>>>>>>>> but there's no more room there for the regular read/write side.
> >>>>>>>>>
> >>>>>>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> >>>>>>>>>>>> index 92c2269..2580d93 100644
> >>>>>>>>>>>> --- a/include/uapi/linux/io_uring.h
> >>>>>>>>>>>> +++ b/include/uapi/linux/io_uring.h
> >>>>>>>>>>>> @@ -156,8 +156,13 @@ enum {
> >>>>>>>>>>>>   */
> >>>>>>>>>>>>  struct io_uring_cqe {
> >>>>>>>>>>>>       __u64   user_data;      /* sqe->data submission passed back */
> >>>>>>>>>>>> -     __s32   res;            /* result code for this event */
> >>>>>>>>>>>> -     __u32   flags;
> >>>>>>>>>>>> +     union {
> >>>>>>>>>>>> +             struct {
> >>>>>>>>>>>> +                     __s32   res;    /* result code for this event */
> >>>>>>>>>>>> +                     __u32   flags;
> >>>>>>>>>>>> +             };
> >>>>>>>>>>>> +             __s64   res64;  /* appending offset for zone append */
> >>>>>>>>>>>> +     };
> >>>>>>>>>>>>  };
> >>>>>>>>>>>
> >>>>>>>>>>> Is this a compatible change, both for now but also going forward? You
> >>>>>>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags.
> >>>>>>>>>>
> >>>>>>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not
> >>>>>>>>>> used/set for write currently, so it looked compatible at this point.
> >>>>>>>>>
> >>>>>>>>> Not worried about that, since we won't ever use that for writes. But it
> >>>>>>>>> is a potential headache down the line for other flags, if they apply to
> >>>>>>>>> normal writes.
> >>>>>>>>>
> >>>>>>>>>> Yes, no room for future flags for this operation.
> >>>>>>>>>> Do you see any other way to enable this support in io-uring?
> >>>>>>>>>
> >>>>>>>>> Honestly I think the only viable option is as we discussed previously,
> >>>>>>>>> pass in a pointer to a 64-bit type where we can copy the additional
> >>>>>>>>> completion information to.
> >>>>>>>>
> >>>>>>>> TBH, I hate the idea of such overhead/latency at times when SSDs can
> >>>>>>>> serve writes in less than 10ms. Any chance you measured how long it
> >>>>>>>
> >>>>>>> 10us? :-)
> >>>>>>
> >>>>>> Hah, 10us indeed :)
> >>>>>>
> >>>>>>>
> >>>>>>>> takes to drag through task_work?
> >>>>>>>
> >>>>>>> A 64-bit value copy is really not a lot of overhead... But yes, we'd
> >>>>>>> need to push the completion through task_work at that point, as we can't
> >>>>>>> do it from the completion side. That's not a lot of overhead, and most
> >>>>>>> notably, it's overhead that only affects this particular type.
> >>>>>>>
> >>>>>>> That's not a bad starting point, and something that can always be
> >>>>>>> optimized later if need be. But I seriously doubt it'd be anything to
> >>>>>>> worry about.
> >>>>>>
> >>>>>> I probably need to look myself how it's really scheduled, but if you don't
> >>>>>> mind, here is a quick question: if we do work_add(task) when the task is
> >>>>>> running in the userspace, wouldn't the work execution wait until the next
> >>>>>> syscall/allotted time ends up?
> >>>>>
> >>>>> It'll get the task to enter the kernel, just like signal delivery. The only
> >>>>> tricky part is really if we have a dependency waiting in the kernel, like
> >>>>> the recent eventfd fix.
> >>>>
> >>>> I see, thanks for sorting this out!
> >>>
> >>> Few more doubts about this (please mark me wrong if that is the case):
> >>>
> >>> - Task-work makes me feel like N completions waiting to be served by
> >>> single task.
> >>> Currently completions keep arriving and CQEs would be updated with
> >>> result, but the user-space (submitter task) would not be poked.
> >>>
> >>> - Completion-code will set the task-work. But post that it cannot go
> >>> immediately to its regular business of picking cqe and updating
> >>> res/flags, as we cannot afford user-space to see the cqe before the
> >>> pointer update. So it seems completion-code needs to spawn another
> >>> work which will allocate/update cqe after waiting for pointer-update
> >>> from task-work?
> >>
> >> The task work would post the completion CQE for the request after
> >> writing the offset.
> >
> > Got it, thank you for making it simple.
> > Overall if I try to put the tradeoffs of moving to indirect-offset
> > (compared to current scheme)–
> >
> > Upside:
> > - cqe res/flags would be intact, avoids future-headaches as you mentioned
> > - short-write cases do not have to be failed in lower-layers (as
> > cqe->res is there to report bytes-copied)
>
> I personally think it is a super bad idea to allow short asynchronous append
> writes. The interface should allow the async zone append write to proceed if
> and only if it can be stuffed entirely into a single BIO, which necessarily will
> be a single request on the device side. Otherwise, the application would have no
> guarantees as to where a split may happen, and since this is zone append, the
> next async append will not leave any hole to complete a previous short write.
> This will wreck the structure of the application data.
>
> For the sync case, this is fine. The application can just issue a new append
> write with the remaining unwritten data from the previous append write. But in
> the async case, if one write == one data record (e.g. a key-value tuple for an
> SSTable in an LSM tree), then allowing a short write will destroy the record:
> the partial write will be garbage data that will need garbage collection...

There are cases when a short write is fine, aren't there? For example, I can
serve only an 8K write (either because of space, or because of file size
limits), but the application sent 12K: the iov_iter gets truncated to 8K
and the write is successful. At least that's how O_APPEND and
RWF_APPEND behave currently.
But in the current scheme there is no way to report the number of bytes
copied in io_uring, so I had to fail such short writes in the lower layer
(which does not know whether it is talking to io_uring or aio).
Failing such short writes is perhaps fine for zone-append, but is it
fine for generic file-append?

> > Downside:
> > - We may not be able to use RWF_APPEND, and need exposing a new
> > type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this
> > sounds outrageous, but is it OK to have uring-only flag which can be
> > combined with RWF_APPEND?
>
> Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for
> raw block device accesses. We could certainly define a meaning for these in the
> context of zoned block devices.
But an application using O_APPEND/RWF_APPEND does not pass a pointer to
be updated by the kernel.
Yet in the kernel we would expect one, and may start writing through
something which is not a pointer.

> I already commented on the need for first defining an interface (flags etc) and
> its semantic (e.g. do we allow short zone append or not ? What happens for
> regular files ? etc). Did you read my comment ? We really need to first agree on
> something to clarify what needs to be done.

I read it and was planning to respond, sorry. But it seemed important to
get clarity on the uring interface first, as this seems to decide what
this whole thing looks like (to the application and to the lower layers
as well).

> > -  Expensive compared to sending results in cqe itself. But I agree
> > that this may not be major, and only for one type of write.
> >
> >
>
>
> --
> Damien Le Moal
> Western Digital Research
Kanchan Joshi July 31, 2020, 7:58 a.m. UTC | #17
On Fri, Jul 31, 2020 at 12:29 PM Damien Le Moal <Damien.LeMoal@wdc.com> wrote:
>
> On 2020/07/31 15:45, hch@infradead.org wrote:
> > On Fri, Jul 31, 2020 at 06:42:10AM +0000, Damien Le Moal wrote:
> >>> - We may not be able to use RWF_APPEND, and need exposing a new
> >>> type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this
> >>> sounds outrageous, but is it OK to have uring-only flag which can be
> >>> combined with RWF_APPEND?
> >>
> >> Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for
> >> raw block device accesses. We could certainly define a meaning for these in the
> >> context of zoned block devices.
> >
> > We can't just add a meaning for O_APPEND on block devices now,
> > as it was previously silently ignored.  I also really don't think any
> > of these semantics even fit the block device to start with.  If you
> > want to work on raw zones use zonefs, that's what it exists for.
>
> Which is fine with me. Just trying to say that I think this is exactly the
> discussion we need to start with. What interface do we implement...
>
> Allowing zone append only through zonefs as the raw block device equivalent, all
> the O_APPEND/RWF_APPEND semantic is defined and the "return written offset"
> implementation in VFS would be common for all file systems, including regular
> ones. Besides that, there is I think the question of short writes... Not sure if
> short writes can currently happen with async RWF_APPEND writes to regular files.
> I think not but that may depend on the FS.

generic_write_check_limits (called by generic_write_checks, used by
most FS) may make it short, and AFAIK it does not depend on
async/sync.
This was one of the reason why we chose to isolate the operation by a
different IOCB flag and not by IOCB_APPEND alone.

For block-device these checks are not done, but there is another place
when it receives writes spanning beyond EOF and iov_iter_truncate()
adjusts it before sending it down.
And we return failure for that case in V4-  "Ref: [PATCH v4 3/6] uio:
return status with iov truncation"
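
For illustration, the shortening in question happens along these lines (a
simplified sketch of the generic_write_checks()/iov_iter_truncate() pattern,
not the exact kernel code):

/*
 * Simplified sketch: the iov_iter is clipped to the remaining limit and
 * the caller only ever learns the smaller byte count.
 */
static ssize_t write_checks_sketch(struct kiocb *iocb, struct iov_iter *from,
				   loff_t limit)
{
	loff_t pos = iocb->ki_pos;

	if (pos >= limit)
		return -EFBIG;
	if (iov_iter_count(from) > limit - pos)
		iov_iter_truncate(from, limit - pos);	/* e.g. 12K -> 8K */

	return iov_iter_count(from);
}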
Damien Le Moal July 31, 2020, 8:14 a.m. UTC | #18
On 2020/07/31 16:59, Kanchan Joshi wrote:
> On Fri, Jul 31, 2020 at 12:29 PM Damien Le Moal <Damien.LeMoal@wdc.com> wrote:
>>
>> On 2020/07/31 15:45, hch@infradead.org wrote:
>>> On Fri, Jul 31, 2020 at 06:42:10AM +0000, Damien Le Moal wrote:
>>>>> - We may not be able to use RWF_APPEND, and need exposing a new
>>>>> type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this
>>>>> sounds outrageous, but is it OK to have uring-only flag which can be
>>>>> combined with RWF_APPEND?
>>>>
>>>> Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for
>>>> raw block device accesses. We could certainly define a meaning for these in the
>>>> context of zoned block devices.
>>>
>>> We can't just add a meaning for O_APPEND on block devices now,
>>> as it was previously silently ignored.  I also really don't think any
>>> of these semantics even fit the block device to start with.  If you
>>> want to work on raw zones use zonefs, that's what it exists for.
>>
>> Which is fine with me. Just trying to say that I think this is exactly the
>> discussion we need to start with. What interface do we implement...
>>
>> Allowing zone append only through zonefs as the raw block device equivalent, all
>> the O_APPEND/RWF_APPEND semantic is defined and the "return written offset"
>> implementation in VFS would be common for all file systems, including regular
>> ones. Besides that, there is I think the question of short writes... Not sure if
>> short writes can currently happen with async RWF_APPEND writes to regular files.
>> I think not but that may depend on the FS.
> 
> generic_write_check_limits (called by generic_write_checks, used by
> most FS) may make it short, and AFAIK it does not depend on
> async/sync.

Johannes has a patch (not posted yet) fixing all this for zonefs,
differentiating sync and async cases, allowing short writes or not, etc. This was
done by not using generic_write_check_limits() and instead writing a
zonefs_check_write() function that is zone append friendly.

We can post that as a base for the discussion on semantic if you want...

> This was one of the reason why we chose to isolate the operation by a
> different IOCB flag and not by IOCB_APPEND alone.

For zonefs, the plan is:
* For the sync write case, zone append is always used.
* For the async write case, if we see IOCB_APPEND, then zone append BIOs are
used. If not, regular write BIOs are used.

Simple enough I think. No need for a new flag.
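
Expressed as code, that plan reads roughly as follows (is_sync_kiocb() and
IOCB_APPEND are real; the two issue helpers are hypothetical names):

static ssize_t zonefs_write_sketch(struct kiocb *iocb, struct iov_iter *from)
{
	/* sync write case: zone append is always used */
	if (is_sync_kiocb(iocb))
		return zonefs_issue_zone_append(iocb, from);

	/* async write case: zone append only if IOCB_APPEND is set */
	if (iocb->ki_flags & IOCB_APPEND)
		return zonefs_issue_zone_append(iocb, from);

	return zonefs_issue_regular_write(iocb, from);
}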
Christoph Hellwig July 31, 2020, 9:14 a.m. UTC | #19
On Fri, Jul 31, 2020 at 08:14:22AM +0000, Damien Le Moal wrote:
> 
> > This was one of the reason why we chose to isolate the operation by a
> > different IOCB flag and not by IOCB_APPEND alone.
> 
> For zonefs, the plan is:
> * For the sync write case, zone append is always used.
> * For the async write case, if we see IOCB_APPEND, then zone append BIOs are
> used. If not, regular write BIOs are used.
> 
> Simple enough I think. No need for a new flag.

Simple, but wrong.  Sync vs async really doesn't matter, even sync
writes will have problems if there are other writers.  We need a flag
for "report the actual offset for appending writes", and based on that
flag we need to not allow short writes (or split extents for real
file systems).  We also need a fcntl or ioctl to report this max atomic
write size so that applications can rely on it.
Damien Le Moal July 31, 2020, 9:34 a.m. UTC | #20
On 2020/07/31 18:14, hch@infradead.org wrote:
> On Fri, Jul 31, 2020 at 08:14:22AM +0000, Damien Le Moal wrote:
>>
>>> This was one of the reason why we chose to isolate the operation by a
>>> different IOCB flag and not by IOCB_APPEND alone.
>>
>> For zonefs, the plan is:
>> * For the sync write case, zone append is always used.
>> * For the async write case, if we see IOCB_APPEND, then zone append BIOs are
>> used. If not, regular write BIOs are used.
>>
>> Simple enough I think. No need for a new flag.
> 
> Simple, but wrong.  Sync vs async really doesn't matter, even sync
> writes will have problems if there are other writers.  We need a flag
> for "report the actual offset for appending writes", and based on that
> flag we need to not allow short writes (or split extents for real
> file systems).  We also need a fcntl or ioctl to report this max atomic
> write size so that applications can rely on it.
> 

Sync writes are done under the inode lock, so there cannot be other writers at
the same time. And for the sync case, since the actual written offset is
necessarily equal to the file size before the write, there is no need to report
it (there is no system call that can report that anyway). For this sync case,
the only change that the use of zone append introduces compared to regular
writes is the potential for more short writes.

Adding a flag for "report the actual offset for appending writes" is fine with
me, but do you also mean to use this flag for driving zone append write vs
regular writes in zonefs ?

The fcntl or ioctl for getting the max atomic write size would be fine too.
Given that zonefs is very close to the underlying zoned drive, I was assuming
that the application can simply consult the device sysfs zone_append_max_bytes
queue attribute. For regular file systems, this value would be used internally
only. I do not really see how it can be useful to applications. Furthermore, the
file system may have a hard time giving that information to the application
depending on its underlying storage configuration (e.g. erasure
coding/declustered RAID).
Kanchan Joshi July 31, 2020, 9:38 a.m. UTC | #21
On Fri, Jul 31, 2020 at 1:44 PM Damien Le Moal <Damien.LeMoal@wdc.com> wrote:
>
> On 2020/07/31 16:59, Kanchan Joshi wrote:
> > On Fri, Jul 31, 2020 at 12:29 PM Damien Le Moal <Damien.LeMoal@wdc.com> wrote:
> >>
> >> On 2020/07/31 15:45, hch@infradead.org wrote:
> >>> On Fri, Jul 31, 2020 at 06:42:10AM +0000, Damien Le Moal wrote:
> >>>>> - We may not be able to use RWF_APPEND, and need exposing a new
> >>>>> type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this
> >>>>> sounds outrageous, but is it OK to have uring-only flag which can be
> >>>>> combined with RWF_APPEND?
> >>>>
> >>>> Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for
> >>>> raw block device accesses. We could certainly define a meaning for these in the
> >>>> context of zoned block devices.
> >>>
> >>> We can't just add a meaning for O_APPEND on block devices now,
> >>> as it was previously silently ignored.  I also really don't think any
> >>> of these semantics even fit the block device to start with.  If you
> >>> want to work on raw zones use zonefs, that's what it exists for.
> >>
> >> Which is fine with me. Just trying to say that I think this is exactly the
> >> discussion we need to start with. What interface do we implement...
> >>
> >> Allowing zone append only through zonefs as the raw block device equivalent, all
> >> the O_APPEND/RWF_APPEND semantic is defined and the "return written offset"
> >> implementation in VFS would be common for all file systems, including regular
> >> ones. Besides that, there is I think the question of short writes... Not sure if
> >> short writes can currently happen with async RWF_APPEND writes to regular files.
> >> I think not but that may depend on the FS.
> >
> > generic_write_check_limits (called by generic_write_checks, used by
> > most FS) may make it short, and AFAIK it does not depend on
> > async/sync.
>
> Johannes has a patch (not posted yet) fixing all this for zonefs,
> differentiating sync and async cases, allowing short writes or not, etc. This was
> done by not using generic_write_check_limits() and instead writing a
> zonefs_check_write() function that is zone append friendly.
>
> We can post that as a base for the discussion on semantic if you want...

There is no problem with how to do it. That part is simple - we
have the iocb, and sync/async can be known from whether the ki_complete
callback is set.
The point to be discussed was whether to allow short writes or not if
we are talking about a generic file-append that returns the location.

That said, since we are talking about moving to an indirect offset in
io_uring, short writes are not an issue anymore I suppose (it goes back
to how it was).
But the unsettled thing is whether we can use O/RWF_APPEND with the
indirect-offset (pointer) scheme.

> > This was one of the reason why we chose to isolate the operation by a
> > different IOCB flag and not by IOCB_APPEND alone.
>
> For zonefs, the plan is:
> * For the sync write case, zone append is always used.
> * For the async write case, if we see IOCB_APPEND, then zone append BIOs are
> used. If not, regular write BIOs are used.
>
> Simple enough I think. No need for a new flag.

Maybe simple if we only think of zonefs (how user-space sends an
async append and gets the result back is a common problem).
Add block I/O in scope - it gets slightly more complicated because it
has to cater to non-zoned devices. And there already is a
well-established understanding that append does nothing... so code
like "if (flags & IOCB_APPEND) { do something; }" in the block I/O path
may surprise someone resuming after a hiatus.
Add file I/O in scope - it gets further complicated. I think it would
make sense to make it opt-in rather than compulsory, but most file
systems already implement a behavior for IOCB_APPEND. How do we make it
opt-in without new flags?

New flags (FMODE_SOME_NAME, IOCB_SOME_NAME) serve that purpose.
Please assess the need (for isolation) considering all three cases.
Christoph Hellwig July 31, 2020, 9:41 a.m. UTC | #22
On Fri, Jul 31, 2020 at 09:34:50AM +0000, Damien Le Moal wrote:
> Sync writes are done under the inode lock, so there cannot be other writers at
> the same time. And for the sync case, since the actual written offset is
> necessarily equal to the file size before the write, there is no need to report
> it (there is no system call that can report that anyway). For this sync case,
> the only change that the use of zone append introduces compared to regular
> writes is the potential for more short writes.
> 
> Adding a flag for "report the actual offset for appending writes" is fine with
> me, but do you also mean to use this flag for driving zone append write vs
> regular writes in zonefs ?

Let's keep semantics and implementation separate.  For the case
where we report the actual offset we need a size limitation and no
short writes.

Anything with those semantics can be implemented using Zone Append
trivially in zonefs, and we don't even need the exclusive lock in that
case.  But even without that flag anything that has an exclusive lock can
at least in theory be implemented using Zone Append, it just needs
support for submitting another request from the I/O completion handler
of the first.  I just don't think it is worth it - with the exclusive
lock we do have access to the zone serialized so a normal write works
just fine.  Both for the sync and async case.

> The fcntl or ioctl for getting the max atomic write size would be fine too.
> Given that zonefs is very close to the underlying zoned drive, I was assuming
> that the application can simply consult the device sysfs zone_append_max_bytes
> queue attribute.

For zonefs we can, yes.  But in many ways that is a lot more cumbersome
than having an API that works on the fd you want to write on.

> For regular file systems, this value would be used internally
> only. I do not really see how it can be useful to applications. Furthermore, the
> file system may have a hard time giving that information to the application
> depending on its underlying storage configuration (e.g. erasure
> coding/declustered RAID).

File systems might have all kinds of limits of their own (e.g. extent
sizes).  And a good API that just works everywhere and is properly
documented is much better than heaps of cargo culted crap all over
applications.
Damien Le Moal July 31, 2020, 10:16 a.m. UTC | #23
On 2020/07/31 18:41, hch@infradead.org wrote:
> On Fri, Jul 31, 2020 at 09:34:50AM +0000, Damien Le Moal wrote:
>> Sync writes are done under the inode lock, so there cannot be other writers at
>> the same time. And for the sync case, since the actual written offset is
>> necessarily equal to the file size before the write, there is no need to report
>> it (there is no system call that can report that anyway). For this sync case,
>> the only change that the use of zone append introduces compared to regular
>> writes is the potential for more short writes.
>>
>> Adding a flag for "report the actual offset for appending writes" is fine with
>> me, but do you also mean to use this flag for driving zone append write vs
>> regular writes in zonefs ?
> 
> Let's keep semantics and implementation separate.  For the case
> where we report the actual offset we need a size imitation and no
> short writes.

OK. So the name of the flag confused me. The flag name should reflect "Do zone
append and report written offset", right ?

Just to clarify, here was my thinking for zonefs:
1) file open with O_APPEND/aio has RWF_APPEND: then it is OK to assume that the
application did not set the aio offset since APPEND means offset==file size. In
that case, do zone append and report back the written offset.
2) file open without O_APPEND/aio does not have RWF_APPEND: the application
specified an aio offset and we must respect it, write in that exact same order,
so use regular writes.

For regular file systems, with case (1) condition, the FS use whatever it wants
for the implementation, and report back the written offset, which  will always
be the file size at the time the aio was issued.

Your method with a new flag to switch between (1) and (2) is OK with me, but the
"no short writes" may be difficult to achieve in a regular FS, no ? I do not
think current FSes have such guarantees... Especially in the case of buffered
async writes I think.

> Anything with those semantics can be implemented using Zone Append
> trivially in zonefs, and we don't even need the exclusive lock in that
> case.  But even without that flag anything that has an exclusive lock can
> at least in theory be implemented using Zone Append, it just needs
> support for submitting another request from the I/O completion handler
> of the first.  I just don't think it is worth it - with the exclusive
> lock we do have access to the zone serialized so a normal write works
> just fine.  Both for the sync and async case.

We did switch to have zonefs do append writes in the sync case, always. Hmmm...
Not sure anymore it was such a good idea.

> 
>> The fcntl or ioctl for getting the max atomic write size would be fine too.
>> Given that zonefs is very close to the underlying zoned drive, I was assuming
>> that the application can simply consult the device sysfs zone_append_max_bytes
>> queue attribute.
> 
> For zonefs we can, yes.  But in many ways that is a lot more cumbersome
> than having an API that works on the fd you want to write on.

Got it. Makes sense.

>> For regular file systems, this value would be used internally
>> only. I do not really see how it can be useful to applications. Furthermore, the
>> file system may have a hard time giving that information to the application
>> depending on its underlying storage configuration (e.g. erasure
>> coding/declustered RAID).
> 
> File systems might have all kinds of limits of their own (e.g. extent
> sizes).  And a good API that just works everywhere and is properly
> documented is much better than heaps of cargo culted crap all over
> applications.

OK. Makes sense. That said, taking Naohiro's work on btrfs as an example, zone
append is used for every data write, no matter if it is O_APPEND/RWF_APPEND or
not. The size limitation for zone append writes is not needed at all by
applications. Maximum extent size is aligned to the max append write size
internally, and if the application issued a larger write, it loops over multiple
extents, similarly to what any regular write may do (if there is overwrite etc...).

For the regular FS case, my thinking on the semantic really was: if asked to do
so, return the written offset for a RWF_APPEND aios. And I think that
implementing that does not depend in any way on what the FS does internally.

But I think I am starting to see the picture you are drawing here:
1) Introduce a fcntl() to get "maximum size for atomic append writes"
2) Introduce an aio flag specifying "Do atomic append write and report written
offset"
3) For an aio specifying "Do atomic append write and report written offset", if
the aio is larger than "maximum size for atomic append writes", fail it on
submission, no short writes.
4) For any other aio, it is business as usual; aios are processed as they are now.

And the implementation is actually completely free to use zone append writes or
regular writes regardless of the "Do atomic append write and report written
offset" being used or not.

Is it your thinking ? That would work for me. That actually ends up completely
unifying the interface behavior for zonefs and regular FS. Same semantic.
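
From user space, points (1)-(3) above would look something like this sketch
(F_GET_APPEND_MAX_SIZE and RWF_REPORT_OFFSET are placeholder names for the
yet-to-be-defined interface, not existing ABI; aio_rw_flags needs a recent
libaio):

#include <fcntl.h>
#include <libaio.h>
#include <linux/fs.h>

#define F_GET_APPEND_MAX_SIZE	0x400	/* placeholder, not a real fcntl */
#define RWF_REPORT_OFFSET	0x20	/* placeholder, not a real RWF_* bit */

int submit_append(io_context_t ctx, int fd, void *buf, size_t len)
{
	/* 1) query the maximum size for atomic append writes */
	long max = fcntl(fd, F_GET_APPEND_MAX_SIZE);
	struct iocb cb, *cbs[1] = { &cb };

	/* 3) an oversized aio is failed at submission, never shortened */
	if (max > 0 && len > (size_t)max)
		return -1;

	/* 2) atomic append write, written offset reported on completion */
	io_prep_pwrite(&cb, fd, buf, len, 0);
	cb.aio_rw_flags = RWF_APPEND | RWF_REPORT_OFFSET;
	return io_submit(ctx, 1, cbs);
}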
Christoph Hellwig July 31, 2020, 12:51 p.m. UTC | #24
On Fri, Jul 31, 2020 at 10:16:49AM +0000, Damien Le Moal wrote:
> > 
> > Let's keep semantics and implementation separate.  For the case
> > where we report the actual offset we need a size limitation and no
> > short writes.
> 
> OK. So the name of the flag confused me. The flag name should reflect "Do zone
> append and report written offset", right ?

Well, we already have O_APPEND, which is the equivalent to append to
the write pointer.  The only interesting addition is that we also want
to report where we wrote.  So I'd rather have RWF_REPORT_OFFSET or so.

> Just to clarify, here was my thinking for zonefs:
> 1) file open with O_APPEND/aio has RWF_APPEND: then it is OK to assume that the
> application did not set the aio offset since APPEND means offset==file size. In
> that case, do zone append and report back the written offset.

Yes.

> 2) file open without O_APPEND/aio does not have RWF_APPEND: the application
> specified an aio offset and we must respect it, write in that exact same order,
> so use regular writes.

Yes.

> 
> For regular file systems, with case (1) condition, the FS use whatever it wants
> for the implementation, and report back the written offset, which  will always
> be the file size at the time the aio was issued.

Yes. 

> 
> Your method with a new flag to switch between (1) and (2) is OK with me, but the
> "no short writes" may be difficult to achieve in a regular FS, no ? I do not
> think current FSes have such guarantees... Especially in the case of buffered
> async writes I think.

I'll have to check what Jens recently changed with io_uring, but
traditionally the only short write case for a normal file system is the
artificial INT_MAX limit for the write size.

> > Anything with those semantics can be implemented using Zone Append
> > trivially in zonefs, and we don't even need the exclusive lock in that
> > case.  But even without that flag anything that has an exclusive lock can
> > at least in theory be implemented using Zone Append, it just needs
> > support for submitting another request from the I/O completion handler
> > of the first.  I just don't think it is worth it - with the exclusive
> > lock we do have access to the zone serialized so a normal write works
> > just fine.  Both for the sync and async case.
> 
> We did switch to have zonefs do append writes in the sync case, always. Hmmm...
> Not sure anymore it was such a good idea.

It might be a good idea as long as we have the heavy-handed scheduler
locking.  But if we don't have that there doesn't seem to be much of
a reason for zone append.

> OK. Makes sense. That said, taking Naohiro's work on btrfs as an example, zone
> append is used for every data write, no matter if it is O_APPEND/RWF_APPEND or
> not. The size limitation for zone append writes is not needed at all by
> applications. Maximum extent size is aligned to the max append write size
> internally, and if the application issued a larger write, it loops over multiple
> extents, similarly to what any regular write may do (if there is overwrite etc...).

True, we probably don't need the limit for a normal file system.

> For the regular FS case, my thinking on the semantic really was: if asked to do
> so, return the written offset for a RWF_APPEND aios. And I think that
> implementing that does not depend in any way on what the FS does internally.

Exactly.

> 
> But I think I am starting to see the picture you are drawing here:
> 1) Introduce a fcntl() to get "maximum size for atomic append writes"
> 2) Introduce an aio flag specifying "Do atomic append write and report written
> offset"

I think we just need the 'report written offset flag', in fact even for
zonefs with the right locking we can handle unlimited write sizes, just
with lower performance.  E.g.

1) check if the write size is larger than the zone append limit

if no:

 - take the shared lock  and issue a zone append, done

if yes:

 - take the exclusive per-inode (zone) lock and just issue either normal
   writes or zone append at your choice, relying on the lock to
   serialize other writers.  For the async case this means we need a
   lock that can be released in a different context than it was acquired,
   which is a little ugly but can be done.

> And the implementation is actually completely free to use zone append writes or
> regular writes regardless of the "Do atomic append write and report written
> offset" being used or not.

Yes, that was my point about keeping the semantics and implementation
separate.
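
A sketch of that dispatch, with the zone structure and lock fields invented
for illustration (the exclusive lock standing in for the non-owner rwsem
mentioned above; both locks would be released from the I/O completion path):

static ssize_t zonefs_append_write_sketch(struct zonefs_zone_sketch *z,
					  struct kiocb *iocb,
					  struct iov_iter *from)
{
	/* 1) check if the write size is larger than the zone append limit */
	if (iov_iter_count(from) <= z->max_zone_append_bytes) {
		/* no: shared lock, one zone append bio, done */
		down_read(&z->zone_sem);
		return zonefs_issue_zone_append(iocb, from);
	}

	/*
	 * yes: the exclusive lock serializes all other writers, so either
	 * normal writes or zone append can be issued.  For async I/O the
	 * lock must be droppable from the completion context (hence the
	 * need for a non-owner rwsem).
	 */
	down_write(&z->zone_sem);
	return zonefs_issue_serialized_writes(iocb, from);
}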
Christoph Hellwig July 31, 2020, 1:08 p.m. UTC | #25
And FYI, this is what I'd do for a hacky aio-only prototype (untested):


diff --git a/fs/aio.c b/fs/aio.c
index 91e7cc4a9f179b..42b1934e38758b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1438,7 +1438,10 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
 	}
 
 	iocb->ki_res.res = res;
-	iocb->ki_res.res2 = res2;
+	if ((kiocb->ki_flags & IOCB_REPORT_OFFSET) && res > 0)
+		iocb->ki_res.res2 = kiocb->ki_pos - res;
+	else
+		iocb->ki_res.res2 = res2;
 	iocb_put(iocb);
 }
 
@@ -1452,6 +1455,8 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	req->ki_flags = iocb_flags(req->ki_filp);
 	if (iocb->aio_flags & IOCB_FLAG_RESFD)
 		req->ki_flags |= IOCB_EVENTFD;
+	if (iocb->aio_flags & IOCB_FLAG_REPORT_OFFSET)
+		req->ki_flags |= IOCB_REPORT_OFFSET;
 	req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp));
 	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
 		/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f5abba86107d86..522b0a3437d420 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -316,6 +316,7 @@ enum rw_hint {
 #define IOCB_WRITE		(1 << 6)
 #define IOCB_NOWAIT		(1 << 7)
 #define IOCB_NOIO		(1 << 9)
+#define IOCB_REPORT_OFFSET	(1 << 10)
 
 struct kiocb {
 	struct file		*ki_filp;
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index 8387e0af0f768a..e4313d7aa3b7e7 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -55,6 +55,7 @@ enum {
  */
 #define IOCB_FLAG_RESFD		(1 << 0)
 #define IOCB_FLAG_IOPRIO	(1 << 1)
+#define IOCB_FLAG_REPORT_OFFSET	(1 << 2)
 
 /* read() from /dev/aio returns these structures. */
 struct io_event {
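
Consuming that prototype from user space would look roughly like this (the
flag value is taken from the patch above and is not current ABI; everything
else is standard libaio):

#include <libaio.h>

#define IOCB_FLAG_REPORT_OFFSET	(1 << 2)	/* from the patch above */

long append_get_offset(io_context_t ctx, int fd, void *buf, size_t len)
{
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;

	/* fd is assumed open with O_APPEND so ki_pos ends up at EOF */
	io_prep_pwrite(&cb, fd, buf, len, 0);
	cb.u.c.flags |= IOCB_FLAG_REPORT_OFFSET;	/* iocb->aio_flags */

	if (io_submit(ctx, 1, cbs) != 1)
		return -1;
	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
		return -1;

	/* res2 = ki_pos - res, i.e. the offset the data landed at */
	return (long)ev.res > 0 ? (long)ev.res2 : -1;
}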
Kanchan Joshi July 31, 2020, 3:07 p.m. UTC | #26
On Fri, Jul 31, 2020 at 6:38 PM hch@infradead.org <hch@infradead.org> wrote:
>
> And FYI, this is what I'd do for a hacky aio-only prototype (untested):
>
>
> diff --git a/fs/aio.c b/fs/aio.c
> index 91e7cc4a9f179b..42b1934e38758b 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1438,7 +1438,10 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
>         }
>
>         iocb->ki_res.res = res;
> -       iocb->ki_res.res2 = res2;
> +       if ((kiocb->ki_flags & IOCB_REPORT_OFFSET) && res > 0)
> +               iocb->ki_res.res2 = kiocb->ki_pos - res;
> +       else
> +               iocb->ki_res.res2 = res2;
>         iocb_put(iocb);
>  }
>
> @@ -1452,6 +1455,8 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
>         req->ki_flags = iocb_flags(req->ki_filp);
>         if (iocb->aio_flags & IOCB_FLAG_RESFD)
>                 req->ki_flags |= IOCB_EVENTFD;
> +       if (iocb->aio_flags & IOCB_FLAG_REPORT_OFFSET)
> +               req->ki_flags |= IOCB_REPORT_OFFSET;
>         req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp));
>         if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
>                 /*
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index f5abba86107d86..522b0a3437d420 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -316,6 +316,7 @@ enum rw_hint {
>  #define IOCB_WRITE             (1 << 6)
>  #define IOCB_NOWAIT            (1 << 7)
>  #define IOCB_NOIO              (1 << 9)
> +#define IOCB_REPORT_OFFSET     (1 << 10)
>
>  struct kiocb {
>         struct file             *ki_filp;
> diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
> index 8387e0af0f768a..e4313d7aa3b7e7 100644
> --- a/include/uapi/linux/aio_abi.h
> +++ b/include/uapi/linux/aio_abi.h
> @@ -55,6 +55,7 @@ enum {
>   */
>  #define IOCB_FLAG_RESFD                (1 << 0)
>  #define IOCB_FLAG_IOPRIO       (1 << 1)
> +#define IOCB_FLAG_REPORT_OFFSET        (1 << 2)
>
>  /* read() from /dev/aio returns these structures. */
>  struct io_event {

Looks good, but it drops io_uring.
How about two flags -
1. RWF_REPORT_OFFSET_DIRECT (only for aio)  ----> aio fails the second one
2. RWF_REPORT_OFFSET_INDIRECT (for io_uring)  ----> uring fails the first one
Since these are RWF flags, they can be used by other sync/async
transports also in future if need be.
Either of these flags will set a single IOCB_REPORT_OFFSET, which can be
used by FS/Block etc. (they don't have to worry about how uring/aio sends it
up).

This is what I mean in code -

diff --git a/fs/aio.c b/fs/aio.c
index 91e7cc4a9f17..307dfbfb04f7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1472,6 +1472,11 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
        ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
        if (unlikely(ret))
                return ret;
+       /* support only direct offset */
+       if (unlikely(iocb->aio_rw_flags & RWF_REPORT_OFFSET_INDIRECT))
+               return -EOPNOTSUPP;
+
        req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
        return 0;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3e406bc1f855..5fa21644251f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2451,6 +2451,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        struct kiocb *kiocb = &req->rw.kiocb;
        unsigned ioprio;
        int ret;
+       rwf_t rw_flags;

        if (S_ISREG(file_inode(req->file)->i_mode))
                req->flags |= REQ_F_ISREG;
@@ -2462,9 +2463,13 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        }
        kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
        kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
-       ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+       rw_flags = READ_ONCE(sqe->rw_flags);
+       ret = kiocb_set_rw_flags(kiocb, rw_flags);
        if (unlikely(ret))
                return ret;
+       /* support only indirect offset */
+       if (unlikely(rw_flags & RWF_REPORT_OFFSET_DIRECT))
+               return -EOPNOTSUPP;

        ioprio = READ_ONCE(sqe->ioprio);
        if (ioprio) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8a00ba99284e..fe2f1f5c5d33 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3296,8 +3296,17 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
                ki->ki_flags |= IOCB_DSYNC;
        if (flags & RWF_SYNC)
                ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
-       if (flags & RWF_APPEND)
+       if (flags & RWF_APPEND) {
                ki->ki_flags |= IOCB_APPEND;
+               /*
+                * 1. These flags do not make sense when used standalone
+                * 2. RWF_REPORT_OFFSET_DIRECT = report result directly (for aio)
+                * 3. RWF_REPORT_OFFSET_INDIRECT = use pointer (for io_uring)
+                */
+               if (flags & RWF_REPORT_OFFSET_DIRECT ||
+                               flags & RWF_REPORT_OFFSET_INDIRECT)
+                       ki->ki_flags |= IOCB_REPORT_OFFSET;
+       }
Damien Le Moal Aug. 5, 2020, 7:35 a.m. UTC | #27
On 2020/07/31 21:51, hch@infradead.org wrote:
> On Fri, Jul 31, 2020 at 10:16:49AM +0000, Damien Le Moal wrote:
>>>
>>> Let's keep semantics and implementation separate.  For the case
>>> where we report the actual offset we need a size limitation and no
>>> short writes.
>>
>> OK. So the name of the flag confused me. The flag name should reflect "Do zone
>> append and report written offset", right ?
> 
> Well, we already have O_APPEND, which is the equivalent to append to
> the write pointer.  The only interesting addition is that we also want
> to report where we wrote.  So I'd rather have RWF_REPORT_OFFSET or so.

That works for me. But that rules out having the same interface for raw block
devices since O_APPEND has no meaning in that case. So for raw block devices, it
will have to be through zonefs. That works for me, and I think it was your idea
all along. Can you confirm please ?

>> But I think I am starting to see the picture you are drawing here:
>> 1) Introduce a fcntl() to get "maximum size for atomic append writes"
>> 2) Introduce an aio flag specifying "Do atomic append write and report written
>> offset"
> 
> I think we just need the 'report written offset flag', in fact even for
> zonefs with the right locking we can handle unlimited write sizes, just
> with lower performance.  E.g.
> 
> 1) check if the write size is larger than the zone append limit
> 
> if no:
> 
>  - take the shared lock  and issue a zone append, done
> 
> if yes:
> 
>  - take the exclusive per-inode (zone) lock and just issue either normal
>    writes or zone append at your choice, relying on the lock to
>    serialize other writers.  For the async case this means we need a
>    lock that can be released in a different context than it was acquired,
>    which is a little ugly but can be done.

Yes, that would be possible. But likely, this will also need calls to
inode_dio_wait() to avoid ending up with a mix of regular write and zone append
writes in flight (which likely would result in the regular write failing as the
zone append writes would go straight to the device without waiting for the zone
write lock like regular writes do).

This all sounds sensible to me. One last point though, specific to zonefs: if the
user opens a zone file with O_APPEND, I do not want to have that necessarily mean
"use zone append". And same for the "RWF_REPORT_OFFSET". The point here is that
both O_APPEND and RWF_REPORT_OFFSET can be used with both regular writes and
zone append writes, but none of them actually clearly specify if the
application/user tolerates writing data to disk in a different order than the
issuing order... So another flag to indicate "atomic out-of-order writes" (==
zone append) ?

Without a new flag, we can only have these cases to enable zone append:

1) No O_APPEND: ignore RWF_REPORT_OFFSET and use regular writes using ->aio_offset

2) O_APPEND without RWF_REPORT_OFFSET: use regular write with file size as
->aio_offset (as done today already), do not report the write offset on completion.

3) O_APPEND with RWF_REPORT_OFFSET: use zone append, implying potentially out of
order writes.

And case (3) is not nice. I would rather prefer something like:

3) O_APPEND with RWF_REPORT_OFFSET: use regular write with file size as
->aio_offset (as done today already), report the write offset on completion.

4) O_APPEND with RWF_ATOMIC_APPEND: use zone append writes, do not report the
write offset on completion.

5) O_APPEND with RWF_ATOMIC_APPEND+RWF_REPORT_OFFSET: use zone append writes,
report the write offset on completion.

RWF_ATOMIC_APPEND could also always imply RWF_REPORT_OFFSET. Any aio with
RWF_ATOMIC_APPEND that is too large would be failed.

Thoughts ?
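
In pseudo-code, the case table above comes down to this (RWF_ATOMIC_APPEND
and RWF_REPORT_OFFSET being the tentative names from this discussion, not
existing ABI):

static void pick_append_mode_sketch(unsigned int file_flags, unsigned int rwf,
				    bool *use_zone_append, bool *report_offset)
{
	if (!(file_flags & O_APPEND)) {
		/* case 1: regular write at ->aio_offset, nothing reported */
		*use_zone_append = false;
		*report_offset = false;
		return;
	}

	/* cases 4 and 5: zone append, out-of-order writes tolerated;
	 * an aio larger than the zone append limit is failed outright */
	*use_zone_append = rwf & RWF_ATOMIC_APPEND;
	/* cases 3 and 5: report the written offset; case 2 reports nothing */
	*report_offset = rwf & RWF_REPORT_OFFSET;
}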
Christoph Hellwig Aug. 14, 2020, 8:14 a.m. UTC | #28
On Wed, Aug 05, 2020 at 07:35:28AM +0000, Damien Le Moal wrote:
> > the write pointer.  The only interesting addition is that we also want
> > to report where we wrote.  So I'd rather have RWF_REPORT_OFFSET or so.
> 
> That works for me. But that rules out having the same interface for raw block
> devices since O_APPEND has no meaning in that case. So for raw block devices, it
> will have to be through zonefs. That works for me, and I think it was your idea
> all along. Can you confirm please ?

Yes.  I don't think raw syscall level access to the zone append
primitive makes sense.  Either use zonefs for a file-like API, or
use the NVMe pass through interface for 100% raw access.

> >  - take the exclusive per-inode (zone) lock and just issue either normal
> >    writes or zone append at your choice, relying on the lock to
> >    serialize other writers.  For the async case this means we need a
> >    lock than can be release in a different context than it was acquired,
> >    which is a little ugly but can be done.
> 
> Yes, that would be possible. But likely, this will also need calls to
> inode_dio_wait() to avoid ending up with a mix of regular write and zone append
> writes in flight (which likely would result in the regular write failing as the
> zone append writes would go straight to the device without waiting for the zone
> write lock like regular writes do).

inode_dio_wait is a really bad implementation of almost a lock.  I've
started some work that I need to finish to just replace it with proper
non-owner rwsems (or even the range locks Dave has been looking into).

> 
> This all sounds sensible to me. One last point though, specific to zonefs: if the
> user opens a zone file with O_APPEND, I do not want to have that necessarily mean
> "use zone append". And same for the "RWF_REPORT_OFFSET". The point here is that
> both O_APPEND and RWF_REPORT_OFFSET can be used with both regular writes and
> zone append writes, but none of them actually clearly specify if the
> application/user tolerates writing data to disk in a different order than the
> issuing order... So another flag to indicate "atomic out-of-order writes" (==
> zone append) ?

O_APPEND pretty much implies out of order, as there is no way for an
application to know which thread wins the race to write the next chunk.
Damien Le Moal Aug. 14, 2020, 8:27 a.m. UTC | #29
On 2020/08/14 17:14, hch@infradead.org wrote:
> On Wed, Aug 05, 2020 at 07:35:28AM +0000, Damien Le Moal wrote:
>>> the write pointer.  The only interesting addition is that we also want
>>> to report where we wrote.  So I'd rather have RWF_REPORT_OFFSET or so.
>>
>> That works for me. But that rules out having the same interface for raw block
>> devices since O_APPEND has no meaning in that case. So for raw block devices, it
>> will have to be through zonefs. That works for me, and I think it was your idea
>> all along. Can you confirm please ?
> 
> Yes.  I don't think raw syscall level access to the zone append
> primitive makes sense.  Either use zonefs for a file-like API, or
> use the NVMe pass through interface for 100% raw access.
> 
>>>  - take the exclusive per-inode (zone) lock and just issue either normal
>>>    writes or zone append at your choice, relying on the lock to
>>>    serialize other writers.  For the async case this means we need a
>>>    lock than can be release in a different context than it was acquired,
>>>    which is a little ugly but can be done.
>>
>> Yes, that would be possible. But likely, this will also need calls to
>> inode_dio_wait() to avoid ending up with a mix of regular write and zone append
>> writes in flight (which likely would result in the regular write failing as the
>> zone append writes would go straight to the device without waiting for the zone
>> write lock like regular writes do).
> 
> inode_dio_wait is a really bad implementation of almost a lock.  I've
> started some work that I need to finish to just replace it with proper
> non-owner rwsems (or even the range locks Dave has been looking into).

OK.

>> This all sounds sensible to me. One last point though, specific to zonefs: if the
>> user opens a zone file with O_APPEND, I do not want to have that necessarily mean
>> "use zone append". And same for the "RWF_REPORT_OFFSET". The point here is that
>> both O_APPEND and RWF_REPORT_OFFSET can be used with both regular writes and
>> zone append writes, but none of them actually clearly specify if the
>> application/user tolerates writing data to disk in a different order than the
>> issuing order... So another flag to indicate "atomic out-of-order writes" (==
>> zone append) ?
> 
> O_APPEND pretty much implies out of order, as there is no way for an
> application to know which thread wins the race to write the next chunk.

Yes and no. If the application threads do not synchronize their calls to
io_submit(), then yes indeed, things can get out of order. But if the
application threads are synchronized, then the offset set for each append AIO
will be in sequence of submission, so the user will not see its writes
completing at different write offsets than these implied offsets.

If O_APPEND is the sole flag that triggers the use of zone append, then we lose
this current implied and predictable positioning of the writes. Even for a
single thread by the way.

Hence my thinking to preserve this, meaning that O_APPEND alone will see zonefs
using regular writes. O_APPEND or RWF_APPEND + RWF_SOME_NICELY_NAMED_FLAG_for_ZA
would trigger the use of zone append, with the implied effect that writes may
not end up in the same order as they are submitted. So the flag name could be:
RWF_RELAXED_ORDER or something like that (I am bad at naming...).

And we also have RWF_REPORT_OFFSET which applies to all cases of append writes,
regardless of the command used.
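
For illustration, a rough libaio sketch of how this could look.
RWF_RELAXED_ORDER and RWF_REPORT_OFFSET are only the names floated in
this thread (placeholder values below, not merged flags), it assumes a
libaio recent enough to expose aio_rw_flags, and it assumes the written
offset would come back through the event's res2 field, as the
ki_complete ret2 plumbing in this series allows:

#include <libaio.h>
#include <linux/fs.h>           /* RWF_APPEND */
#include <stdio.h>

/* hypothetical flags from this thread; placeholder values for the sketch */
#define RWF_RELAXED_ORDER 0
#define RWF_REPORT_OFFSET 0

static int append_one(io_context_t ctx, int fd, void *buf, size_t len)
{
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;

        io_prep_pwrite(&cb, fd, buf, len, 0);   /* offset ignored for append */
        cb.aio_rw_flags = RWF_APPEND | RWF_RELAXED_ORDER | RWF_REPORT_OFFSET;
        if (io_submit(ctx, 1, cbs) != 1)
                return -1;
        if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
                return -1;
        /* ev.res: bytes written; ev.res2: where the append landed */
        printf("wrote %lld bytes at %lld\n", (long long)ev.res,
               (long long)ev.res2);
        return 0;
}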

Does this make sense ?
Christoph Hellwig Aug. 14, 2020, 12:04 p.m. UTC | #30
On Fri, Aug 14, 2020 at 08:27:13AM +0000, Damien Le Moal wrote:
> > 
> > O_APPEND pretty much implies out of order, as there is no way for an
> > application to know which thread wins the race to write the next chunk.
> 
> Yes and no. If the application threads do not synchronize their calls to
> io_submit(), then yes indeed, things can get out of order. But if the
> application threads are synchronized, then the offset set for each append AIO
> will be in sequence of submission, so the user will not see its writes
> completing at different write offsets than these implied offsets.

Nothing guarantees any kind of ordering for two separate io_submit calls.
The kernel may delay one of them for any reason.

Now if you are doing two fully synchronous write calls on an
O_APPEND fd, yes they are serialized.  But using Zone Append won't
change that.
Damien Le Moal Aug. 14, 2020, 12:20 p.m. UTC | #31
On 2020/08/14 21:04, hch@infradead.org wrote:
> On Fri, Aug 14, 2020 at 08:27:13AM +0000, Damien Le Moal wrote:
>>>
>>> O_APPEND pretty much implies out of order, as there is no way for an
>>> application to know which thread wins the race to write the next chunk.
>>
>> Yes and no. If the application threads do not synchronize their calls to
>> io_submit(), then yes indeed, things can get out of order. But if the
>> application threads are synchronized, then the offset set for each append AIO
>> will be in sequence of submission, so the user will not see its writes
>> completing at different write offsets than these implied offsets.
> 
> Nothing guarantees any kind of ordering for two separate io_submit calls.
> The kernel may delay one of them for any reason.

Ah. Yes. The inode locking is at the single aio issuing level, not the io_submit
syscall level... So yes, in the end, the AIO offsets and their execution order
can be anything. I see it now. So O_APPEND implying zone append is fine for zonefs.

> 
> Now if you are doing two fully synchronous write calls on an
> O_APPEND fd, yes they are serialized.  But using Zone Append won't
> change that.

Yep. That zonefs already does.

OK. So I think I will send a writeup of the semantics discussed so far. We also
still need a solution for the io_uring interface for the written-offset report,
which we can then implement.
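
For the consumer side of such a report, a rough sketch against the
cqe->res64 layout proposed in this series (it assumes the application
tracks via user_data which sqes were zone appends, and it reads the
union through the res field):

/* Sketch: for zone-append sqes, this series overlays res/flags with a
 * signed 64-bit value: the written offset (>= 0) or a negative error. */
#include <liburing.h>

static long long reap_zone_append(struct io_uring *ring)
{
        struct io_uring_cqe *cqe;
        long long res64;

        if (io_uring_wait_cqe(ring, &cqe))
                return -1;
        /* only valid if user_data says this sqe was a zone-append write */
        res64 = *(long long *)&cqe->res;
        io_uring_cqe_seen(ring, cqe);
        return res64;   /* written offset, or -errno */
}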
Kanchan Joshi Sept. 7, 2020, 7:01 a.m. UTC | #32
On Fri, Aug 14, 2020 at 1:44 PM hch@infradead.org <hch@infradead.org> wrote:
>
> On Wed, Aug 05, 2020 at 07:35:28AM +0000, Damien Le Moal wrote:
> > > the write pointer.  The only interesting addition is that we also want
> > > to report where we wrote.  So I'd rather have RWF_REPORT_OFFSET or so.
> >
> > That works for me. But that rules out having the same interface for raw block
> > devices since O_APPEND has no meaning in that case. So for raw block devices, it
> > will have to be through zonefs. That works for me, and I think it was your idea
> > all along. Can you confirm please ?
>
> Yes.  I don't think raw syscall level access to the zone append
> primitive makes sense.  Either use zonefs for a file-like API, or
> use the NVMe pass through interface for 100% raw access.

But there are use-cases which benefit from supporting zone-append on
raw block-dev path.
Certain user-space log-structured/cow FS/DB will use the device that
way. Aerospike is one example.
Pass-through is synchronous, and we lose the ability to use io-uring.

For async uring/aio to a block-dev, the file pointer will not be moved to
EOF, but that position was not very important anyway, as with this
interface we expect many async appends outstanding, all with
zone-start.
Do you think RWF_APPEND | RWF_REPORT_OFFSET_DIRECT/INDIRECT is too bad
for direct block-dev? Could you please suggest another way to go about
it?



--
Kanchan
Christoph Hellwig Sept. 8, 2020, 3:18 p.m. UTC | #33
On Mon, Sep 07, 2020 at 12:31:42PM +0530, Kanchan Joshi wrote:
> But there are use-cases which benefit from supporting zone-append on
> raw block-dev path.
> Certain user-space log-structured/cow FS/DB will use the device that
> way. Aerospike is one example.
> Pass-through is synchronous, and we lose the ability to use io-uring.

So use zonefs, which is designed exactly for that use case.
Kanchan Joshi Sept. 24, 2020, 5:19 p.m. UTC | #34
On Tue, Sep 8, 2020 at 8:48 PM hch@infradead.org <hch@infradead.org> wrote:
>
> On Mon, Sep 07, 2020 at 12:31:42PM +0530, Kanchan Joshi wrote:
> > But there are use-cases which benefit from supporting zone-append on
> > raw block-dev path.
> > Certain user-space log-structured/cow FS/DB will use the device that
> > way. Aerospike is one example.
> > Pass-through is synchronous, and we lose the ability to use io-uring.
>
> So use zonefs, which is designed exactly for that use case.

Not specific to zone-append, but in general it may not be good to lock
new features/interfaces to ZoneFS alone, given that direct-block
interface has its own merits.
Mapping one file to one zone is good for some use-cases, but
limiting for others.
Some user-space FS/DBs would be more efficient (less meta, indirection)
with the freedom to decide file-to-zone mapping/placement.
- Rocksdb and those LSM style DBs would map SSTable to zone, but
SSTable file may be too small (initially) and may become too large
(after compaction) for a zone.
- The internal parallelism of a single zone is a design-choice, and
depends on the drive. Writing multiple zones in parallel (striped/raid
way) can give better performance than writing on one. In that case one
would want a file that seamlessly combines multiple zones in a
striped fashion (sketched below).
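
A minimal sketch of the mapping such striping implies, assuming a fixed
stripe unit round-robined across N zones (names illustrative):

/* Map a logical offset in a striped multi-zone "file" to a
 * (zone index, offset within zone) pair; chunk is the stripe unit. */
#include <stdint.h>

struct stripe_map {
        uint64_t chunk;         /* stripe unit, in bytes */
        uint32_t nzones;        /* zones in the stripe set */
};

static void stripe_locate(const struct stripe_map *m, uint64_t logical,
                          uint32_t *zone, uint64_t *zoff)
{
        uint64_t stripe = logical / m->chunk;   /* global stripe number */
        uint64_t row = stripe / m->nzones;      /* full rows before this one */

        *zone = stripe % m->nzones;
        *zoff = row * m->chunk + logical % m->chunk;
}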

Also it seems difficult (compared to block dev) to fit simple-copy TP
in ZoneFS. The new
command needs: one NVMe drive, list of source LBAs and one destination
LBA. In ZoneFS, we would deal with N+1 file-descriptors (N source zone
file, and one destination zone file) for that. While with block
interface, we do not need  more than one file-descriptor representing
the entire device. With more zone-files, we face open/close overhead too.
Damien Le Moal Sept. 25, 2020, 2:52 a.m. UTC | #35
On 2020/09/25 2:20, Kanchan Joshi wrote:
> On Tue, Sep 8, 2020 at 8:48 PM hch@infradead.org <hch@infradead.org> wrote:
>>
>> On Mon, Sep 07, 2020 at 12:31:42PM +0530, Kanchan Joshi wrote:
>>> But there are use-cases which benefit from supporting zone-append on
>>> raw block-dev path.
>>> Certain user-space log-structured/cow FS/DB will use the device that
>>> way. Aerospike is one example.
>>> Pass-through is synchronous, and we lose the ability to use io-uring.
>>
>> So use zonefs, which is designed exactly for that use case.
> 
> Not specific to zone-append, but in general it may not be good to lock
> new features/interfaces to ZoneFS alone, given that direct-block
> interface has its own merits.
> Mapping one file to one zone is good for some use-cases, but
> limiting for others.
> Some user-space FS/DBs would be more efficient (less meta, indirection)
> with the freedom to decide file-to-zone mapping/placement.

There is no metadata in zonefs. One file == one zone and the mapping between
zonefs files and zones is static, determined at mount time simply using report
zones. Zonefs files cannot be renamed nor deleted in any way. Choosing a zonefs
file *is* the same as choosing a zone. Zonefs is *not* a POSIX file system doing
dynamic block allocation to files. The backing storage of files in zonefs is
static and fixed to the zone they represent. The difference between zonefs vs
raw zoned block device is the API that has to be used by the application, that
is, file descriptor representing the entire disk for raw disk vs file descriptor
representing one zone in zonefs. Note that the latter has *a lot* of advantages
over the former: enables O_APPEND use, protects against bugs with user
write-offset mistakes, adds consistency of cached data against zone resets, and more.
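
For reference, a sketch of the two access patterns being compared (paths,
device name and write-pointer handling are illustrative; zonefs exposes
sequential zones as files under seq/ and requires direct, append-only
writes to them):

/* Sketch: one zone via zonefs vs. the same device accessed raw.
 * Error handling omitted. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        void *buf;

        posix_memalign(&buf, 4096, 4096);

        /* zonefs: one fd == one zone, the kernel tracks the WP */
        int zfd = open("/mnt/zonefs/seq/0", O_WRONLY | O_APPEND | O_DIRECT);
        write(zfd, buf, 4096);

        /* raw block device: one fd == whole disk, the user tracks the WP */
        off_t wp = 0;   /* would come from a BLKREPORTZONE ioctl */
        int bfd = open("/dev/nvme0n1", O_WRONLY | O_DIRECT);
        pwrite(bfd, buf, 4096, wp);
        return 0;
}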

> - Rocksdb and those LSM style DBs would map SSTable to zone, but
> SSTable file may be too small (initially) and may become too large
> (after compaction) for a zone.

You are contradicting yourself here. If a SSTable is mapped to a zone, then its
size cannot exceed the zone capacity, regardless of the interface used to access
the zones. And except for L0 tables which can be smaller (and are in memory
anyway), all levels tables have the same maximum size, which for zoned drives
must be the zone capacity. In any case, solving any problem in this area does
not depend in any way on zonefs vs raw disk interface. The implementation will
differ depending on the chosen interface, but what needs to be done to map
SSTables to zones is the same in both cases.

> - The internal parallelism of a single zone is a design-choice, and
> depends on the drive. Writing multiple zones in parallel (striped/raid
> way) can give better performance than writing on one. In that case one
> would want a file that seamlessly combines multiple zones in a
> striped fashion.

Then write a FS for that... Or have a library do it in user space. For the
library case, the implementation will differ for zonefs vs raw disk due to the
different API (regular file vs block device file), but the principles to follow
for striping zones into a single storage object remain the same.

> Also it seems difficult (compared to block dev) to fit simple-copy TP
> in ZoneFS. The new
> command needs: one NVMe drive, list of source LBAs and one destination
> LBA. In ZoneFS, we would deal with N+1 file-descriptors (N source zone
> file, and one destination zone file) for that. While with block
> interface, we do not need  more than one file-descriptor representing
> the entire device. With more zone-files, we face open/close overhead too.

Are you expecting simple-copy to allow requests that are not zone aligned ? I do
not think that will ever happen. Otherwise, the gotcha cases for it would be far
too numerous. Simple-copy is essentially an optimized regular write command.
Similarly to that command, it will not allow copies over zone boundaries and
will need the destination LBA to be aligned to the destination zone WP. I have
not checked the TP though and given the NVMe NDA, I will stop the discussion here.

sendfile() could be used as the interface for simple-copy. Implementing that in
zonefs would not be that hard. What is your plan for a simple-copy interface for
the raw block device ? An ioctl ? sendfile() too ? As with any other user-level
API, we should not be restricted to a particular device type if we can avoid it,
so in-kernel emulation of the feature is needed for devices that do not have
simple-copy or scsi extended copy. sendfile() seems to me like the best choice
since all of that is already implemented there.

As for the open()/close() overhead for zonefs, maybe some use cases will suffer
from it, but my tests with LevelDB+zonefs did not show any significant
difference. zonefs open()/close() operations are way faster than for a regular
file system since there is no metadata and all inodes always exist in-memory.
And zonefs now supports MAR/MOR limits for O_WRONLY open(). That can simplify
things for the user.
Kanchan Joshi Sept. 28, 2020, 6:58 p.m. UTC | #36
On Fri, Sep 25, 2020 at 8:22 AM Damien Le Moal <Damien.LeMoal@wdc.com> wrote:
>
> On 2020/09/25 2:20, Kanchan Joshi wrote:
> > On Tue, Sep 8, 2020 at 8:48 PM hch@infradead.org <hch@infradead.org> wrote:
> >>
> >> On Mon, Sep 07, 2020 at 12:31:42PM +0530, Kanchan Joshi wrote:
> >>> But there are use-cases which benefit from supporting zone-append on
> >>> raw block-dev path.
> >>> Certain user-space log-structured/cow FS/DB will use the device that
> >>> way. Aerospike is one example.
> >>> Pass-through is synchronous, and we lose the ability to use io-uring.
> >>
> >> So use zonefs, which is designed exactly for that use case.
> >
> > Not specific to zone-append, but in general it may not be good to lock
> > new features/interfaces to ZoneFS alone, given that direct-block
> > interface has its own merits.
> > Mapping one file to one zone is good for some use-cases, but
> > limiting for others.
> > Some user-space FS/DBs would be more efficient (less meta, indirection)
> > with the freedom to decide file-to-zone mapping/placement.
>
> There is no metadata in zonefs. One file == one zone and the mapping between
> zonefs files and zones is static, determined at mount time simply using report
> zones. Zonefs files cannot be renamed nor deleted in any way. Choosing a zonefs
> file *is* the same as choosing a zone. Zonefs is *not* a POSIX file system doing
> dynamic block allocation to files. The backing storage of files in zonefs is
> static and fixed to the zone they represent. The difference between zonefs vs
> raw zoned block device is the API that has to be used by the application, that
> is, file descriptor representing the entire disk for raw disk vs file descriptor
> representing one zone in zonefs. Note that the latter has *a lot* of advantages
> over the former: enables O_APPEND use, protects against bugs with user
> write-offset mistakes, adds consistency of cached data against zone resets, and more.
>
> > - Rocksdb and those LSM style DBs would map SSTable to zone, but
> > SSTable file may be too small (initially) and may become too large
> > (after compaction) for a zone.
>
> You are contradicting yourself here. If a SSTable is mapped to a zone, then its
> size cannot exceed the zone capacity, regardless of the interface used to access
> the zones. And except for L0 tables which can be smaller (and are in memory
> anyway), all levels tables have the same maximum size, which for zoned drives
> must be the zone capacity. In any case, solving any problem in this area does
> not depend in any way on zonefs vs raw disk interface. The implementation will
> differ depending on the chosen interface, but what needs to be done to map
> SSTables to zones is the same in both cases.
>
> > - The internal parallelism of a single zone is a design-choice, and
> > depends on the drive. Writing multiple zones in parallel (striped/raid
> > way) can give better performance than writing on one. In that case one
> > would want a file that seamlessly combines multiple zones in a
> > striped fashion.
>
> Then write a FS for that... Or have a library do it in user space. For the
> library case, the implementation will differ for zonefs vs raw disk due to the
> different API (regular file vs block device file), but the principles to follow
> for striping zones into a single storage object remain the same.

ZoneFS is better when it is about dealing at single-zone granularity,
and direct-block seems better when it is about grouping zones (in
various ways including striping). The latter case (i.e. grouping
zones) requires more involved mapping, and I agree that it can be left
to application (for both ZoneFS and raw-block backends).
But when an application tries that on ZoneFS, apart from mapping there
would be additional cost of indirection/fd-management (due to
file-on-files).
And if new features (zone-append for now) are available only on
ZoneFS, it forces the application to use something that may not be
optimal for its needs.

Coming to the original problem of plumbing append - I think divergence
started because RWF_APPEND did not have any meaning for block device.
Did I miss any other reason?
How about write-anywhere semantics (RWF_RELAXED_WRITE or
RWF_ANONYMOUS_WRITE flag) on block-dev?
Zone-append works a lot like write-anywhere on block-dev (or on any
other file that combines multiple-zones, in non-sequential fashion).

> > Also it seems difficult (compared to block dev) to fit simple-copy TP
> > in ZoneFS. The new
> > command needs: one NVMe drive, list of source LBAs and one destination
> > LBA. In ZoneFS, we would deal with N+1 file-descriptors (N source zone
> > file, and one destination zone file) for that. While with block
> > interface, we do not need  more than one file-descriptor representing
> > the entire device. With more zone-files, we face open/close overhead too.
>
> Are you expecting simple-copy to allow requests that are not zone aligned ? I do
> not think that will ever happen. Otherwise, the gotcha cases for it would be far
> too numerous. Simple-copy is essentially an optimized regular write command.
> Similarly to that command, it will not allow copies over zone boundaries and
> will need the destination LBA to be aligned to the destination zone WP. I have
> not checked the TP though and given the NVMe NDA, I will stop the discussion here.

TP is ratified, if that is the problem you are referring to.

> sendfile() could be used as the interface for simple-copy. Implementing that in
> zonefs would not be that hard. What is your plan for a simple-copy interface for
> the raw block device ? An ioctl ? sendfile() too ? As with any other user-level
> API, we should not be restricted to a particular device type if we can avoid it,
> so in-kernel emulation of the feature is needed for devices that do not have
> simple-copy or scsi extended copy. sendfile() seems to me like the best choice
> since all of that is already implemented there.

At this moment, an ioctl for sync and io-uring for async. sendfile() and
copy_file_range() take two fds... with that we can represent a copy
from one source zone to another zone.
But it does not fit a larger copy (from N source zones to
one destination zone).
Not sure if I am clear, perhaps sending RFC would be better for
discussion on simple-copy.
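
To make the two-fd limitation concrete, a sketch using copy_file_range()
between zone files (paths illustrative; partial copies not handled):

/* One source zone -> one destination zone is expressible today;
 * N sources -> one destination needs N calls, not one command. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>

static int copy_zone(const char *src, int out, loff_t *dst_off, size_t len)
{
        int in = open(src, O_RDONLY);
        loff_t off_in = 0;
        ssize_t ret = copy_file_range(in, &off_in, out, dst_off, len, 0);

        close(in);
        return ret < 0 ? -1 : 0;
}

static int gather_zones(const char **src, size_t *len, int n, const char *dst)
{
        int out = open(dst, O_WRONLY);
        loff_t dst_off = 0;     /* advances as each source lands */
        int i;

        for (i = 0; i < n; i++)
                if (copy_zone(src[i], out, &dst_off, len[i]))
                        return -1;
        close(out);
        return 0;
}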

> As for the open()/close() overhead for zonefs, maybe some use cases will suffer
> from it, but my tests with LevelDB+zonefs did not show any significant
> difference. zonefs open()/close() operations are way faster than for a regular
> file system since there is no metadata and all inodes always exist in-memory.
> And zonefs now supports MAR/MOR limits for O_WRONLY open(). That can simplify
> things for the user.
>
>
> --
> Damien Le Moal
> Western Digital Research
Damien Le Moal Sept. 29, 2020, 1:24 a.m. UTC | #37
On 2020/09/29 3:58, Kanchan Joshi wrote:
[...]
> ZoneFS is better when it is about dealing at single-zone granularity,
> and direct-block seems better when it is about grouping zones (in
> various ways including striping). The latter case (i.e. grouping
> zones) requires more involved mapping, and I agree that it can be left
> to application (for both ZoneFS and raw-block backends).
> But when an application tries that on ZoneFS, apart from mapping there
> would be additional cost of indirection/fd-management (due to
> file-on-files).

There is no indirection in zonefs. fd-to-struct file/inode conversion is very
fast and happens for every system call anyway, regardless of what the fd
represents. So I really do not understand what your worry is here. If you are
worried about overhead/performance, then please show numbers. If something is
wrong, we can work on fixing it.

> And if new features (zone-append for now) are available only on
> ZoneFS, it forces the application to use something that may not be
> optimal for its needs.

"may" is not enough to convince me...

> Coming to the original problem of plumbing append - I think divergence
> started because RWF_APPEND did not have any meaning for block device.
> Did I miss any other reason?

Correct.

> How about write-anywhere semantics (RWF_RELAXED_WRITE or
> RWF_ANONYMOUS_WRITE flag) on block-dev?

"write-anywhere" ? What do you mean ? That is not possible on zoned devices,
even with zone append, since you at least need to guarantee that zones have
enough unwritten space to accept an append command.

> Zone-append works a lot like write-anywhere on block-dev (or on any
> other file that combines multiple-zones, in non-sequential fashion).

That is an over-simplification that is not helpful at all. Zone append is not
"write anywhere" at all. And "write anywhere" is not a concept that exist on
regular block devices anyway. Writes only go to the offset that the user
decided, through lseek(), pwrite() or aio->aio_offset. It is not like the block
layer decides where the writes land. The same constraint applies to zone append:
the user decide the target zone. That is not "anywhere". Please be precise with
wording and implied/desired semantic. Narrow down the scope of your concept
names for clarity.

And talking about "file that combines multiple-zones" would mean that we are now
back in FS land, not raw block device file accesses anymore. So which one are we
talking about ? It looks like you are confusing what the application does and
how it uses whatever usable interface to the device with what that interface
actually is. It is very confusing.

>>> Also it seems difficult (compared to block dev) to fit simple-copy TP
>>> in ZoneFS. The new
>>> command needs: one NVMe drive, list of source LBAs and one destination
>>> LBA. In ZoneFS, we would deal with N+1 file-descriptors (N source zone
>>> file, and one destination zone file) for that. While with block
>>> interface, we do not need  more than one file-descriptor representing
>>> the entire device. With more zone-files, we face open/close overhead too.
>>
>> Are you expecting simple-copy to allow requests that are not zone aligned ? I do
>> not think that will ever happen. Otherwise, the gotcha cases for it would be far
>> too numerous. Simple-copy is essentially an optimized regular write command.
>> Similarly to that command, it will not allow copies over zone boundaries and
>> will need the destination LBA to be aligned to the destination zone WP. I have
>> not checked the TP though and given the NVMe NDA, I will stop the discussion here.
> 
> TP is ratified, if that is the problem you are referring to.

Ah. Yes. Got confused with ZRWA. Simple-copy is a different story anyway. Let's
not mix it into zone append user interface please.

> 
>> sendfile() could be used as the interface for simple-copy. Implementing that in
>> zonefs would not be that hard. What is your plan for a simple-copy interface for
>> the raw block device ? An ioctl ? sendfile() too ? As with any other user-level
>> API, we should not be restricted to a particular device type if we can avoid it,
>> so in-kernel emulation of the feature is needed for devices that do not have
>> simple-copy or scsi extended copy. sendfile() seems to me like the best choice
>> since all of that is already implemented there.
> 
> At this moment, an ioctl for sync and io-uring for async. sendfile() and
> copy_file_range() take two fds... with that we can represent a copy
> from one source zone to another zone.
> But it does not fit a larger copy (from N source zones to
> one destination zone).

nvme passthrough ? If that does not fit your use case, then think of an
interface, its definition/semantic and propose it. But again, use a different
thread. This is mixing up zone-append and simple copy, which I do not think are
directly related.

> Not sure if I am clear, perhaps sending RFC would be better for
> discussion on simple-copy.

Separate this discussion from zone append please. Mixing up 2 problems in one
thread is not helpful to make progress.
Kanchan Joshi Sept. 29, 2020, 6:49 p.m. UTC | #38
On Tue, Sep 29, 2020 at 6:54 AM Damien Le Moal <Damien.LeMoal@wdc.com> wrote:
>
> On 2020/09/29 3:58, Kanchan Joshi wrote:
> [...]
> > ZoneFS is better when it is about dealing at single-zone granularity,
> > and direct-block seems better when it is about grouping zones (in
> > various ways including striping). The latter case (i.e. grouping
> > zones) requires more involved mapping, and I agree that it can be left
> > to application (for both ZoneFS and raw-block backends).
> > But when an application tries that on ZoneFS, apart from mapping there
> > would be additional cost of indirection/fd-management (due to
> > file-on-files).
>
> There is no indirection in zonefs. fd-to-struct file/inode conversion is very
> fast and happens for every system call anyway, regardless of what the fd
> represents. So I really do not understand what your worry is here.

File-over-files... if the application (user-space FS/DB) has to place
a file abstraction over zonefs again, to group/map the zones in a
different manner (larger files, striped mapping, etc.).
Imagine a file over, say, 4 zones... with a zonefs backend, the application
will invoke the kernel at least 4 times to open the fds. With a raw-block
backend, these system calls won't be required in the first place.

> If you are
> worried about overhead/performance, then please show numbers. If something is
> wrong, we can work on fixing it.
>
> > And if new features (zone-append for now) are available only on
> > ZoneFS, it forces the application to use something that may not be
> > optimal for its needs.
>
> "may" is not enough to convince me...
>
> > Coming to the original problem of plumbing append - I think divergence
> > started because RWF_APPEND did not have any meaning for block device.
> > Did I miss any other reason?
>
> Correct.
>
> > How about write-anywhere semantics (RWF_RELAXED_WRITE or
> > RWF_ANONYMOUS_WRITE flag) on block-dev?
>
> "write-anywhere" ? What do you mean ? That is not possible on zoned devices,
> even with zone append, since you at least need to guarantee that zones have
> enough unwritten space to accept an append command.
>
> > Zone-append works a lot like write-anywhere on block-dev (or on any
> > other file that combines multiple-zones, in non-sequential fashion).
>
> That is an over-simplification that is not helpful at all. Zone append is not
> "write anywhere" at all. And "write anywhere" is not a concept that exist on
> regular block devices anyway. Writes only go to the offset that the user
> decided, through lseek(), pwrite() or aio->aio_offset. It is not like the block
> layer decides where the writes land. The same constraint applies to zone append:
> the user decide the target zone. That is not "anywhere". Please be precise with
> wording and implied/desired semantic. Narrow down the scope of your concept
> names for clarity.

This -
<start>
Issue a write on offset X with no flag: it happens at X.
Issue a write on offset X with the "anywhere" flag: it happens
anywhere... and the application comes to know where on completion.
</start>
The above is fairly generic as far as the high-level interface is concerned.
The returned offset can be the file pointer, the supplied offset, or something
completely different. If the anywhere-flag is passed, the caller should
not assume anything and must explicitly check where the write happened.
The "anywhere" part can be decided by the component that implements
the interface.
For a zoned block dev, this "anywhere" can come from issuing zone-append
underneath. Other components are free to implement "anywhere" in
another way, or do nothing; in that case the write continues to happen
at X.

For zoned raw-block, we have an address space that contains N
zones placed sequentially.
Write on offset O1 with the anywhere flag => O1 is rounded down to
zone-start (say Z1) by the direct-io code, zone-append is issued on Z1,
and completion-offset O2 is returned.
Write-anywhere on another offset O3 of the address space => zone-start
could be Z2, and completion-offset O4 is returned.
The application picks completion offsets O2 and O4 and goes about its
business, not needing to know about Z1 or Z2.
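
In code, the proposed submission-side behaviour amounts to something like
this (a sketch of the semantics only; zone_append() is a hypothetical
stand-in, and a power-of-two zone size is assumed for the rounding):

/* The user-supplied offset only selects the zone; the device decides
 * the final write location. */
#include <stdint.h>

/* hypothetical stand-in: append at the zone's write pointer and
 * return the offset actually written */
static uint64_t zone_append(uint64_t zone_start)
{
        static uint64_t wp;             /* stand-in for the device WP */
        uint64_t where = zone_start + wp;

        wp += 4096;
        return where;
}

static uint64_t anywhere_write(uint64_t o1, uint64_t zone_size)
{
        uint64_t z1 = o1 & ~(zone_size - 1);    /* round down to zone start */

        return zone_append(z1); /* caller must use the returned offset */
}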

> And talking about "file that combines multiple-zones" would mean that we are now
> back in FS land, not raw block device file accesses anymore. So which one are we
> talking about ?

About user-space FS/DB/SDS using zoned-storage, aiming at optimized placement.
In all this discussion, I have been referring to ZoneFS and Raw-block
as backends for a higher-level abstraction residing in user-space.

> It looks like you are confusing what the application does and
> how it uses whatever usable interface to the device with what that interface
> actually is. It is very confusing.
>
> >>> Also it seems difficult (compared to block dev) to fit simple-copy TP
> >>> in ZoneFS. The new
> >>> command needs: one NVMe drive, list of source LBAs and one destination
> >>> LBA. In ZoneFS, we would deal with N+1 file-descriptors (N source zone
> >>> file, and one destination zone file) for that. While with block
> >>> interface, we do not need  more than one file-descriptor representing
> >>> the entire device. With more zone-files, we face open/close overhead too.
> >>
> >> Are you expecting simple-copy to allow requests that are not zone aligned ? I do
> >> not think that will ever happen. Otherwise, the gotcha cases for it would be far
> >> too numerous. Simple-copy is essentially an optimized regular write command.
> >> Similarly to that command, it will not allow copies over zone boundaries and
> >> will need the destination LBA to be aligned to the destination zone WP. I have
> >> not checked the TP though and given the NVMe NDA, I will stop the discussion here.
> >
> > TP is ratified, if that is the problem you are referring to.
>
> Ah. Yes. Got confused with ZRWA. Simple-copy is a different story anyway. Let's
> not mix it into zone append user interface please.
>
> >
> >> sendfile() could be used as the interface for simple-copy. Implementing that in
> >> zonefs would not be that hard. What is your plan for a simple-copy interface for
> >> the raw block device ? An ioctl ? sendfile() too ? As with any other user-level
> >> API, we should not be restricted to a particular device type if we can avoid it,
> >> so in-kernel emulation of the feature is needed for devices that do not have
> >> simple-copy or scsi extended copy. sendfile() seems to me like the best choice
> >> since all of that is already implemented there.
> >
> > At this moment, an ioctl for sync and io-uring for async. sendfile() and
> > copy_file_range() take two fds... with that we can represent a copy
> > from one source zone to another zone.
> > But it does not fit a larger copy (from N source zones to
> > one destination zone).
>
> nvme passthrough ? If that does not fit your use case, then think of an
> interface, its definition/semantic and propose it. But again, use a different
> thread. This is mixing up zone-append and simple copy, which I do not think are
> directly related.
>
> > Not sure if I am clear, perhaps sending RFC would be better for
> > discussion on simple-copy.
>
> Separate this discussion from zone append please. Mixing up 2 problems in one
> thread is not helpful to make progress.
Fine.
Luis Chamberlain March 2, 2022, 8:43 p.m. UTC | #39
On Tue, Sep 08, 2020 at 04:18:01PM +0100, hch@infradead.org wrote:
> On Mon, Sep 07, 2020 at 12:31:42PM +0530, Kanchan Joshi wrote:
> > But there are use-cases which benefit from supporting zone-append on
> > raw block-dev path.
> > Certain user-space log-structured/cow FS/DB will use the device that
> > way. Aerospike is one example.
> > Pass-through is synchronous, and we lose the ability to use io-uring.
> 
> So use zonefs, which is designed exactly for that use case.

Using zonefs to test append alone can introduce a slight VFS overhead
if we want to do something such as just testing a hot path
with append and the block layer. If we want to live with that, that's
fine!

Just saying.

  Luis
Luis Chamberlain March 2, 2022, 8:47 p.m. UTC | #40
On Fri, Jul 31, 2020 at 02:08:02PM +0100, hch@infradead.org wrote:
> And FYI, this is what I'd do for a hacky aio-only prototype (untested):

So... are we OK with an aio-only approach (instead of using io-uring)
for raw access to append?

  Luis
Luis Chamberlain March 2, 2022, 8:51 p.m. UTC | #41
On Fri, Jul 31, 2020 at 07:45:26AM +0100, hch@infradead.org wrote:
> On Fri, Jul 31, 2020 at 06:42:10AM +0000, Damien Le Moal wrote:
> > > - We may not be able to use RWF_APPEND, and need exposing a new
> > > type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this
> > > sounds outrageous, but is it OK to have uring-only flag which can be
> > > combined with RWF_APPEND?
> > 
> > Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for
> > raw block device accesses. We could certainly define a meaning for these in the
> > context of zoned block devices.
> 
> We can't just add a meaning for O_APPEND on block devices now,

Makes sense.

Is a new system call for nameless writes called for instead, then?
Then there is no baggage. Or is this completely stupid?

> as it was previously silently ignored.  I also really don't think any
> of these semantics even fit the block device to start with.  If you
> want to work on raw zones use zonefs, that's what is exists for.

Using zonefs adds a slight VFS overhead. Fine if we want to live with
that, but I have a feeling that if we want to do something like just testing
hot paths alone, to compare apples to apples, we'd want something more
fine-grained.

  Luis
diff mbox series

Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7809ab2..6510cf5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -401,7 +401,14 @@  struct io_rw {
 	/* NOTE: kiocb has the file as the first member, so don't do it here */
 	struct kiocb			kiocb;
 	u64				addr;
-	u64				len;
+	union {
+		/*
+		 * len is used only during submission.
+		 * append_offset is used only during completion.
+		 */
+		u64			len;
+		u64			append_offset;
+	};
 };
 
 struct io_connect {
@@ -541,6 +548,7 @@  enum {
 	REQ_F_NO_FILE_TABLE_BIT,
 	REQ_F_QUEUE_TIMEOUT_BIT,
 	REQ_F_WORK_INITIALIZED_BIT,
+	REQ_F_ZONE_APPEND_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -598,6 +606,8 @@  enum {
 	REQ_F_QUEUE_TIMEOUT	= BIT(REQ_F_QUEUE_TIMEOUT_BIT),
 	/* io_wq_work is initialized */
 	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
+	/* to return zone append offset */
+	REQ_F_ZONE_APPEND = BIT(REQ_F_ZONE_APPEND_BIT),
 };
 
 struct async_poll {
@@ -1244,8 +1254,15 @@  static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		req->flags &= ~REQ_F_OVERFLOW;
 		if (cqe) {
 			WRITE_ONCE(cqe->user_data, req->user_data);
-			WRITE_ONCE(cqe->res, req->result);
-			WRITE_ONCE(cqe->flags, req->cflags);
+			if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
+				if (likely(req->result > 0))
+					WRITE_ONCE(cqe->res64, req->rw.append_offset);
+				else
+					WRITE_ONCE(cqe->res64, req->result);
+			} else {
+				WRITE_ONCE(cqe->res, req->result);
+				WRITE_ONCE(cqe->flags, req->cflags);
+			}
 		} else {
 			WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1284,8 +1301,15 @@  static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 	cqe = io_get_cqring(ctx);
 	if (likely(cqe)) {
 		WRITE_ONCE(cqe->user_data, req->user_data);
-		WRITE_ONCE(cqe->res, res);
-		WRITE_ONCE(cqe->flags, cflags);
+		if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
+			if (likely(res > 0))
+				WRITE_ONCE(cqe->res64, req->rw.append_offset);
+			else
+				WRITE_ONCE(cqe->res64, res);
+		} else {
+			WRITE_ONCE(cqe->res, res);
+			WRITE_ONCE(cqe->flags, cflags);
+		}
 	} else if (ctx->cq_overflow_flushed) {
 		WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1943,7 +1967,7 @@  static inline void req_set_fail_links(struct io_kiocb *req)
 		req->flags |= REQ_F_FAIL_LINK;
 }
 
-static void io_complete_rw_common(struct kiocb *kiocb, long res)
+static void io_complete_rw_common(struct kiocb *kiocb, long res, long long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 	int cflags = 0;
@@ -1955,6 +1979,9 @@  static void io_complete_rw_common(struct kiocb *kiocb, long res)
 		req_set_fail_links(req);
 	if (req->flags & REQ_F_BUFFER_SELECTED)
 		cflags = io_put_kbuf(req);
+	if (req->flags & REQ_F_ZONE_APPEND)
+		req->rw.append_offset = res2;
+
 	__io_cqring_add_event(req, res, cflags);
 }
 
@@ -1962,7 +1989,7 @@  static void io_complete_rw(struct kiocb *kiocb, long res, long long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
-	io_complete_rw_common(kiocb, res);
+	io_complete_rw_common(kiocb, res, res2);
 	io_put_req(req);
 }
 
@@ -1976,8 +2003,11 @@  static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long long res2)
 	if (res != req->result)
 		req_set_fail_links(req);
 	req->result = res;
-	if (res != -EAGAIN)
+	if (res != -EAGAIN) {
+		if (req->flags & REQ_F_ZONE_APPEND)
+			req->rw.append_offset =  res2;
 		WRITE_ONCE(req->iopoll_completed, 1);
+	}
 }
 
 /*
@@ -2739,6 +2769,9 @@  static int io_write(struct io_kiocb *req, bool force_nonblock)
 						SB_FREEZE_WRITE);
 		}
 		kiocb->ki_flags |= IOCB_WRITE;
+		/* zone-append requires few extra steps during completion */
+		if (kiocb->ki_flags & IOCB_ZONE_APPEND)
+			req->flags |= REQ_F_ZONE_APPEND;
 
 		if (!force_nonblock)
 			current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 92c2269..2580d93 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -156,8 +156,13 @@  enum {
  */
 struct io_uring_cqe {
 	__u64	user_data;	/* sqe->data submission passed back */
-	__s32	res;		/* result code for this event */
-	__u32	flags;
+	union {
+		struct {
+			__s32	res;	/* result code for this event */
+			__u32	flags;
+		};
+		__s64	res64;	/* appending offset for zone append */
+	};
 };
 
 /*