Message ID | 1595605762-17010-7-git-send-email-joshi.k@samsung.com
---|---
State | New, archived
Series | [v4,1/6] fs: introduce FMODE_ZONE_APPEND and IOCB_ZONE_APPEND
On 7/24/20 9:49 AM, Kanchan Joshi wrote: > diff --git a/fs/io_uring.c b/fs/io_uring.c > index 7809ab2..6510cf5 100644 > --- a/fs/io_uring.c > +++ b/fs/io_uring.c > @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) > cqe = io_get_cqring(ctx); > if (likely(cqe)) { > WRITE_ONCE(cqe->user_data, req->user_data); > - WRITE_ONCE(cqe->res, res); > - WRITE_ONCE(cqe->flags, cflags); > + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { > + if (likely(res > 0)) > + WRITE_ONCE(cqe->res64, req->rw.append_offset); > + else > + WRITE_ONCE(cqe->res64, res); > + } else { > + WRITE_ONCE(cqe->res, res); > + WRITE_ONCE(cqe->flags, cflags); > + } This would be nice to keep out of the fast path, if possible. > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index 92c2269..2580d93 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -156,8 +156,13 @@ enum { > */ > struct io_uring_cqe { > __u64 user_data; /* sqe->data submission passed back */ > - __s32 res; /* result code for this event */ > - __u32 flags; > + union { > + struct { > + __s32 res; /* result code for this event */ > + __u32 flags; > + }; > + __s64 res64; /* appending offset for zone append */ > + }; > }; Is this a compatible change, both for now but also going forward? You could randomly have IORING_CQE_F_BUFFER set, or any other future flags. Layout would also be different between big and little endian, so not even that easy to set aside a flag for this. But even if that was done, we'd still have this weird API where liburing or the app would need to distinguish this cqe from all others based on... the user_data? Hence liburing can't do it, only the app would be able to. Just seems like a hack to me.
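To make the compatibility concern concrete, here is a minimal consumer-side sketch against the proposed layout. Nothing in the CQE itself says which union member is valid, so the demux has to key off submission-time bookkeeping; is_append_request() and the handle_*/record_* helpers are hypothetical application code, not part of any posted interface:

```c
#include <linux/types.h>
#include <stdbool.h>

/* Proposed (v4) CQE layout, as in the quoted hunk. */
struct io_uring_cqe {
	__u64 user_data;	/* sqe->data submission passed back */
	union {
		struct {
			__s32 res;	/* result code for this event */
			__u32 flags;
		};
		__s64 res64;	/* written offset for zone append */
	};
};

/* hypothetical application helpers, assumed defined elsewhere */
bool is_append_request(__u64 user_data);
void handle_error(int err);
void record_append_offset(__u64 off);
void handle_result(__s32 res, __u32 flags);

/* Hypothetical app-side demux: only the submitter knows whether
 * res64 or res/flags is valid, based on what it put in user_data. */
static void handle_cqe(struct io_uring_cqe *cqe)
{
	if (is_append_request(cqe->user_data)) {
		if (cqe->res64 < 0)
			handle_error((int)cqe->res64);	/* -errno */
		else
			record_append_offset((__u64)cqe->res64);
	} else {
		/* Normal CQE: flags may carry IORING_CQE_F_BUFFER or
		 * any future flag, i.e. bits that alias res64 above. */
		handle_result(cqe->res, cqe->flags);
	}
}
```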
On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote: > > On 7/24/20 9:49 AM, Kanchan Joshi wrote: > > diff --git a/fs/io_uring.c b/fs/io_uring.c > > index 7809ab2..6510cf5 100644 > > --- a/fs/io_uring.c > > +++ b/fs/io_uring.c > > @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) > > cqe = io_get_cqring(ctx); > > if (likely(cqe)) { > > WRITE_ONCE(cqe->user_data, req->user_data); > > - WRITE_ONCE(cqe->res, res); > > - WRITE_ONCE(cqe->flags, cflags); > > + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { > > + if (likely(res > 0)) > > + WRITE_ONCE(cqe->res64, req->rw.append_offset); > > + else > > + WRITE_ONCE(cqe->res64, res); > > + } else { > > + WRITE_ONCE(cqe->res, res); > > + WRITE_ONCE(cqe->flags, cflags); > > + } > > This would be nice to keep out of the fast path, if possible. I was thinking of keeping a function-pointer (in io_kiocb) during submission. That would have avoided this check... but argument count differs, so it did not add up. > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > > index 92c2269..2580d93 100644 > > --- a/include/uapi/linux/io_uring.h > > +++ b/include/uapi/linux/io_uring.h > > @@ -156,8 +156,13 @@ enum { > > */ > > struct io_uring_cqe { > > __u64 user_data; /* sqe->data submission passed back */ > > - __s32 res; /* result code for this event */ > > - __u32 flags; > > + union { > > + struct { > > + __s32 res; /* result code for this event */ > > + __u32 flags; > > + }; > > + __s64 res64; /* appending offset for zone append */ > > + }; > > }; > > Is this a compatible change, both for now but also going forward? You > could randomly have IORING_CQE_F_BUFFER set, or any other future flags. Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not used/set for write currently, so it looked compatible at this point. Yes, no room for future flags for this operation. Do you see any other way to enable this support in io-uring? > Layout would also be different between big and little endian, so not > even that easy to set aside a flag for this. But even if that was done, > we'd still have this weird API where liburing or the app would need to > distinguish this cqe from all others based on... the user_data? Hence > liburing can't do it, only the app would be able to. > > Just seems like a hack to me. Yes, only user_data to distinguish. Do liburing helpers need to look at cqe->res (and decide something) before returning the cqe to the application? I see that happening at one place, but not sure when it would hit the LIBURING_UDATA_TIMEOUT condition. __io_uring_peek_cqe() { do { io_uring_for_each_cqe(ring, head, cqe) break; if (cqe) { if (cqe->user_data == LIBURING_UDATA_TIMEOUT) { if (cqe->res < 0) err = cqe->res; io_uring_cq_advance(ring, 1); if (!err) continue; cqe = NULL; } } break; } while (1); }
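For readability, the same liburing snippet re-indented and lightly commented; the signature and the trailing return are reconstructed from the liburing source of that era rather than quoted:

```c
static int __io_uring_peek_cqe(struct io_uring *ring,
			       struct io_uring_cqe **cqe_ptr)
{
	struct io_uring_cqe *cqe;
	unsigned head;
	int err = 0;

	do {
		/* grab the first available CQE, if any */
		io_uring_for_each_cqe(ring, head, cqe)
			break;
		if (cqe) {
			/* internal timeout CQEs are consumed here and
			 * never handed back to the application */
			if (cqe->user_data == LIBURING_UDATA_TIMEOUT) {
				if (cqe->res < 0)
					err = cqe->res;
				io_uring_cq_advance(ring, 1);
				if (!err)
					continue;
				cqe = NULL;
			}
		}
		break;
	} while (1);

	*cqe_ptr = cqe;
	return err;
}
```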
On 7/27/20 1:16 PM, Kanchan Joshi wrote: > On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote: >> >> On 7/24/20 9:49 AM, Kanchan Joshi wrote: >>> diff --git a/fs/io_uring.c b/fs/io_uring.c >>> index 7809ab2..6510cf5 100644 >>> --- a/fs/io_uring.c >>> +++ b/fs/io_uring.c >>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) >>> cqe = io_get_cqring(ctx); >>> if (likely(cqe)) { >>> WRITE_ONCE(cqe->user_data, req->user_data); >>> - WRITE_ONCE(cqe->res, res); >>> - WRITE_ONCE(cqe->flags, cflags); >>> + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { >>> + if (likely(res > 0)) >>> + WRITE_ONCE(cqe->res64, req->rw.append_offset); >>> + else >>> + WRITE_ONCE(cqe->res64, res); >>> + } else { >>> + WRITE_ONCE(cqe->res, res); >>> + WRITE_ONCE(cqe->flags, cflags); >>> + } >> >> This would be nice to keep out of the fast path, if possible. > > I was thinking of keeping a function-pointer (in io_kiocb) during > submission. That would have avoided this check......but argument count > differs, so it did not add up. But that'd grow the io_kiocb just for this use case, which is arguably even worse. Unless you can keep it in the per-request private data, but there's no more room there for the regular read/write side. >>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >>> index 92c2269..2580d93 100644 >>> --- a/include/uapi/linux/io_uring.h >>> +++ b/include/uapi/linux/io_uring.h >>> @@ -156,8 +156,13 @@ enum { >>> */ >>> struct io_uring_cqe { >>> __u64 user_data; /* sqe->data submission passed back */ >>> - __s32 res; /* result code for this event */ >>> - __u32 flags; >>> + union { >>> + struct { >>> + __s32 res; /* result code for this event */ >>> + __u32 flags; >>> + }; >>> + __s64 res64; /* appending offset for zone append */ >>> + }; >>> }; >> >> Is this a compatible change, both for now but also going forward? You >> could randomly have IORING_CQE_F_BUFFER set, or any other future flags. > > Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not > used/set for write currently, so it looked compatible at this point. Not worried about that, since we won't ever use that for writes. But it is a potential headache down the line for other flags, if they apply to normal writes. > Yes, no room for future flags for this operation. > Do you see any other way to enable this support in io-uring? Honestly I think the only viable option is as we discussed previously, pass in a pointer to a 64-bit type where we can copy the additional completion information to. >> Layout would also be different between big and little endian, so not >> even that easy to set aside a flag for this. But even if that was done, >> we'd still have this weird API where liburing or the app would need to >> distinguish this cqe from all others based on... the user_data? Hence >> liburing can't do it, only the app would be able to. >> >> Just seems like a hack to me. > > Yes, only user_data to distinguish. Do liburing helpers need to look > at cqe->res (and decide something) before returning the cqe to > application? They generally don't, outside of the internal timeout. But it's an issue for the API, as it forces applications to handle the CQEs a certain way. Normally there's flexibility. This makes the append writes behave differently than everything else, which is never a good idea.
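For reference, a rough userspace shape of that indirect-offset idea: the application owns a __u64 and tells the kernel where it lives, so cqe->res and cqe->flags keep their normal meaning. This is a sketch under stated assumptions, not a posted interface; in particular, reusing sqe->addr2 as the carrier for the user pointer is purely a hypothetical placeholder for "some SQE field":

```c
#include <liburing.h>
#include <linux/fs.h>	/* RWF_APPEND */
#include <stdint.h>
#include <sys/uio.h>

/* Hypothetical: append via io_uring, with the written offset copied
 * by the kernel into *where before the CQE is posted. */
static int append_one(struct io_uring *ring, int fd,
		      void *buf, unsigned len, __u64 *where)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	io_uring_prep_writev(sqe, fd, &iov, 1, 0);
	sqe->rw_flags = RWF_APPEND;
	/* hypothetical: point the kernel at the result location */
	sqe->addr2 = (__u64)(uintptr_t)where;

	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* res/flags keep their normal meaning */
	io_uring_cqe_seen(ring, cqe);
	return ret;		/* on success, *where holds the offset */
}
```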
On 24/07/2020 18:49, Kanchan Joshi wrote: > From: SelvaKumar S <selvakuma.s1@samsung.com> > > Repurpose [cqe->res, cqe->flags] into cqe->res64 (signed) to report > 64bit written-offset for zone-append. The appending-write which requires > reporting written-location (conveyed by IOCB_ZONE_APPEND flag) is > ensured not to be a short-write; this avoids the need to report > number-of-bytes-copied. > append-offset is returned by lower-layer to io-uring via ret2 of > ki_complete interface. Make changes to collect it and send to user-space > via cqe->res64. > > Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com> > Signed-off-by: Kanchan Joshi <joshi.k@samsung.com> > Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com> > Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com> > --- > fs/io_uring.c | 49 ++++++++++++++++++++++++++++++++++++------- > include/uapi/linux/io_uring.h | 9 ++++++-- > 2 files changed, 48 insertions(+), 10 deletions(-) > > diff --git a/fs/io_uring.c b/fs/io_uring.c > index 7809ab2..6510cf5 100644 > --- a/fs/io_uring.c > +++ b/fs/io_uring.c ... > @@ -1244,8 +1254,15 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) > req->flags &= ~REQ_F_OVERFLOW; > if (cqe) { > WRITE_ONCE(cqe->user_data, req->user_data); > - WRITE_ONCE(cqe->res, req->result); > - WRITE_ONCE(cqe->flags, req->cflags); > + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { > + if (likely(req->result > 0)) > + WRITE_ONCE(cqe->res64, req->rw.append_offset); > + else > + WRITE_ONCE(cqe->res64, req->result); > + } else { > + WRITE_ONCE(cqe->res, req->result); > + WRITE_ONCE(cqe->flags, req->cflags); > + } > } else { > WRITE_ONCE(ctx->rings->cq_overflow, > atomic_inc_return(&ctx->cached_cq_overflow)); > @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) > cqe = io_get_cqring(ctx); > if (likely(cqe)) { > WRITE_ONCE(cqe->user_data, req->user_data); > - WRITE_ONCE(cqe->res, res); > - WRITE_ONCE(cqe->flags, cflags); > + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { > + if (likely(res > 0)) > + WRITE_ONCE(cqe->res64, req->rw.append_offset); 1. as I mentioned before, it's not nice to ignore @cflags 2. that's not the right place for opcode-specific handling 3. it doesn't work with overflowed reqs, see the final else below For this scheme, I'd pass @append_offset as an argument. That should also remove this extra if from the fast path, which Jens mentioned. > + else > + WRITE_ONCE(cqe->res64, res); > + } else { > + WRITE_ONCE(cqe->res, res); > + WRITE_ONCE(cqe->flags, cflags); > + } > } else if (ctx->cq_overflow_flushed) { > WRITE_ONCE(ctx->rings->cq_overflow, > atomic_inc_return(&ctx->cached_cq_overflow)); > @@ -1943,7 +1967,7 @@ static inline void req_set_fail_links(struct io_kiocb *req) > req->flags |= REQ_F_FAIL_LINK; > } >
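As an illustration of Pavel's suggestion (sketch only; the helper name is made up, and the overflow branch is left open because that is exactly his point 3):

```c
/* Sketch: a separate filler for zone-append completions, taking the
 * offset as an explicit argument so the common fill path stays
 * branch-free.  Not from the posted series. */
static void io_cqring_fill_append_event(struct io_kiocb *req, long res,
					u64 append_offset)
{
	struct io_uring_cqe *cqe;

	cqe = io_get_cqring(req->ctx);
	if (likely(cqe)) {
		WRITE_ONCE(cqe->user_data, req->user_data);
		if (likely(res > 0))
			WRITE_ONCE(cqe->res64, append_offset);
		else
			WRITE_ONCE(cqe->res64, res);
	}
	/* else: the overflow path would still need somewhere to stash
	 * the 64-bit offset alongside req->result/req->cflags, which
	 * is exactly the problem with overflowed requests noted above. */
}
```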
On 27/07/2020 23:34, Jens Axboe wrote: > On 7/27/20 1:16 PM, Kanchan Joshi wrote: >> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote: >>> >>> On 7/24/20 9:49 AM, Kanchan Joshi wrote: >>>> diff --git a/fs/io_uring.c b/fs/io_uring.c >>>> index 7809ab2..6510cf5 100644 >>>> --- a/fs/io_uring.c >>>> +++ b/fs/io_uring.c >>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) >>>> cqe = io_get_cqring(ctx); >>>> if (likely(cqe)) { >>>> WRITE_ONCE(cqe->user_data, req->user_data); >>>> - WRITE_ONCE(cqe->res, res); >>>> - WRITE_ONCE(cqe->flags, cflags); >>>> + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { >>>> + if (likely(res > 0)) >>>> + WRITE_ONCE(cqe->res64, req->rw.append_offset); >>>> + else >>>> + WRITE_ONCE(cqe->res64, res); >>>> + } else { >>>> + WRITE_ONCE(cqe->res, res); >>>> + WRITE_ONCE(cqe->flags, cflags); >>>> + } >>> >>> This would be nice to keep out of the fast path, if possible. >> >> I was thinking of keeping a function-pointer (in io_kiocb) during >> submission. That would have avoided this check......but argument count >> differs, so it did not add up. > > But that'd grow the io_kiocb just for this use case, which is arguably > even worse. Unless you can keep it in the per-request private data, > but there's no more room there for the regular read/write side. > >>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >>>> index 92c2269..2580d93 100644 >>>> --- a/include/uapi/linux/io_uring.h >>>> +++ b/include/uapi/linux/io_uring.h >>>> @@ -156,8 +156,13 @@ enum { >>>> */ >>>> struct io_uring_cqe { >>>> __u64 user_data; /* sqe->data submission passed back */ >>>> - __s32 res; /* result code for this event */ >>>> - __u32 flags; >>>> + union { >>>> + struct { >>>> + __s32 res; /* result code for this event */ >>>> + __u32 flags; >>>> + }; >>>> + __s64 res64; /* appending offset for zone append */ >>>> + }; >>>> }; >>> >>> Is this a compatible change, both for now but also going forward? You >>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags. >> >> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not >> used/set for write currently, so it looked compatible at this point. > > Not worried about that, since we won't ever use that for writes. But it > is a potential headache down the line for other flags, if they apply to > normal writes. > >> Yes, no room for future flags for this operation. >> Do you see any other way to enable this support in io-uring? > > Honestly I think the only viable option is as we discussed previously, > pass in a pointer to a 64-bit type where we can copy the additional > completion information to. TBH, I hate the idea of such overhead/latency at times when SSDs can serve writes in less than 10ms. Any chance you measured how long does it take to drag through task_work? > >>> Layout would also be different between big and little endian, so not >>> even that easy to set aside a flag for this. But even if that was done, >>> we'd still have this weird API where liburing or the app would need to >>> distinguish this cqe from all others based on... the user_data? Hence >>> liburing can't do it, only the app would be able to. >>> >>> Just seems like a hack to me. >> >> Yes, only user_data to distinguish. Do liburing helpers need to look >> at cqe->res (and decide something) before returning the cqe to >> application? > > They generally don't, outside of the internal timeout. 
> But it's an issue > for the API, as it forces applications to handle the CQEs a certain way. > Normally there's flexibility. This makes the append writes behave > differently than everything else, which is never a good idea. >
On 7/30/20 10:08 AM, Pavel Begunkov wrote: > On 27/07/2020 23:34, Jens Axboe wrote: >> On 7/27/20 1:16 PM, Kanchan Joshi wrote: >>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote: >>>> >>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote: >>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c >>>>> index 7809ab2..6510cf5 100644 >>>>> --- a/fs/io_uring.c >>>>> +++ b/fs/io_uring.c >>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) >>>>> cqe = io_get_cqring(ctx); >>>>> if (likely(cqe)) { >>>>> WRITE_ONCE(cqe->user_data, req->user_data); >>>>> - WRITE_ONCE(cqe->res, res); >>>>> - WRITE_ONCE(cqe->flags, cflags); >>>>> + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { >>>>> + if (likely(res > 0)) >>>>> + WRITE_ONCE(cqe->res64, req->rw.append_offset); >>>>> + else >>>>> + WRITE_ONCE(cqe->res64, res); >>>>> + } else { >>>>> + WRITE_ONCE(cqe->res, res); >>>>> + WRITE_ONCE(cqe->flags, cflags); >>>>> + } >>>> >>>> This would be nice to keep out of the fast path, if possible. >>> >>> I was thinking of keeping a function-pointer (in io_kiocb) during >>> submission. That would have avoided this check......but argument count >>> differs, so it did not add up. >> >> But that'd grow the io_kiocb just for this use case, which is arguably >> even worse. Unless you can keep it in the per-request private data, >> but there's no more room there for the regular read/write side. >> >>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >>>>> index 92c2269..2580d93 100644 >>>>> --- a/include/uapi/linux/io_uring.h >>>>> +++ b/include/uapi/linux/io_uring.h >>>>> @@ -156,8 +156,13 @@ enum { >>>>> */ >>>>> struct io_uring_cqe { >>>>> __u64 user_data; /* sqe->data submission passed back */ >>>>> - __s32 res; /* result code for this event */ >>>>> - __u32 flags; >>>>> + union { >>>>> + struct { >>>>> + __s32 res; /* result code for this event */ >>>>> + __u32 flags; >>>>> + }; >>>>> + __s64 res64; /* appending offset for zone append */ >>>>> + }; >>>>> }; >>>> >>>> Is this a compatible change, both for now but also going forward? You >>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags. >>> >>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not >>> used/set for write currently, so it looked compatible at this point. >> >> Not worried about that, since we won't ever use that for writes. But it >> is a potential headache down the line for other flags, if they apply to >> normal writes. >> >>> Yes, no room for future flags for this operation. >>> Do you see any other way to enable this support in io-uring? >> >> Honestly I think the only viable option is as we discussed previously, >> pass in a pointer to a 64-bit type where we can copy the additional >> completion information to. > > TBH, I hate the idea of such overhead/latency at times when SSDs can > serve writes in less than 10ms. Any chance you measured how long does it 10us? :-) > take to drag through task_work? A 64-bit value copy is really not a lot of overhead... But yes, we'd need to push the completion through task_work at that point, as we can't do it from the completion side. That's not a lot of overhead, and most notably, it's overhead that only affects this particular type. That's not a bad starting point, and something that can always be optimized later if need be. But I seriously doubt it'd be anything to worry about.
On 30/07/2020 19:13, Jens Axboe wrote: > On 7/30/20 10:08 AM, Pavel Begunkov wrote: >> On 27/07/2020 23:34, Jens Axboe wrote: >>> On 7/27/20 1:16 PM, Kanchan Joshi wrote: >>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote: >>>>> >>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote: >>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c >>>>>> index 7809ab2..6510cf5 100644 >>>>>> --- a/fs/io_uring.c >>>>>> +++ b/fs/io_uring.c >>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) >>>>>> cqe = io_get_cqring(ctx); >>>>>> if (likely(cqe)) { >>>>>> WRITE_ONCE(cqe->user_data, req->user_data); >>>>>> - WRITE_ONCE(cqe->res, res); >>>>>> - WRITE_ONCE(cqe->flags, cflags); >>>>>> + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { >>>>>> + if (likely(res > 0)) >>>>>> + WRITE_ONCE(cqe->res64, req->rw.append_offset); >>>>>> + else >>>>>> + WRITE_ONCE(cqe->res64, res); >>>>>> + } else { >>>>>> + WRITE_ONCE(cqe->res, res); >>>>>> + WRITE_ONCE(cqe->flags, cflags); >>>>>> + } >>>>> >>>>> This would be nice to keep out of the fast path, if possible. >>>> >>>> I was thinking of keeping a function-pointer (in io_kiocb) during >>>> submission. That would have avoided this check......but argument count >>>> differs, so it did not add up. >>> >>> But that'd grow the io_kiocb just for this use case, which is arguably >>> even worse. Unless you can keep it in the per-request private data, >>> but there's no more room there for the regular read/write side. >>> >>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >>>>>> index 92c2269..2580d93 100644 >>>>>> --- a/include/uapi/linux/io_uring.h >>>>>> +++ b/include/uapi/linux/io_uring.h >>>>>> @@ -156,8 +156,13 @@ enum { >>>>>> */ >>>>>> struct io_uring_cqe { >>>>>> __u64 user_data; /* sqe->data submission passed back */ >>>>>> - __s32 res; /* result code for this event */ >>>>>> - __u32 flags; >>>>>> + union { >>>>>> + struct { >>>>>> + __s32 res; /* result code for this event */ >>>>>> + __u32 flags; >>>>>> + }; >>>>>> + __s64 res64; /* appending offset for zone append */ >>>>>> + }; >>>>>> }; >>>>> >>>>> Is this a compatible change, both for now but also going forward? You >>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags. >>>> >>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not >>>> used/set for write currently, so it looked compatible at this point. >>> >>> Not worried about that, since we won't ever use that for writes. But it >>> is a potential headache down the line for other flags, if they apply to >>> normal writes. >>> >>>> Yes, no room for future flags for this operation. >>>> Do you see any other way to enable this support in io-uring? >>> >>> Honestly I think the only viable option is as we discussed previously, >>> pass in a pointer to a 64-bit type where we can copy the additional >>> completion information to. >> >> TBH, I hate the idea of such overhead/latency at times when SSDs can >> serve writes in less than 10ms. Any chance you measured how long does it > > 10us? :-) Hah, 10us indeed :) > >> take to drag through task_work? > > A 64-bit value copy is really not a lot of overhead... But yes, we'd > need to push the completion through task_work at that point, as we can't > do it from the completion side. That's not a lot of overhead, and most > notably, it's overhead that only affects this particular type. > > That's not a bad starting point, and something that can always be > optimized later if need be. 
> But I seriously doubt it'd be anything to > worry about. I probably need to look myself at how it's really scheduled, but if you don't mind, here is a quick question: if we do work_add(task) when the task is running in userspace, wouldn't the work execution wait until the next syscall/allotted time ends up?
On 7/30/20 10:26 AM, Pavel Begunkov wrote: > On 30/07/2020 19:13, Jens Axboe wrote: >> On 7/30/20 10:08 AM, Pavel Begunkov wrote: >>> On 27/07/2020 23:34, Jens Axboe wrote: >>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote: >>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote: >>>>>> >>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote: >>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c >>>>>>> index 7809ab2..6510cf5 100644 >>>>>>> --- a/fs/io_uring.c >>>>>>> +++ b/fs/io_uring.c >>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) >>>>>>> cqe = io_get_cqring(ctx); >>>>>>> if (likely(cqe)) { >>>>>>> WRITE_ONCE(cqe->user_data, req->user_data); >>>>>>> - WRITE_ONCE(cqe->res, res); >>>>>>> - WRITE_ONCE(cqe->flags, cflags); >>>>>>> + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { >>>>>>> + if (likely(res > 0)) >>>>>>> + WRITE_ONCE(cqe->res64, req->rw.append_offset); >>>>>>> + else >>>>>>> + WRITE_ONCE(cqe->res64, res); >>>>>>> + } else { >>>>>>> + WRITE_ONCE(cqe->res, res); >>>>>>> + WRITE_ONCE(cqe->flags, cflags); >>>>>>> + } >>>>>> >>>>>> This would be nice to keep out of the fast path, if possible. >>>>> >>>>> I was thinking of keeping a function-pointer (in io_kiocb) during >>>>> submission. That would have avoided this check......but argument count >>>>> differs, so it did not add up. >>>> >>>> But that'd grow the io_kiocb just for this use case, which is arguably >>>> even worse. Unless you can keep it in the per-request private data, >>>> but there's no more room there for the regular read/write side. >>>> >>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >>>>>>> index 92c2269..2580d93 100644 >>>>>>> --- a/include/uapi/linux/io_uring.h >>>>>>> +++ b/include/uapi/linux/io_uring.h >>>>>>> @@ -156,8 +156,13 @@ enum { >>>>>>> */ >>>>>>> struct io_uring_cqe { >>>>>>> __u64 user_data; /* sqe->data submission passed back */ >>>>>>> - __s32 res; /* result code for this event */ >>>>>>> - __u32 flags; >>>>>>> + union { >>>>>>> + struct { >>>>>>> + __s32 res; /* result code for this event */ >>>>>>> + __u32 flags; >>>>>>> + }; >>>>>>> + __s64 res64; /* appending offset for zone append */ >>>>>>> + }; >>>>>>> }; >>>>>> >>>>>> Is this a compatible change, both for now but also going forward? You >>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags. >>>>> >>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not >>>>> used/set for write currently, so it looked compatible at this point. >>>> >>>> Not worried about that, since we won't ever use that for writes. But it >>>> is a potential headache down the line for other flags, if they apply to >>>> normal writes. >>>> >>>>> Yes, no room for future flags for this operation. >>>>> Do you see any other way to enable this support in io-uring? >>>> >>>> Honestly I think the only viable option is as we discussed previously, >>>> pass in a pointer to a 64-bit type where we can copy the additional >>>> completion information to. >>> >>> TBH, I hate the idea of such overhead/latency at times when SSDs can >>> serve writes in less than 10ms. Any chance you measured how long does it >> >> 10us? :-) > > Hah, 10us indeed :) > >> >>> take to drag through task_work? >> >> A 64-bit value copy is really not a lot of overhead... But yes, we'd >> need to push the completion through task_work at that point, as we can't >> do it from the completion side. 
>> That's not a lot of overhead, and most >> notably, it's overhead that only affects this particular type. >> >> That's not a bad starting point, and something that can always be >> optimized later if need be. But I seriously doubt it'd be anything to >> worry about. > > I probably need to look myself how it's really scheduled, but if you don't > mind, here is a quick question: if we do work_add(task) when the task is > running in the userspace, wouldn't the work execution wait until the next > syscall/allotted time ends up? It'll get the task to enter the kernel, just like signal delivery. The only tricky part is really if we have a dependency waiting in the kernel, like the recent eventfd fix.
On 30/07/2020 20:16, Jens Axboe wrote: > On 7/30/20 10:26 AM, Pavel Begunkov wrote: >> On 30/07/2020 19:13, Jens Axboe wrote: >>> On 7/30/20 10:08 AM, Pavel Begunkov wrote: >>>> On 27/07/2020 23:34, Jens Axboe wrote: >>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote: >>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote: >>>>>>> >>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote: >>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c >>>>>>>> index 7809ab2..6510cf5 100644 >>>>>>>> --- a/fs/io_uring.c >>>>>>>> +++ b/fs/io_uring.c >>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) >>>>>>>> cqe = io_get_cqring(ctx); >>>>>>>> if (likely(cqe)) { >>>>>>>> WRITE_ONCE(cqe->user_data, req->user_data); >>>>>>>> - WRITE_ONCE(cqe->res, res); >>>>>>>> - WRITE_ONCE(cqe->flags, cflags); >>>>>>>> + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { >>>>>>>> + if (likely(res > 0)) >>>>>>>> + WRITE_ONCE(cqe->res64, req->rw.append_offset); >>>>>>>> + else >>>>>>>> + WRITE_ONCE(cqe->res64, res); >>>>>>>> + } else { >>>>>>>> + WRITE_ONCE(cqe->res, res); >>>>>>>> + WRITE_ONCE(cqe->flags, cflags); >>>>>>>> + } >>>>>>> >>>>>>> This would be nice to keep out of the fast path, if possible. >>>>>> >>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during >>>>>> submission. That would have avoided this check......but argument count >>>>>> differs, so it did not add up. >>>>> >>>>> But that'd grow the io_kiocb just for this use case, which is arguably >>>>> even worse. Unless you can keep it in the per-request private data, >>>>> but there's no more room there for the regular read/write side. >>>>> >>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >>>>>>>> index 92c2269..2580d93 100644 >>>>>>>> --- a/include/uapi/linux/io_uring.h >>>>>>>> +++ b/include/uapi/linux/io_uring.h >>>>>>>> @@ -156,8 +156,13 @@ enum { >>>>>>>> */ >>>>>>>> struct io_uring_cqe { >>>>>>>> __u64 user_data; /* sqe->data submission passed back */ >>>>>>>> - __s32 res; /* result code for this event */ >>>>>>>> - __u32 flags; >>>>>>>> + union { >>>>>>>> + struct { >>>>>>>> + __s32 res; /* result code for this event */ >>>>>>>> + __u32 flags; >>>>>>>> + }; >>>>>>>> + __s64 res64; /* appending offset for zone append */ >>>>>>>> + }; >>>>>>>> }; >>>>>>> >>>>>>> Is this a compatible change, both for now but also going forward? You >>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags. >>>>>> >>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not >>>>>> used/set for write currently, so it looked compatible at this point. >>>>> >>>>> Not worried about that, since we won't ever use that for writes. But it >>>>> is a potential headache down the line for other flags, if they apply to >>>>> normal writes. >>>>> >>>>>> Yes, no room for future flags for this operation. >>>>>> Do you see any other way to enable this support in io-uring? >>>>> >>>>> Honestly I think the only viable option is as we discussed previously, >>>>> pass in a pointer to a 64-bit type where we can copy the additional >>>>> completion information to. >>>> >>>> TBH, I hate the idea of such overhead/latency at times when SSDs can >>>> serve writes in less than 10ms. Any chance you measured how long does it >>> >>> 10us? :-) >> >> Hah, 10us indeed :) >> >>> >>>> take to drag through task_work? >>> >>> A 64-bit value copy is really not a lot of overhead... 
>>> But yes, we'd >>> need to push the completion through task_work at that point, as we can't >>> do it from the completion side. That's not a lot of overhead, and most >>> notably, it's overhead that only affects this particular type. >>> >>> That's not a bad starting point, and something that can always be >>> optimized later if need be. But I seriously doubt it'd be anything to >>> worry about. >> >> I probably need to look myself how it's really scheduled, but if you don't >> mind, here is a quick question: if we do work_add(task) when the task is >> running in the userspace, wouldn't the work execution wait until the next >> syscall/allotted time ends up? > > It'll get the task to enter the kernel, just like signal delivery. The only > tricky part is really if we have a dependency waiting in the kernel, like > the recent eventfd fix. I see, thanks for sorting this out!
On Thu, Jul 30, 2020 at 11:10 PM Pavel Begunkov <asml.silence@gmail.com> wrote: > > On 30/07/2020 20:16, Jens Axboe wrote: > > On 7/30/20 10:26 AM, Pavel Begunkov wrote: > >> On 30/07/2020 19:13, Jens Axboe wrote: > >>> On 7/30/20 10:08 AM, Pavel Begunkov wrote: > >>>> On 27/07/2020 23:34, Jens Axboe wrote: > >>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote: > >>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote: > >>>>>>> > >>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote: > >>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c > >>>>>>>> index 7809ab2..6510cf5 100644 > >>>>>>>> --- a/fs/io_uring.c > >>>>>>>> +++ b/fs/io_uring.c > >>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) > >>>>>>>> cqe = io_get_cqring(ctx); > >>>>>>>> if (likely(cqe)) { > >>>>>>>> WRITE_ONCE(cqe->user_data, req->user_data); > >>>>>>>> - WRITE_ONCE(cqe->res, res); > >>>>>>>> - WRITE_ONCE(cqe->flags, cflags); > >>>>>>>> + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { > >>>>>>>> + if (likely(res > 0)) > >>>>>>>> + WRITE_ONCE(cqe->res64, req->rw.append_offset); > >>>>>>>> + else > >>>>>>>> + WRITE_ONCE(cqe->res64, res); > >>>>>>>> + } else { > >>>>>>>> + WRITE_ONCE(cqe->res, res); > >>>>>>>> + WRITE_ONCE(cqe->flags, cflags); > >>>>>>>> + } > >>>>>>> > >>>>>>> This would be nice to keep out of the fast path, if possible. > >>>>>> > >>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during > >>>>>> submission. That would have avoided this check......but argument count > >>>>>> differs, so it did not add up. > >>>>> > >>>>> But that'd grow the io_kiocb just for this use case, which is arguably > >>>>> even worse. Unless you can keep it in the per-request private data, > >>>>> but there's no more room there for the regular read/write side. > >>>>> > >>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > >>>>>>>> index 92c2269..2580d93 100644 > >>>>>>>> --- a/include/uapi/linux/io_uring.h > >>>>>>>> +++ b/include/uapi/linux/io_uring.h > >>>>>>>> @@ -156,8 +156,13 @@ enum { > >>>>>>>> */ > >>>>>>>> struct io_uring_cqe { > >>>>>>>> __u64 user_data; /* sqe->data submission passed back */ > >>>>>>>> - __s32 res; /* result code for this event */ > >>>>>>>> - __u32 flags; > >>>>>>>> + union { > >>>>>>>> + struct { > >>>>>>>> + __s32 res; /* result code for this event */ > >>>>>>>> + __u32 flags; > >>>>>>>> + }; > >>>>>>>> + __s64 res64; /* appending offset for zone append */ > >>>>>>>> + }; > >>>>>>>> }; > >>>>>>> > >>>>>>> Is this a compatible change, both for now but also going forward? You > >>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags. > >>>>>> > >>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not > >>>>>> used/set for write currently, so it looked compatible at this point. > >>>>> > >>>>> Not worried about that, since we won't ever use that for writes. But it > >>>>> is a potential headache down the line for other flags, if they apply to > >>>>> normal writes. > >>>>> > >>>>>> Yes, no room for future flags for this operation. > >>>>>> Do you see any other way to enable this support in io-uring? > >>>>> > >>>>> Honestly I think the only viable option is as we discussed previously, > >>>>> pass in a pointer to a 64-bit type where we can copy the additional > >>>>> completion information to. > >>>> > >>>> TBH, I hate the idea of such overhead/latency at times when SSDs can > >>>> serve writes in less than 10ms. 
Any chance you measured how long does it > >>> > >>> 10us? :-) > >> > >> Hah, 10us indeed :) > >> > >>> > >>>> take to drag through task_work? > >>> > >>> A 64-bit value copy is really not a lot of overhead... But yes, we'd > >>> need to push the completion through task_work at that point, as we can't > >>> do it from the completion side. That's not a lot of overhead, and most > >>> notably, it's overhead that only affects this particular type. > >>> > >>> That's not a bad starting point, and something that can always be > >>> optimized later if need be. But I seriously doubt it'd be anything to > >>> worry about. > >> > >> I probably need to look myself how it's really scheduled, but if you don't > >> mind, here is a quick question: if we do work_add(task) when the task is > >> running in the userspace, wouldn't the work execution wait until the next > >> syscall/allotted time ends up? > > > > It'll get the task to enter the kernel, just like signal delivery. The only > > tricky part is really if we have a dependency waiting in the kernel, like > > the recent eventfd fix. > > I see, thanks for sorting this out! Few more doubts about this (please mark me wrong if that is the case): - Task-work makes me feel like N completions waiting to be served by single task. Currently completions keep arriving and CQEs would be updated with result, but the user-space (submitter task) would not be poked. - Completion-code will set the task-work. But post that it cannot go immediately to its regular business of picking cqe and updating res/flags, as we cannot afford user-space to see the cqe before the pointer update. So it seems completion-code needs to spawn another work which will allocate/update cqe after waiting for pointer-update from task-work?
On 7/30/20 11:51 AM, Kanchan Joshi wrote: > On Thu, Jul 30, 2020 at 11:10 PM Pavel Begunkov <asml.silence@gmail.com> wrote: >> >> On 30/07/2020 20:16, Jens Axboe wrote: >>> On 7/30/20 10:26 AM, Pavel Begunkov wrote: >>>> On 30/07/2020 19:13, Jens Axboe wrote: >>>>> On 7/30/20 10:08 AM, Pavel Begunkov wrote: >>>>>> On 27/07/2020 23:34, Jens Axboe wrote: >>>>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote: >>>>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote: >>>>>>>>> >>>>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote: >>>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c >>>>>>>>>> index 7809ab2..6510cf5 100644 >>>>>>>>>> --- a/fs/io_uring.c >>>>>>>>>> +++ b/fs/io_uring.c >>>>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) >>>>>>>>>> cqe = io_get_cqring(ctx); >>>>>>>>>> if (likely(cqe)) { >>>>>>>>>> WRITE_ONCE(cqe->user_data, req->user_data); >>>>>>>>>> - WRITE_ONCE(cqe->res, res); >>>>>>>>>> - WRITE_ONCE(cqe->flags, cflags); >>>>>>>>>> + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { >>>>>>>>>> + if (likely(res > 0)) >>>>>>>>>> + WRITE_ONCE(cqe->res64, req->rw.append_offset); >>>>>>>>>> + else >>>>>>>>>> + WRITE_ONCE(cqe->res64, res); >>>>>>>>>> + } else { >>>>>>>>>> + WRITE_ONCE(cqe->res, res); >>>>>>>>>> + WRITE_ONCE(cqe->flags, cflags); >>>>>>>>>> + } >>>>>>>>> >>>>>>>>> This would be nice to keep out of the fast path, if possible. >>>>>>>> >>>>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during >>>>>>>> submission. That would have avoided this check......but argument count >>>>>>>> differs, so it did not add up. >>>>>>> >>>>>>> But that'd grow the io_kiocb just for this use case, which is arguably >>>>>>> even worse. Unless you can keep it in the per-request private data, >>>>>>> but there's no more room there for the regular read/write side. >>>>>>> >>>>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >>>>>>>>>> index 92c2269..2580d93 100644 >>>>>>>>>> --- a/include/uapi/linux/io_uring.h >>>>>>>>>> +++ b/include/uapi/linux/io_uring.h >>>>>>>>>> @@ -156,8 +156,13 @@ enum { >>>>>>>>>> */ >>>>>>>>>> struct io_uring_cqe { >>>>>>>>>> __u64 user_data; /* sqe->data submission passed back */ >>>>>>>>>> - __s32 res; /* result code for this event */ >>>>>>>>>> - __u32 flags; >>>>>>>>>> + union { >>>>>>>>>> + struct { >>>>>>>>>> + __s32 res; /* result code for this event */ >>>>>>>>>> + __u32 flags; >>>>>>>>>> + }; >>>>>>>>>> + __s64 res64; /* appending offset for zone append */ >>>>>>>>>> + }; >>>>>>>>>> }; >>>>>>>>> >>>>>>>>> Is this a compatible change, both for now but also going forward? You >>>>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags. >>>>>>>> >>>>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not >>>>>>>> used/set for write currently, so it looked compatible at this point. >>>>>>> >>>>>>> Not worried about that, since we won't ever use that for writes. But it >>>>>>> is a potential headache down the line for other flags, if they apply to >>>>>>> normal writes. >>>>>>> >>>>>>>> Yes, no room for future flags for this operation. >>>>>>>> Do you see any other way to enable this support in io-uring? >>>>>>> >>>>>>> Honestly I think the only viable option is as we discussed previously, >>>>>>> pass in a pointer to a 64-bit type where we can copy the additional >>>>>>> completion information to. 
>>>>>> >>>>>> TBH, I hate the idea of such overhead/latency at times when SSDs can >>>>>> serve writes in less than 10ms. Any chance you measured how long does it >>>>> >>>>> 10us? :-) >>>> >>>> Hah, 10us indeed :) >>>> >>>>> >>>>>> take to drag through task_work? >>>>> >>>>> A 64-bit value copy is really not a lot of overhead... But yes, we'd >>>>> need to push the completion through task_work at that point, as we can't >>>>> do it from the completion side. That's not a lot of overhead, and most >>>>> notably, it's overhead that only affects this particular type. >>>>> >>>>> That's not a bad starting point, and something that can always be >>>>> optimized later if need be. But I seriously doubt it'd be anything to >>>>> worry about. >>>> >>>> I probably need to look myself how it's really scheduled, but if you don't >>>> mind, here is a quick question: if we do work_add(task) when the task is >>>> running in the userspace, wouldn't the work execution wait until the next >>>> syscall/allotted time ends up? >>> >>> It'll get the task to enter the kernel, just like signal delivery. The only >>> tricky part is really if we have a dependency waiting in the kernel, like >>> the recent eventfd fix. >> >> I see, thanks for sorting this out! > > Few more doubts about this (please mark me wrong if that is the case): > > - Task-work makes me feel like N completions waiting to be served by > single task. > Currently completions keep arriving and CQEs would be updated with > result, but the user-space (submitter task) would not be poked. > > - Completion-code will set the task-work. But post that it cannot go > immediately to its regular business of picking cqe and updating > res/flags, as we cannot afford user-space to see the cqe before the > pointer update. So it seems completion-code needs to spawn another > work which will allocate/update cqe after waiting for pointer-update > from task-work? The task work would post the completion CQE for the request after writing the offset.
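A sketch of the flow Jens describes, hedged heavily: init_task_work()/task_work_add() are real kernel APIs (whose notify argument changed form several times around this period), while the append_uaddr/append_offset fields, both function names, the fallback, and the elided locking are illustrative only:

```c
/* Runs later in the submitter's task context, where user memory is
 * addressable: write the offset first, then post the normal CQE
 * (completion_lock handling elided in this sketch). */
static void io_append_offset_work(struct callback_head *cb)
{
	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
	long res = req->result;

	if (res > 0 &&
	    copy_to_user(req->append_uaddr, &req->append_offset,
			 sizeof(u64)))
		res = -EFAULT;
	__io_cqring_fill_event(req, res, 0);	/* res/flags stay intact */
	io_cqring_ev_posted(req->ctx);
}

/* Completion path (possibly IRQ context): only defer to task_work. */
static void io_complete_append_write(struct io_kiocb *req)
{
	init_task_work(&req->task_work, io_append_offset_work);
	/* signal-style notification forces the task into the kernel
	 * even if it is currently executing in userspace */
	if (task_work_add(req->task, &req->task_work, true))
		io_req_fail(req, -ECANCELED);	/* task exiting; made-up fallback */
}
```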
On Thu, Jul 30, 2020 at 11:24 PM Jens Axboe <axboe@kernel.dk> wrote: > > On 7/30/20 11:51 AM, Kanchan Joshi wrote: > > On Thu, Jul 30, 2020 at 11:10 PM Pavel Begunkov <asml.silence@gmail.com> wrote: > >> > >> On 30/07/2020 20:16, Jens Axboe wrote: > >>> On 7/30/20 10:26 AM, Pavel Begunkov wrote: > >>>> On 30/07/2020 19:13, Jens Axboe wrote: > >>>>> On 7/30/20 10:08 AM, Pavel Begunkov wrote: > >>>>>> On 27/07/2020 23:34, Jens Axboe wrote: > >>>>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote: > >>>>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote: > >>>>>>>>> > >>>>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote: > >>>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c > >>>>>>>>>> index 7809ab2..6510cf5 100644 > >>>>>>>>>> --- a/fs/io_uring.c > >>>>>>>>>> +++ b/fs/io_uring.c > >>>>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) > >>>>>>>>>> cqe = io_get_cqring(ctx); > >>>>>>>>>> if (likely(cqe)) { > >>>>>>>>>> WRITE_ONCE(cqe->user_data, req->user_data); > >>>>>>>>>> - WRITE_ONCE(cqe->res, res); > >>>>>>>>>> - WRITE_ONCE(cqe->flags, cflags); > >>>>>>>>>> + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { > >>>>>>>>>> + if (likely(res > 0)) > >>>>>>>>>> + WRITE_ONCE(cqe->res64, req->rw.append_offset); > >>>>>>>>>> + else > >>>>>>>>>> + WRITE_ONCE(cqe->res64, res); > >>>>>>>>>> + } else { > >>>>>>>>>> + WRITE_ONCE(cqe->res, res); > >>>>>>>>>> + WRITE_ONCE(cqe->flags, cflags); > >>>>>>>>>> + } > >>>>>>>>> > >>>>>>>>> This would be nice to keep out of the fast path, if possible. > >>>>>>>> > >>>>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during > >>>>>>>> submission. That would have avoided this check......but argument count > >>>>>>>> differs, so it did not add up. > >>>>>>> > >>>>>>> But that'd grow the io_kiocb just for this use case, which is arguably > >>>>>>> even worse. Unless you can keep it in the per-request private data, > >>>>>>> but there's no more room there for the regular read/write side. > >>>>>>> > >>>>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > >>>>>>>>>> index 92c2269..2580d93 100644 > >>>>>>>>>> --- a/include/uapi/linux/io_uring.h > >>>>>>>>>> +++ b/include/uapi/linux/io_uring.h > >>>>>>>>>> @@ -156,8 +156,13 @@ enum { > >>>>>>>>>> */ > >>>>>>>>>> struct io_uring_cqe { > >>>>>>>>>> __u64 user_data; /* sqe->data submission passed back */ > >>>>>>>>>> - __s32 res; /* result code for this event */ > >>>>>>>>>> - __u32 flags; > >>>>>>>>>> + union { > >>>>>>>>>> + struct { > >>>>>>>>>> + __s32 res; /* result code for this event */ > >>>>>>>>>> + __u32 flags; > >>>>>>>>>> + }; > >>>>>>>>>> + __s64 res64; /* appending offset for zone append */ > >>>>>>>>>> + }; > >>>>>>>>>> }; > >>>>>>>>> > >>>>>>>>> Is this a compatible change, both for now but also going forward? You > >>>>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags. > >>>>>>>> > >>>>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not > >>>>>>>> used/set for write currently, so it looked compatible at this point. > >>>>>>> > >>>>>>> Not worried about that, since we won't ever use that for writes. But it > >>>>>>> is a potential headache down the line for other flags, if they apply to > >>>>>>> normal writes. > >>>>>>> > >>>>>>>> Yes, no room for future flags for this operation. > >>>>>>>> Do you see any other way to enable this support in io-uring? 
> >>>>>>> > >>>>>>> Honestly I think the only viable option is as we discussed previously, > >>>>>>> pass in a pointer to a 64-bit type where we can copy the additional > >>>>>>> completion information to. > >>>>>> > >>>>>> TBH, I hate the idea of such overhead/latency at times when SSDs can > >>>>>> serve writes in less than 10ms. Any chance you measured how long does it > >>>>> > >>>>> 10us? :-) > >>>> > >>>> Hah, 10us indeed :) > >>>> > >>>>> > >>>>>> take to drag through task_work? > >>>>> > >>>>> A 64-bit value copy is really not a lot of overhead... But yes, we'd > >>>>> need to push the completion through task_work at that point, as we can't > >>>>> do it from the completion side. That's not a lot of overhead, and most > >>>>> notably, it's overhead that only affects this particular type. > >>>>> > >>>>> That's not a bad starting point, and something that can always be > >>>>> optimized later if need be. But I seriously doubt it'd be anything to > >>>>> worry about. > >>>> > >>>> I probably need to look myself how it's really scheduled, but if you don't > >>>> mind, here is a quick question: if we do work_add(task) when the task is > >>>> running in the userspace, wouldn't the work execution wait until the next > >>>> syscall/allotted time ends up? > >>> > >>> It'll get the task to enter the kernel, just like signal delivery. The only > >>> tricky part is really if we have a dependency waiting in the kernel, like > >>> the recent eventfd fix. > >> > >> I see, thanks for sorting this out! > > > > Few more doubts about this (please mark me wrong if that is the case): > > > > - Task-work makes me feel like N completions waiting to be served by > > single task. > > Currently completions keep arriving and CQEs would be updated with > > result, but the user-space (submitter task) would not be poked. > > > > - Completion-code will set the task-work. But post that it cannot go > > immediately to its regular business of picking cqe and updating > > res/flags, as we cannot afford user-space to see the cqe before the > > pointer update. So it seems completion-code needs to spawn another > > work which will allocate/update cqe after waiting for pointer-update > > from task-work? > > The task work would post the completion CQE for the request after > writing the offset. Got it, thank you for making it simple. Overall, if I try to put the tradeoffs of moving to indirect-offset (compared to the current scheme): Upside: - cqe res/flags would be intact, avoids future headaches as you mentioned - short-write cases do not have to be failed in lower layers (as cqe->res is there to report bytes-copied) Downside: - We may not be able to use RWF_APPEND, and need to expose a new type/flag (RWF_INDIRECT_OFFSET etc.) to user-space. Not sure if this sounds outrageous, but is it OK to have a uring-only flag which can be combined with RWF_APPEND? - Expensive compared to sending results in the cqe itself. But I agree that this may not be major, and only for one type of write.
On 2020/07/31 3:26, Kanchan Joshi wrote: > On Thu, Jul 30, 2020 at 11:24 PM Jens Axboe <axboe@kernel.dk> wrote: >> >> On 7/30/20 11:51 AM, Kanchan Joshi wrote: >>> On Thu, Jul 30, 2020 at 11:10 PM Pavel Begunkov <asml.silence@gmail.com> wrote: >>>> >>>> On 30/07/2020 20:16, Jens Axboe wrote: >>>>> On 7/30/20 10:26 AM, Pavel Begunkov wrote: >>>>>> On 30/07/2020 19:13, Jens Axboe wrote: >>>>>>> On 7/30/20 10:08 AM, Pavel Begunkov wrote: >>>>>>>> On 27/07/2020 23:34, Jens Axboe wrote: >>>>>>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote: >>>>>>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote: >>>>>>>>>>> >>>>>>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote: >>>>>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c >>>>>>>>>>>> index 7809ab2..6510cf5 100644 >>>>>>>>>>>> --- a/fs/io_uring.c >>>>>>>>>>>> +++ b/fs/io_uring.c >>>>>>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) >>>>>>>>>>>> cqe = io_get_cqring(ctx); >>>>>>>>>>>> if (likely(cqe)) { >>>>>>>>>>>> WRITE_ONCE(cqe->user_data, req->user_data); >>>>>>>>>>>> - WRITE_ONCE(cqe->res, res); >>>>>>>>>>>> - WRITE_ONCE(cqe->flags, cflags); >>>>>>>>>>>> + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { >>>>>>>>>>>> + if (likely(res > 0)) >>>>>>>>>>>> + WRITE_ONCE(cqe->res64, req->rw.append_offset); >>>>>>>>>>>> + else >>>>>>>>>>>> + WRITE_ONCE(cqe->res64, res); >>>>>>>>>>>> + } else { >>>>>>>>>>>> + WRITE_ONCE(cqe->res, res); >>>>>>>>>>>> + WRITE_ONCE(cqe->flags, cflags); >>>>>>>>>>>> + } >>>>>>>>>>> >>>>>>>>>>> This would be nice to keep out of the fast path, if possible. >>>>>>>>>> >>>>>>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during >>>>>>>>>> submission. That would have avoided this check......but argument count >>>>>>>>>> differs, so it did not add up. >>>>>>>>> >>>>>>>>> But that'd grow the io_kiocb just for this use case, which is arguably >>>>>>>>> even worse. Unless you can keep it in the per-request private data, >>>>>>>>> but there's no more room there for the regular read/write side. >>>>>>>>> >>>>>>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >>>>>>>>>>>> index 92c2269..2580d93 100644 >>>>>>>>>>>> --- a/include/uapi/linux/io_uring.h >>>>>>>>>>>> +++ b/include/uapi/linux/io_uring.h >>>>>>>>>>>> @@ -156,8 +156,13 @@ enum { >>>>>>>>>>>> */ >>>>>>>>>>>> struct io_uring_cqe { >>>>>>>>>>>> __u64 user_data; /* sqe->data submission passed back */ >>>>>>>>>>>> - __s32 res; /* result code for this event */ >>>>>>>>>>>> - __u32 flags; >>>>>>>>>>>> + union { >>>>>>>>>>>> + struct { >>>>>>>>>>>> + __s32 res; /* result code for this event */ >>>>>>>>>>>> + __u32 flags; >>>>>>>>>>>> + }; >>>>>>>>>>>> + __s64 res64; /* appending offset for zone append */ >>>>>>>>>>>> + }; >>>>>>>>>>>> }; >>>>>>>>>>> >>>>>>>>>>> Is this a compatible change, both for now but also going forward? You >>>>>>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags. >>>>>>>>>> >>>>>>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not >>>>>>>>>> used/set for write currently, so it looked compatible at this point. >>>>>>>>> >>>>>>>>> Not worried about that, since we won't ever use that for writes. But it >>>>>>>>> is a potential headache down the line for other flags, if they apply to >>>>>>>>> normal writes. >>>>>>>>> >>>>>>>>>> Yes, no room for future flags for this operation. >>>>>>>>>> Do you see any other way to enable this support in io-uring? 
>>>>>>>>> >>>>>>>>> Honestly I think the only viable option is as we discussed previously, >>>>>>>>> pass in a pointer to a 64-bit type where we can copy the additional >>>>>>>>> completion information to. >>>>>>>> >>>>>>>> TBH, I hate the idea of such overhead/latency at times when SSDs can >>>>>>>> serve writes in less than 10ms. Any chance you measured how long does it >>>>>>> >>>>>>> 10us? :-) >>>>>> >>>>>> Hah, 10us indeed :) >>>>>> >>>>>>> >>>>>>>> take to drag through task_work? >>>>>>> >>>>>>> A 64-bit value copy is really not a lot of overhead... But yes, we'd >>>>>>> need to push the completion through task_work at that point, as we can't >>>>>>> do it from the completion side. That's not a lot of overhead, and most >>>>>>> notably, it's overhead that only affects this particular type. >>>>>>> >>>>>>> That's not a bad starting point, and something that can always be >>>>>>> optimized later if need be. But I seriously doubt it'd be anything to >>>>>>> worry about. >>>>>> >>>>>> I probably need to look myself how it's really scheduled, but if you don't >>>>>> mind, here is a quick question: if we do work_add(task) when the task is >>>>>> running in the userspace, wouldn't the work execution wait until the next >>>>>> syscall/allotted time ends up? >>>>> >>>>> It'll get the task to enter the kernel, just like signal delivery. The only >>>>> tricky part is really if we have a dependency waiting in the kernel, like >>>>> the recent eventfd fix. >>>> >>>> I see, thanks for sorting this out! >>> >>> Few more doubts about this (please mark me wrong if that is the case): >>> >>> - Task-work makes me feel like N completions waiting to be served by >>> single task. >>> Currently completions keep arriving and CQEs would be updated with >>> result, but the user-space (submitter task) would not be poked. >>> >>> - Completion-code will set the task-work. But post that it cannot go >>> immediately to its regular business of picking cqe and updating >>> res/flags, as we cannot afford user-space to see the cqe before the >>> pointer update. So it seems completion-code needs to spawn another >>> work which will allocate/update cqe after waiting for pointer-update >>> from task-work? >> >> The task work would post the completion CQE for the request after >> writing the offset. > > Got it, thank you for making it simple. > Overall if I try to put the tradeoffs of moving to indirect-offset > (compared to current scheme)– > > Upside: > - cqe res/flags would be intact, avoids future-headaches as you mentioned > - short-write cases do not have to be failed in lower-layers (as > cqe->res is there to report bytes-copied) I personally think it is a super bad idea to allow short asynchronous append writes. The interface should allow the async zone append write to proceed if and only if it can be stuffed entirely into a single BIO, which necessarily will be a single request on the device side. Otherwise, the application would have no guarantees as to where a split may happen, and since this is zone append, the next async append will not leave any hole to complete a previous short write. This will wreck the structure of the application data. For the sync case, this is fine. The application can just issue a new append write with the remaining unwritten data from the previous append write. But in the async case, if one write == one data record (e.g.
a key-value tuple for an SSTable in an LSM tree), then allowing a short write will destroy the record: the partial write will be garbage data that will need garbage collection... > > Downside: > - We may not be able to use RWF_APPEND, and need exposing a new > type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this > sounds outrageous, but is it OK to have uring-only flag which can be > combined with RWF_APPEND? Why? Where is the problem? O_APPEND/RWF_APPEND is currently meaningless for raw block device accesses. We could certainly define a meaning for these in the context of zoned block devices. I already commented on the need for first defining an interface (flags etc.) and its semantics (e.g. do we allow short zone appends or not? What happens for regular files? etc.). Did you read my comment? We really need to first agree on something to clarify what needs to be done. > - Expensive compared to sending results in cqe itself. But I agree > that this may not be major, and only for one type of write.
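To illustrate the sync/async asymmetry Damien describes: a synchronous caller can recover from a short write by simply re-appending the remainder, since nothing else lands in between. A minimal sketch, assuming a libc that exposes pwritev2() and RWF_APPEND; it stands in for "synchronous append" generally, not for the zone-append interface still under discussion:

```c
#define _GNU_SOURCE
#include <errno.h>
#include <sys/uio.h>

/* Keep appending until all of buf is written.  Recoverable only
 * because each retry is itself an append: the remainder simply lands
 * after the short write.  An async submitter has no such guarantee,
 * since other appends may slip in between. */
static ssize_t append_all(int fd, const void *buf, size_t len)
{
	size_t done = 0;

	while (done < len) {
		struct iovec iov = {
			.iov_base = (char *)buf + done,
			.iov_len  = len - done,
		};
		ssize_t ret = pwritev2(fd, &iov, 1, -1, RWF_APPEND);

		if (ret < 0) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		done += ret;	/* short write: re-append the rest */
	}
	return (ssize_t)done;
}
```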
On Fri, Jul 31, 2020 at 06:42:10AM +0000, Damien Le Moal wrote: > > - We may not be able to use RWF_APPEND, and need exposing a new > > type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this > > sounds outrageous, but is it OK to have uring-only flag which can be > > combined with RWF_APPEND? > > Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for > raw block device accesses. We could certainly define a meaning for these in the > context of zoned block devices. We can't just add a meaning for O_APPEND on block devices now, as it was previously silently ignored. I also really don't think any of these semantics even fit the block device to start with. If you want to work on raw zones, use zonefs; that's what it exists for.
On 2020/07/31 15:45, hch@infradead.org wrote: > On Fri, Jul 31, 2020 at 06:42:10AM +0000, Damien Le Moal wrote: >>> - We may not be able to use RWF_APPEND, and need exposing a new >>> type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this >>> sounds outrageous, but is it OK to have uring-only flag which can be >>> combined with RWF_APPEND? >> >> Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for >> raw block device accesses. We could certainly define a meaning for these in the >> context of zoned block devices. > > We can't just add a meaning for O_APPEND on block devices now, > as it was previously silently ignored. I also really don't think any > of these semantics even fit the block device to start with. If you > want to work on raw zones use zonefs, that's what is exists for. Which is fine with me. Just trying to say that I think this is exactly the discussion we need to start with. What interface do we implement... If zone append is allowed only through zonefs as the raw block device equivalent, the O_APPEND/RWF_APPEND semantics are well defined, and the "return written offset" implementation in VFS would be common for all file systems, including regular ones. Besides that, there is, I think, the question of short writes... Not sure if short writes can currently happen with async RWF_APPEND writes to regular files. I think not, but that may depend on the FS.
On Fri, Jul 31, 2020 at 12:12 PM Damien Le Moal <Damien.LeMoal@wdc.com> wrote: > > On 2020/07/31 3:26, Kanchan Joshi wrote: > > On Thu, Jul 30, 2020 at 11:24 PM Jens Axboe <axboe@kernel.dk> wrote: > >> > >> On 7/30/20 11:51 AM, Kanchan Joshi wrote: > >>> On Thu, Jul 30, 2020 at 11:10 PM Pavel Begunkov <asml.silence@gmail.com> wrote: > >>>> > >>>> On 30/07/2020 20:16, Jens Axboe wrote: > >>>>> On 7/30/20 10:26 AM, Pavel Begunkov wrote: > >>>>>> On 30/07/2020 19:13, Jens Axboe wrote: > >>>>>>> On 7/30/20 10:08 AM, Pavel Begunkov wrote: > >>>>>>>> On 27/07/2020 23:34, Jens Axboe wrote: > >>>>>>>>> On 7/27/20 1:16 PM, Kanchan Joshi wrote: > >>>>>>>>>> On Fri, Jul 24, 2020 at 10:00 PM Jens Axboe <axboe@kernel.dk> wrote: > >>>>>>>>>>> > >>>>>>>>>>> On 7/24/20 9:49 AM, Kanchan Joshi wrote: > >>>>>>>>>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c > >>>>>>>>>>>> index 7809ab2..6510cf5 100644 > >>>>>>>>>>>> --- a/fs/io_uring.c > >>>>>>>>>>>> +++ b/fs/io_uring.c > >>>>>>>>>>>> @@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) > >>>>>>>>>>>> cqe = io_get_cqring(ctx); > >>>>>>>>>>>> if (likely(cqe)) { > >>>>>>>>>>>> WRITE_ONCE(cqe->user_data, req->user_data); > >>>>>>>>>>>> - WRITE_ONCE(cqe->res, res); > >>>>>>>>>>>> - WRITE_ONCE(cqe->flags, cflags); > >>>>>>>>>>>> + if (unlikely(req->flags & REQ_F_ZONE_APPEND)) { > >>>>>>>>>>>> + if (likely(res > 0)) > >>>>>>>>>>>> + WRITE_ONCE(cqe->res64, req->rw.append_offset); > >>>>>>>>>>>> + else > >>>>>>>>>>>> + WRITE_ONCE(cqe->res64, res); > >>>>>>>>>>>> + } else { > >>>>>>>>>>>> + WRITE_ONCE(cqe->res, res); > >>>>>>>>>>>> + WRITE_ONCE(cqe->flags, cflags); > >>>>>>>>>>>> + } > >>>>>>>>>>> > >>>>>>>>>>> This would be nice to keep out of the fast path, if possible. > >>>>>>>>>> > >>>>>>>>>> I was thinking of keeping a function-pointer (in io_kiocb) during > >>>>>>>>>> submission. That would have avoided this check......but argument count > >>>>>>>>>> differs, so it did not add up. > >>>>>>>>> > >>>>>>>>> But that'd grow the io_kiocb just for this use case, which is arguably > >>>>>>>>> even worse. Unless you can keep it in the per-request private data, > >>>>>>>>> but there's no more room there for the regular read/write side. > >>>>>>>>> > >>>>>>>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > >>>>>>>>>>>> index 92c2269..2580d93 100644 > >>>>>>>>>>>> --- a/include/uapi/linux/io_uring.h > >>>>>>>>>>>> +++ b/include/uapi/linux/io_uring.h > >>>>>>>>>>>> @@ -156,8 +156,13 @@ enum { > >>>>>>>>>>>> */ > >>>>>>>>>>>> struct io_uring_cqe { > >>>>>>>>>>>> __u64 user_data; /* sqe->data submission passed back */ > >>>>>>>>>>>> - __s32 res; /* result code for this event */ > >>>>>>>>>>>> - __u32 flags; > >>>>>>>>>>>> + union { > >>>>>>>>>>>> + struct { > >>>>>>>>>>>> + __s32 res; /* result code for this event */ > >>>>>>>>>>>> + __u32 flags; > >>>>>>>>>>>> + }; > >>>>>>>>>>>> + __s64 res64; /* appending offset for zone append */ > >>>>>>>>>>>> + }; > >>>>>>>>>>>> }; > >>>>>>>>>>> > >>>>>>>>>>> Is this a compatible change, both for now but also going forward? You > >>>>>>>>>>> could randomly have IORING_CQE_F_BUFFER set, or any other future flags. > >>>>>>>>>> > >>>>>>>>>> Sorry, I didn't quite understand the concern. CQE_F_BUFFER is not > >>>>>>>>>> used/set for write currently, so it looked compatible at this point. > >>>>>>>>> > >>>>>>>>> Not worried about that, since we won't ever use that for writes. 
But it > >>>>>>>>> is a potential headache down the line for other flags, if they apply to > >>>>>>>>> normal writes. > >>>>>>>>> > >>>>>>>>>> Yes, no room for future flags for this operation. > >>>>>>>>>> Do you see any other way to enable this support in io-uring? > >>>>>>>>> > >>>>>>>>> Honestly I think the only viable option is as we discussed previously, > >>>>>>>>> pass in a pointer to a 64-bit type where we can copy the additional > >>>>>>>>> completion information to. > >>>>>>>> > >>>>>>>> TBH, I hate the idea of such overhead/latency at times when SSDs can > >>>>>>>> serve writes in less than 10ms. Any chance you measured how long does it > >>>>>>> > >>>>>>> 10us? :-) > >>>>>> > >>>>>> Hah, 10us indeed :) > >>>>>> > >>>>>>> > >>>>>>>> take to drag through task_work? > >>>>>>> > >>>>>>> A 64-bit value copy is really not a lot of overhead... But yes, we'd > >>>>>>> need to push the completion through task_work at that point, as we can't > >>>>>>> do it from the completion side. That's not a lot of overhead, and most > >>>>>>> notably, it's overhead that only affects this particular type. > >>>>>>> > >>>>>>> That's not a bad starting point, and something that can always be > >>>>>>> optimized later if need be. But I seriously doubt it'd be anything to > >>>>>>> worry about. > >>>>>> > >>>>>> I probably need to look myself how it's really scheduled, but if you don't > >>>>>> mind, here is a quick question: if we do work_add(task) when the task is > >>>>>> running in the userspace, wouldn't the work execution wait until the next > >>>>>> syscall/allotted time ends up? > >>>>> > >>>>> It'll get the task to enter the kernel, just like signal delivery. The only > >>>>> tricky part is really if we have a dependency waiting in the kernel, like > >>>>> the recent eventfd fix. > >>>> > >>>> I see, thanks for sorting this out! > >>> > >>> Few more doubts about this (please mark me wrong if that is the case): > >>> > >>> - Task-work makes me feel like N completions waiting to be served by > >>> single task. > >>> Currently completions keep arriving and CQEs would be updated with > >>> result, but the user-space (submitter task) would not be poked. > >>> > >>> - Completion-code will set the task-work. But post that it cannot go > >>> immediately to its regular business of picking cqe and updating > >>> res/flags, as we cannot afford user-space to see the cqe before the > >>> pointer update. So it seems completion-code needs to spawn another > >>> work which will allocate/update cqe after waiting for pointer-update > >>> from task-work? > >> > >> The task work would post the completion CQE for the request after > >> writing the offset. > > > > Got it, thank you for making it simple. > > Overall if I try to put the tradeoffs of moving to indirect-offset > > (compared to current scheme)– > > > > Upside: > > - cqe res/flags would be intact, avoids future-headaches as you mentioned > > - short-write cases do not have to be failed in lower-layers (as > > cqe->res is there to report bytes-copied) > > I personally think it is a super bad idea to allow short asynchronous append > writes. The interface should allow the async zone append write to proceed only > and only if it can be stuffed entirely into a single BIO which necessarilly will > be a single request on the device side. Otherwise, the application would have no > guarantees as to where a split may happen, and since this is zone append, the > next async append will not leave any hole to complete a previous short write. 
> This will wreck the structure of the application data.
>
> For the sync case, this is fine. The application can just issue a new append
> write with the remaining unwritten data from the previous append write. But in
> the async case, if one write == one data record (e.g. a key-value tuple for an
> SSTable in an LSM tree), then allowing a short write will destroy the record:
> the partial write will be garbage data that will need garbage collection...

There are cases where a short write is fine, aren't there? For example, I can
serve only an 8K write (either because of space, or because of those file
limits), but the application sent 12K: the iov_iter gets truncated to 8K and
the write is successful. At least that's how O_APPEND and RWF_APPEND behave
currently.
But in the current scheme there is no way to report the number of bytes copied
in io_uring, so I had to fail such a short write in the lower layer (which
does not know whether it is talking to io_uring or aio). Failing a short write
is perhaps fine for zone append, but is it fine for generic file append?

> > Downside:
> > - We may not be able to use RWF_APPEND, and need exposing a new
> > type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this
> > sounds outrageous, but is it OK to have uring-only flag which can be
> > combined with RWF_APPEND?
>
> Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for
> raw block device accesses. We could certainly define a meaning for these in the
> context of zoned block devices.

But an application using O_APPEND/RWF_APPEND today does not pass a pointer to
be updated by the kernel, while in the kernel we would expect one, and may
start writing through something which is not a pointer.

> I already commented on the need for first defining an interface (flags etc) and
> its semantic (e.g. do we allow short zone append or not ? What happens for
> regular files ? etc). Did you read my comment ? We really need to first agree on
> something to clarify what needs to be done.

I read it and was planning to respond, sorry. But it seemed important to get
clarity on the uring interface first, as this seems to decide how this whole
thing looks (to the application and to the lower layers as well).

> > - Expensive compared to sending results in cqe itself. But I agree
> > that this may not be major, and only for one type of write.
> >
>
> --
> Damien Le Moal
> Western Digital Research
On Fri, Jul 31, 2020 at 12:29 PM Damien Le Moal <Damien.LeMoal@wdc.com> wrote:
>
> On 2020/07/31 15:45, hch@infradead.org wrote:
> > On Fri, Jul 31, 2020 at 06:42:10AM +0000, Damien Le Moal wrote:
> >>> - We may not be able to use RWF_APPEND, and need exposing a new
> >>> type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this
> >>> sounds outrageous, but is it OK to have uring-only flag which can be
> >>> combined with RWF_APPEND?
> >>
> >> Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for
> >> raw block device accesses. We could certainly define a meaning for these in the
> >> context of zoned block devices.
> >
> > We can't just add a meaning for O_APPEND on block devices now,
> > as it was previously silently ignored. I also really don't think any
> > of these semantics even fit the block device to start with. If you
> > want to work on raw zones use zonefs, that's what it exists for.
>
> Which is fine with me. Just trying to say that I think this is exactly the
> discussion we need to start with. What interface do we implement...
>
> If we allow zone append only through zonefs as the raw block device
> equivalent, then the O_APPEND/RWF_APPEND semantics are defined and the
> "return written offset" implementation in the VFS would be common to all file
> systems, including regular ones. Besides that, there is I think the question
> of short writes... Not sure if short writes can currently happen with async
> RWF_APPEND writes to regular files. I think not, but that may depend on the FS.

generic_write_check_limits (called by generic_write_checks, used by most FS)
may make it short, and AFAIK it does not depend on async/sync. This was one of
the reasons why we chose to isolate the operation with a different IOCB flag
and not by IOCB_APPEND alone.
For block devices these checks are not done, but there is another place where
writes spanning beyond EOF are received and iov_iter_truncate() adjusts them
before sending them down. And we return failure for that case in V4 -
"Ref: [PATCH v4 3/6] uio: return status with iov truncation"
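For readers following along, the check Kanchan refers to boils down to logic
of roughly this shape. This is a simplified sketch in the spirit of
generic_write_check_limits(), not the exact kernel code:

/*
 * Simplified sketch of how a write can be shortened by limit checks:
 * the byte count is clamped against RLIMIT_FSIZE and the filesystem's
 * s_maxbytes, so the request proceeds as a short write instead of
 * failing outright. Approximation only; the real helper handles more
 * cases (signals, non-LFS files, etc.).
 */
static int write_check_limits(struct file *file, loff_t pos, loff_t *count)
{
	struct super_block *sb = file_inode(file)->i_sb;
	unsigned long limit = rlimit(RLIMIT_FSIZE);

	if (limit != RLIM_INFINITY) {
		if (pos >= limit)
			return -EFBIG;			/* nothing writable */
		*count = min_t(loff_t, *count, limit - pos);	/* shorten */
	}

	if (pos >= sb->s_maxbytes)
		return -EFBIG;
	*count = min_t(loff_t, *count, sb->s_maxbytes - pos);	/* shorten */
	return 0;
}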
On 2020/07/31 16:59, Kanchan Joshi wrote: > On Fri, Jul 31, 2020 at 12:29 PM Damien Le Moal <Damien.LeMoal@wdc.com> wrote: >> >> On 2020/07/31 15:45, hch@infradead.org wrote: >>> On Fri, Jul 31, 2020 at 06:42:10AM +0000, Damien Le Moal wrote: >>>>> - We may not be able to use RWF_APPEND, and need exposing a new >>>>> type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this >>>>> sounds outrageous, but is it OK to have uring-only flag which can be >>>>> combined with RWF_APPEND? >>>> >>>> Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for >>>> raw block device accesses. We could certainly define a meaning for these in the >>>> context of zoned block devices. >>> >>> We can't just add a meaning for O_APPEND on block devices now, >>> as it was previously silently ignored. I also really don't think any >>> of these semantics even fit the block device to start with. If you >>> want to work on raw zones use zonefs, that's what is exists for. >> >> Which is fine with me. Just trying to say that I think this is exactly the >> discussion we need to start with. What interface do we implement... >> >> Allowing zone append only through zonefs as the raw block device equivalent, all >> the O_APPEND/RWF_APPEND semantic is defined and the "return written offset" >> implementation in VFS would be common for all file systems, including regular >> ones. Beside that, there is I think the question of short writes... Not sure if >> short writes can currently happen with async RWF_APPEND writes to regular files. >> I think not but that may depend on the FS. > > generic_write_check_limits (called by generic_write_checks, used by > most FS) may make it short, and AFAIK it does not depend on > async/sync. Johannes has a patch (not posted yet) fixing all this for zonefs, differentiating sync and async cases, allow short writes or not, etc. This was done by not using generic_write_check_limits() and instead writing a zonefs_check_write() function that is zone append friendly. We can post that as a base for the discussion on semantic if you want... > This was one of the reason why we chose to isolate the operation by a > different IOCB flag and not by IOCB_APPEND alone. For zonefs, the plan is: * For the sync write case, zone append is always used. * For the async write case, if we see IOCB_APPEND, then zone append BIOs are used. If not, regular write BIOs are used. Simple enough I think. No need for a new flag.
On Fri, Jul 31, 2020 at 08:14:22AM +0000, Damien Le Moal wrote: > > > This was one of the reason why we chose to isolate the operation by a > > different IOCB flag and not by IOCB_APPEND alone. > > For zonefs, the plan is: > * For the sync write case, zone append is always used. > * For the async write case, if we see IOCB_APPEND, then zone append BIOs are > used. If not, regular write BIOs are used. > > Simple enough I think. No need for a new flag. Simple, but wrong. Sync vs async really doesn't matter, even sync writes will have problems if there are other writers. We need a flag for "report the actual offset for appending writes", and based on that flag we need to not allow short writes (or split extents for real file systems). We also need a fcntl or ioctl to report this max atomic write size so that applications can rely on it.
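In userspace terms, the proposal amounts to something like the following
sketch. F_GET_APPEND_LIMIT is a purely hypothetical fcntl command (with a
made-up value) and RWF_REPORT_OFFSET does not exist in any released kernel;
both only illustrate the suggested semantics:

/* Hypothetical userspace sketch of the proposed interface: query the
 * maximum size for atomic append writes, then issue appending writes
 * no larger than that, knowing the kernel would reject (not shorten)
 * oversized ones. Flag and command values are placeholders. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/uio.h>

#define F_GET_APPEND_LIMIT	1030		/* hypothetical fcntl command */
#define RWF_REPORT_OFFSET	0x00000040	/* hypothetical RWF_* flag */

static ssize_t append_record(int fd, const void *buf, size_t len)
{
	long max_append = fcntl(fd, F_GET_APPEND_LIMIT);

	if (max_append > 0 && len > (size_t)max_append)
		return -1;	/* caller must split the record itself */

	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };

	/* RWF_APPEND is real; the report flag is the proposed addition. */
	return pwritev2(fd, &iov, 1, -1, RWF_APPEND | RWF_REPORT_OFFSET);
}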
On 2020/07/31 18:14, hch@infradead.org wrote: > On Fri, Jul 31, 2020 at 08:14:22AM +0000, Damien Le Moal wrote: >> >>> This was one of the reason why we chose to isolate the operation by a >>> different IOCB flag and not by IOCB_APPEND alone. >> >> For zonefs, the plan is: >> * For the sync write case, zone append is always used. >> * For the async write case, if we see IOCB_APPEND, then zone append BIOs are >> used. If not, regular write BIOs are used. >> >> Simple enough I think. No need for a new flag. > > Simple, but wrong. Sync vs async really doesn't matter, even sync > writes will have problems if there are other writers. We need a flag > for "report the actual offset for appending writes", and based on that > flag we need to not allow short writes (or split extents for real > file systems). We also need a fcntl or ioctl to report this max atomic > write size so that applications can rely on it. > Sync writes are done under the inode lock, so there cannot be other writers at the same time. And for the sync case, since the actual written offset is necessarily equal to the file size before the write, there is no need to report it (there is no system call that can report that anyway). For this sync case, the only change that the use of zone append introduces compared to regular writes is the potential for more short writes. Adding a flag for "report the actual offset for appending writes" is fine with me, but do you also mean to use this flag for driving zone append write vs regular writes in zonefs ? The fcntl or ioctl for getting the max atomic write size would be fine too. Given that zonefs is very close to the underlying zoned drive, I was assuming that the application can simply consult the device sysfs zone_append_max_bytes queue attribute. For regular file systems, this value would be used internally only. I do not really see how it can be useful to applications. Furthermore, the file system may have a hard time giving that information to the application depending on its underlying storage configuration (e.g. erasure coding/declustered RAID).
On Fri, Jul 31, 2020 at 1:44 PM Damien Le Moal <Damien.LeMoal@wdc.com> wrote: > > On 2020/07/31 16:59, Kanchan Joshi wrote: > > On Fri, Jul 31, 2020 at 12:29 PM Damien Le Moal <Damien.LeMoal@wdc.com> wrote: > >> > >> On 2020/07/31 15:45, hch@infradead.org wrote: > >>> On Fri, Jul 31, 2020 at 06:42:10AM +0000, Damien Le Moal wrote: > >>>>> - We may not be able to use RWF_APPEND, and need exposing a new > >>>>> type/flag (RWF_INDIRECT_OFFSET etc.) user-space. Not sure if this > >>>>> sounds outrageous, but is it OK to have uring-only flag which can be > >>>>> combined with RWF_APPEND? > >>>> > >>>> Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for > >>>> raw block device accesses. We could certainly define a meaning for these in the > >>>> context of zoned block devices. > >>> > >>> We can't just add a meaning for O_APPEND on block devices now, > >>> as it was previously silently ignored. I also really don't think any > >>> of these semantics even fit the block device to start with. If you > >>> want to work on raw zones use zonefs, that's what is exists for. > >> > >> Which is fine with me. Just trying to say that I think this is exactly the > >> discussion we need to start with. What interface do we implement... > >> > >> Allowing zone append only through zonefs as the raw block device equivalent, all > >> the O_APPEND/RWF_APPEND semantic is defined and the "return written offset" > >> implementation in VFS would be common for all file systems, including regular > >> ones. Beside that, there is I think the question of short writes... Not sure if > >> short writes can currently happen with async RWF_APPEND writes to regular files. > >> I think not but that may depend on the FS. > > > > generic_write_check_limits (called by generic_write_checks, used by > > most FS) may make it short, and AFAIK it does not depend on > > async/sync. > > Johannes has a patch (not posted yet) fixing all this for zonefs, > differentiating sync and async cases, allow short writes or not, etc. This was > done by not using generic_write_check_limits() and instead writing a > zonefs_check_write() function that is zone append friendly. > > We can post that as a base for the discussion on semantic if you want... There is no problem in about how-to-do-it. That part is simple - we have the iocb, and sync/async can be known whether ki_complete callback is set. This point to be discussed was whether-to-allow-short-write-or-not if we are talking about a generic file-append-returning-location. That said, since we are talking about moving to indirect-offset in io-uring, short-write is not an issue anymore I suppose (it goes back to how it was). But the unsettled thing is - whether we can use O/RWF_APPEND with indirect-offset (pointer) scheme. > > This was one of the reason why we chose to isolate the operation by a > > different IOCB flag and not by IOCB_APPEND alone. > > For zonefs, the plan is: > * For the sync write case, zone append is always used. > * For the async write case, if we see IOCB_APPEND, then zone append BIOs are > used. If not, regular write BIOs are used. > > Simple enough I think. No need for a new flag. Maybe simple if we only think of ZoneFS (how user-space sends async-append and gets result is a common problem). Add Block I/O in scope - it gets slightly more complicated because it has to cater to non-zoned devices. 
And there already is a well-established understanding that append does nothing
for a block device, so code like "if (flags & IOCB_APPEND) { do something; }"
in the block I/O path may surprise someone resuming after a hiatus.

Add file I/O in scope - it gets further complicated. I think it would make
sense to make this opt-in rather than compulsory, but most file systems
already implement a behavior for IOCB_APPEND. How do we make it opt-in without
new flags? New flags (FMODE_SOME_NAME, IOCB_SOME_NAME) serve that purpose.

Please assess the need (for isolation) considering all three cases.
On Fri, Jul 31, 2020 at 09:34:50AM +0000, Damien Le Moal wrote:
> Sync writes are done under the inode lock, so there cannot be other writers at
> the same time. And for the sync case, since the actual written offset is
> necessarily equal to the file size before the write, there is no need to report
> it (there is no system call that can report that anyway). For this sync case,
> the only change that the use of zone append introduces compared to regular
> writes is the potential for more short writes.
>
> Adding a flag for "report the actual offset for appending writes" is fine with
> me, but do you also mean to use this flag for driving zone append write vs
> regular writes in zonefs ?

Let's keep semantics and implementation separate. For the case
where we report the actual offset we need a size limitation and no
short writes. Anything with those semantics can be implemented using
Zone Append trivially in zonefs, and we don't even need the exclusive
lock in that case. But even without that flag anything that has an
exclusive lock can at least in theory be implemented using Zone Append,
it just needs support for submitting another request from the I/O
completion handler of the first. I just don't think it is worth it -
with the exclusive lock we do have access to the zone serialized so a
normal write works just fine. Both for the sync and async case.

> The fcntl or ioctl for getting the max atomic write size would be fine too.
> Given that zonefs is very close to the underlying zoned drive, I was assuming
> that the application can simply consult the device sysfs zone_append_max_bytes
> queue attribute.

For zonefs we can, yes. But in many ways that is a lot more cumbersome
than having an API that works on the fd you want to write on.

> For regular file systems, this value would be used internally
> only. I do not really see how it can be useful to applications. Furthermore, the
> file system may have a hard time giving that information to the application
> depending on its underlying storage configuration (e.g. erasure
> coding/declustered RAID).

File systems might have all kinds of limits of their own (e.g. extent
sizes). And a good API that just works everywhere and is properly
documented is much better than heaps of cargo culted crap all over
applications.
On 2020/07/31 18:41, hch@infradead.org wrote: > On Fri, Jul 31, 2020 at 09:34:50AM +0000, Damien Le Moal wrote: >> Sync writes are done under the inode lock, so there cannot be other writers at >> the same time. And for the sync case, since the actual written offset is >> necessarily equal to the file size before the write, there is no need to report >> it (there is no system call that can report that anyway). For this sync case, >> the only change that the use of zone append introduces compared to regular >> writes is the potential for more short writes. >> >> Adding a flag for "report the actual offset for appending writes" is fine with >> me, but do you also mean to use this flag for driving zone append write vs >> regular writes in zonefs ? > > Let's keep semantics and implementation separate. For the case > where we report the actual offset we need a size imitation and no > short writes. OK. So the name of the flag confused me. The flag name should reflect "Do zone append and report written offset", right ? Just to clarify, here was my thinking for zonefs: 1) file open with O_APPEND/aio has RWF_APPEND: then it is OK to assume that the application did not set the aio offset since APPEND means offset==file size. In that case, do zone append and report back the written offset. 2) file open without O_APPEND/aio does not have RWF_APPEND: the application specified an aio offset and we must respect it, write it that exact same order, so use regular writes. For regular file systems, with case (1) condition, the FS use whatever it wants for the implementation, and report back the written offset, which will always be the file size at the time the aio was issued. Your method with a new flag to switch between (1) and (2) is OK with me, but the "no short writes" may be difficult to achieve in a regular FS, no ? I do not think current FSes have such guarantees... Especially in the case of buffered async writes I think. > Anything with those semantics can be implemented using Zone Append > trivially in zonefs, and we don't even need the exclusive lock in that > case. But even without that flag anything that has an exclusive lock can > at least in theory be implemented using Zone Append, it just need > support for submitting another request from the I/O completion handler > of the first. I just don't think it is worth it - with the exclusive > lock we do have access to the zone serialied so a normal write works > just fine. Both for the sync and async case. We did switch to have zonefs do append writes in the sync case, always. Hmmm... Not sure anymore it was such a good idea. > >> The fcntl or ioctl for getting the max atomic write size would be fine too. >> Given that zonefs is very close to the underlying zoned drive, I was assuming >> that the application can simply consult the device sysfs zone_append_max_bytes >> queue attribute. > > For zonefs we can, yes. But in many ways that is a lot more cumbersome > that having an API that works on the fd you want to write on. Got it. Makes sense. >> For regular file systems, this value would be used internally >> only. I do not really see how it can be useful to applications. Furthermore, the >> file system may have a hard time giving that information to the application >> depending on its underlying storage configuration (e.g. erasure >> coding/declustered RAID). > > File systems might have all kinds of limits of their own (e.g. extent > sizes). 
> And a good API that just works everywhere and is properly
> documented is much better than heaps of cargo culted crap all over
> applications.

OK. Makes sense. That said, taking Naohiro's work on btrfs as an example, zone
append is used for every data write, no matter if it is O_APPEND/RWF_APPEND or
not. The size limitation for zone append writes is not needed at all by
applications. The maximum extent size is aligned to the max append write size
internally, and if the application issued a larger write, it loops over
multiple extents, similarly to what any regular write may do (if there is
overwrite etc...).

For the regular FS case, my thinking on the semantic really was: if asked to
do so, return the written offset for RWF_APPEND aios. And I think that
implementing that does not depend in any way on what the FS does internally.

But I think I am starting to see the picture you are drawing here:
1) Introduce a fcntl() to get "maximum size for atomic append writes"
2) Introduce an aio flag specifying "Do atomic append write and report written
offset"
3) For an aio specifying "Do atomic append write and report written offset",
if the aio is larger than "maximum size for atomic append writes", fail it on
submission, no short writes.
4) For any other aio, it is business as usual, aios are processed as they are
now.

And the implementation is actually completely free to use zone append writes
or regular writes regardless of the "Do atomic append write and report written
offset" flag being used or not.

Is that your thinking ? That would work for me. That actually ends up
completely unifying the interface behavior for zonefs and regular FS. Same
semantics.
On Fri, Jul 31, 2020 at 10:16:49AM +0000, Damien Le Moal wrote:
>>
>> Let's keep semantics and implementation separate. For the case
>> where we report the actual offset we need a size limitation and no
>> short writes.
>
> OK. So the name of the flag confused me. The flag name should reflect "Do zone
> append and report written offset", right ?

Well, we already have O_APPEND, which is the equivalent of appending to
the write pointer. The only interesting addition is that we also want
to report where we wrote. So I'd rather have RWF_REPORT_OFFSET or so.

> Just to clarify, here was my thinking for zonefs:
> 1) file open with O_APPEND/aio has RWF_APPEND: then it is OK to assume that the
> application did not set the aio offset since APPEND means offset==file size. In
> that case, do zone append and report back the written offset.

Yes.

> 2) file open without O_APPEND/aio does not have RWF_APPEND: the application
> specified an aio offset and we must respect it, write it that exact same order,
> so use regular writes.

Yes.

> For regular file systems, with case (1) condition, the FS use whatever it wants
> for the implementation, and report back the written offset, which will always
> be the file size at the time the aio was issued.

Yes.

> Your method with a new flag to switch between (1) and (2) is OK with me, but the
> "no short writes" may be difficult to achieve in a regular FS, no ? I do not
> think current FSes have such guarantees... Especially in the case of buffered
> async writes I think.

I'll have to check what Jens recently changed with io_uring, but
traditionally the only short write case for a normal file system is
the artificial INT_MAX limit for the write size.

>> Anything with those semantics can be implemented using Zone Append
>> trivially in zonefs, and we don't even need the exclusive lock in that
>> case. But even without that flag anything that has an exclusive lock can
>> at least in theory be implemented using Zone Append, it just needs
>> support for submitting another request from the I/O completion handler
>> of the first. I just don't think it is worth it - with the exclusive
>> lock we do have access to the zone serialized so a normal write works
>> just fine. Both for the sync and async case.
>
> We did switch to have zonefs do append writes in the sync case, always. Hmmm...
> Not sure anymore it was such a good idea.

It might be a good idea as long as we have the heavy-handed scheduler
locking. But if we don't have that there doesn't seem to be much of a
reason for zone append.

> OK. Makes sense. That said, taking Naohiro's work on btrfs as an example, zone
> append is used for every data write, no matter if it is O_APPEND/RWF_APPEND or
> not. The size limitation for zone append writes is not needed at all by
> applications. The maximum extent size is aligned to the max append write size
> internally, and if the application issued a larger write, it loops over multiple
> extents, similarly to what any regular write may do (if there is overwrite etc...).

True, we probably don't need the limit for a normal file system.

> For the regular FS case, my thinking on the semantic really was: if asked to do
> so, return the written offset for RWF_APPEND aios. And I think that
> implementing that does not depend in any way on what the FS does internally.

Exactly.
>
> But I think I am starting to see the picture you are drawing here:
> 1) Introduce a fcntl() to get "maximum size for atomic append writes"
> 2) Introduce an aio flag specifying "Do atomic append write and report written
> offset"

I think we just need the "report written offset" flag, in fact even for
zonefs with the right locking we can handle unlimited write sizes, just
with lower performance. E.g.

1) check if the write size is larger than the zone append limit

if no:

 - take the shared lock and issue a zone append, done

if yes:

 - take the exclusive per-inode (zone) lock and just issue either normal
   writes or zone append at your choice, relying on the lock to
   serialize other writers. For the async case this means we need a
   lock that can be released in a different context than it was acquired,
   which is a little ugly but can be done.

> And the implementation is actually completely free to use zone append writes or
> regular writes regardless of the "Do atomic append write and report written
> offset" flag being used or not.

Yes, that was my point about keeping the semantics and implementation
separate.
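In code form, that branching could look like the sketch below. This is
illustrative only; struct zone_file, submit_zone_append() and
submit_regular_writes() are made-up names, not zonefs functions:

/*
 * Illustrative sketch of the locking scheme described above: writes
 * that fit the device's zone append limit take a shared lock and use a
 * single zone append; larger writes fall back to an exclusive per-zone
 * lock and serialized regular writes at the write pointer.
 */
static ssize_t zone_file_append(struct zone_file *zf, struct iov_iter *from)
{
	ssize_t ret;

	if (iov_iter_count(from) <= zf->zone_append_max_bytes) {
		/* Fits in one zone append: shared lock is enough. */
		down_read(&zf->io_lock);
		ret = submit_zone_append(zf, from);
		up_read(&zf->io_lock);
		return ret;
	}

	/*
	 * Too large for a single zone append: writes are serialized by the
	 * exclusive lock. For async I/O this lock must be released from the
	 * completion context, hence the need for a non-owner lock.
	 */
	down_write(&zf->io_lock);
	ret = submit_regular_writes(zf, from);
	up_write(&zf->io_lock);
	return ret;
}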
And FYI, this is what I'd do for a hacky aio-only prototype (untested): diff --git a/fs/aio.c b/fs/aio.c index 91e7cc4a9f179b..42b1934e38758b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1438,7 +1438,10 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) } iocb->ki_res.res = res; - iocb->ki_res.res2 = res2; + if ((kiocb->ki_flags & IOCB_REPORT_OFFSET) && res > 0) + iocb->ki_res.res2 = kiocb->ki_pos - res; + else + iocb->ki_res.res2 = res2; iocb_put(iocb); } @@ -1452,6 +1455,8 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) req->ki_flags = iocb_flags(req->ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; + if (iocb->aio_flags & IOCB_FLAG_REPORT_OFFSET) + req->ki_flags |= IOCB_REPORT_OFFSET; req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp)); if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { /* diff --git a/include/linux/fs.h b/include/linux/fs.h index f5abba86107d86..522b0a3437d420 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -316,6 +316,7 @@ enum rw_hint { #define IOCB_WRITE (1 << 6) #define IOCB_NOWAIT (1 << 7) #define IOCB_NOIO (1 << 9) +#define IOCB_REPORT_OFFSET (1 << 10) struct kiocb { struct file *ki_filp; diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index 8387e0af0f768a..e4313d7aa3b7e7 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -55,6 +55,7 @@ enum { */ #define IOCB_FLAG_RESFD (1 << 0) #define IOCB_FLAG_IOPRIO (1 << 1) +#define IOCB_FLAG_REPORT_OFFSET (1 << 2) /* read() from /dev/aio returns these structures. */ struct io_event {
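Assuming that prototype were applied, userspace consumption could look like
the following sketch, using the raw aio syscalls. IOCB_FLAG_REPORT_OFFSET
mirrors the value from the patch above and exists in no released uapi header:

/*
 * Sketch of how an application could consume the prototype above:
 * submit an appending write with IOCB_FLAG_REPORT_OFFSET and read the
 * written offset back from io_event.res2.
 */
#include <linux/aio_abi.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef IOCB_FLAG_REPORT_OFFSET
#define IOCB_FLAG_REPORT_OFFSET	(1 << 2)	/* from the prototype patch */
#endif

int append_and_get_offset(aio_context_t ctx, int fd,
			  void *buf, size_t len, long long *offset)
{
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;			/* fd opened with O_APPEND */
	cb.aio_lio_opcode = IOCB_CMD_PWRITE;
	cb.aio_buf = (unsigned long)buf;
	cb.aio_nbytes = len;
	cb.aio_flags = IOCB_FLAG_REPORT_OFFSET;

	if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
		return -1;
	if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) != 1)
		return -1;
	if ((long)ev.res < 0)
		return (int)ev.res;

	*offset = ev.res2;	/* start offset the append landed at */
	return 0;
}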
On Fri, Jul 31, 2020 at 6:38 PM hch@infradead.org <hch@infradead.org> wrote: > > And FYI, this is what I'd do for a hacky aio-only prototype (untested): > > > diff --git a/fs/aio.c b/fs/aio.c > index 91e7cc4a9f179b..42b1934e38758b 100644 > --- a/fs/aio.c > +++ b/fs/aio.c > @@ -1438,7 +1438,10 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) > } > > iocb->ki_res.res = res; > - iocb->ki_res.res2 = res2; > + if ((kiocb->ki_flags & IOCB_REPORT_OFFSET) && res > 0) > + iocb->ki_res.res2 = kiocb->ki_pos - res; > + else > + iocb->ki_res.res2 = res2; > iocb_put(iocb); > } > > @@ -1452,6 +1455,8 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) > req->ki_flags = iocb_flags(req->ki_filp); > if (iocb->aio_flags & IOCB_FLAG_RESFD) > req->ki_flags |= IOCB_EVENTFD; > + if (iocb->aio_flags & IOCB_FLAG_REPORT_OFFSET) > + req->ki_flags |= IOCB_REPORT_OFFSET; > req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp)); > if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { > /* > diff --git a/include/linux/fs.h b/include/linux/fs.h > index f5abba86107d86..522b0a3437d420 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -316,6 +316,7 @@ enum rw_hint { > #define IOCB_WRITE (1 << 6) > #define IOCB_NOWAIT (1 << 7) > #define IOCB_NOIO (1 << 9) > +#define IOCB_REPORT_OFFSET (1 << 10) > > struct kiocb { > struct file *ki_filp; > diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h > index 8387e0af0f768a..e4313d7aa3b7e7 100644 > --- a/include/uapi/linux/aio_abi.h > +++ b/include/uapi/linux/aio_abi.h > @@ -55,6 +55,7 @@ enum { > */ > #define IOCB_FLAG_RESFD (1 << 0) > #define IOCB_FLAG_IOPRIO (1 << 1) > +#define IOCB_FLAG_REPORT_OFFSET (1 << 2) > > /* read() from /dev/aio returns these structures. */ > struct io_event { Looks good, but it drops io_uring. How about two flags - 1. RWF_REPORT_OFFSET (only for aio) ----> aio fails the second one 2. RWF_REPORT_OFFSET_INDIRECT (for io_uring). ----> uring fails the first one Since these are RWF flags, they can be used by other sync/async transports also in future if need be. Either of these flags will set single IOCB_REPORT_OFFSET, which can be used by FS/Block etc (they don't have to worry how uring/aio sends it up). 
This is what I mean in code -

diff --git a/fs/aio.c b/fs/aio.c
index 91e7cc4a9f17..307dfbfb04f7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1472,6 +1472,11 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
 	if (unlikely(ret))
 		return ret;
+
+	/* support only direct offset */
+	if (unlikely(iocb->aio_rw_flags & RWF_REPORT_OFFSET_INDIRECT))
+		return -EOPNOTSUPP;
+
 	req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
 	return 0;

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3e406bc1f855..5fa21644251f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2451,6 +2451,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	struct kiocb *kiocb = &req->rw.kiocb;
 	unsigned ioprio;
 	int ret;
+	rwf_t rw_flags;

 	if (S_ISREG(file_inode(req->file)->i_mode))
 		req->flags |= REQ_F_ISREG;
@@ -2462,9 +2463,13 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	}
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
-	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+	rw_flags = READ_ONCE(sqe->rw_flags);
+	ret = kiocb_set_rw_flags(kiocb, rw_flags);
 	if (unlikely(ret))
 		return ret;
+	/* support only indirect offset */
+	if (unlikely(rw_flags & RWF_REPORT_OFFSET_DIRECT))
+		return -EOPNOTSUPP;

 	ioprio = READ_ONCE(sqe->ioprio);
 	if (ioprio) {

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8a00ba99284e..fe2f1f5c5d33 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3296,8 +3296,17 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 		ki->ki_flags |= IOCB_DSYNC;
 	if (flags & RWF_SYNC)
 		ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
-	if (flags & RWF_APPEND)
+	if (flags & RWF_APPEND) {
 		ki->ki_flags |= IOCB_APPEND;
+		/*
+		 * 1. These flags do not make sense when used standalone
+		 * 2. RWF_REPORT_OFFSET_DIRECT = report result directly (for aio)
+		 * 3. RWF_REPORT_OFFSET_INDIRECT = use pointer (for io_uring)
+		 */
+		if (flags & RWF_REPORT_OFFSET_DIRECT ||
+		    flags & RWF_REPORT_OFFSET_INDIRECT)
+			ki->ki_flags |= IOCB_REPORT_OFFSET;
+	}
On 2020/07/31 21:51, hch@infradead.org wrote: > On Fri, Jul 31, 2020 at 10:16:49AM +0000, Damien Le Moal wrote: >>> >>> Let's keep semantics and implementation separate. For the case >>> where we report the actual offset we need a size imitation and no >>> short writes. >> >> OK. So the name of the flag confused me. The flag name should reflect "Do zone >> append and report written offset", right ? > > Well, we already have O_APPEND, which is the equivalent to append to > the write pointer. The only interesting addition is that we also want > to report where we wrote. So I'd rather have RWF_REPORT_OFFSET or so. That works for me. But that rules out having the same interface for raw block devices since O_APPEND has no meaning in that case. So for raw block devices, it will have to be through zonefs. That works for me, and I think it was your idea all along. Can you confirm please ? >> But I think I am starting to see the picture you are drawing here: >> 1) Introduce a fcntl() to get "maximum size for atomic append writes" >> 2) Introduce an aio flag specifying "Do atomic append write and report written >> offset" > > I think we just need the 'report written offset flag', in fact even for > zonefs with the right locking we can handle unlimited write sizes, just > with lower performance. E.g. > > 1) check if the write size is larger than the zone append limit > > if no: > > - take the shared lock and issue a zone append, done > > if yes: > > - take the exclusive per-inode (zone) lock and just issue either normal > writes or zone append at your choice, relying on the lock to > serialize other writers. For the async case this means we need a > lock than can be release in a different context than it was acquired, > which is a little ugly but can be done. Yes, that would be possible. But likely, this will also need calls to inode_dio_wait() to avoid ending up with a mix of regular write and zone append writes in flight (which likely would result in the regular write failing as the zone append writes would go straight to the device without waiting for the zone write lock like regular writes do). This all sound sensible to me. One last point though, specific to zonefs: if the user opens a zone file with O_APPEND, I do want to have that necessarily mean "use zone append". And same for the "RWF_REPORT_OFFSET". The point here is that both O_APPEND and RWF_REPORT_OFFSET can be used with both regular writes and zone append writes, but none of them actually clearly specify if the application/user tolerates writing data to disk in a different order than the issuing order... So another flag to indicate "atomic out-of-order writes" (== zone append) ? Without a new flag, we can only have these cases to enable zone append: 1) No O_APPEND: ignore RWF_REPORT_OFFSET and use regular writes using ->aio_ofst 2) O_APPEND without RWF_REPORT_OFFSET: use regular write with file size as ->aio_ofst (as done today already), do not report the write offset on completion. 3) O_APPEND with RWF_REPORT_OFFSET: use zone append, implying potentially out of order writes. And case (3) is not nice. I would rather prefer something like: 3) O_APPEND with RWF_REPORT_OFFSET: use regular write with file size as ->aio_ofst (as done today already), report the write offset on completion. 4) O_APPEND with RWF_ATOMIC_APPEND: use zone append writes, do not report the write offset on completion. 5) O_APPEND with RWF_ATOMIC_APPEND+RWF_REPORT_OFFSET: use zone append writes, report the write offset on completion. 
RWF_ATOMIC_APPEND could also always imply RWF_REPORT_OFFSET. Any aio with
RWF_ATOMIC_APPEND that is too large would be failed. Thoughts ?
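Put as code, the dispatch for those cases might look like this sketch.
RWF_ATOMIC_APPEND, RWF_REPORT_OFFSET, IOCB_ZONE_APPEND and IOCB_REPORT_OFFSET
are all proposals from this thread, not existing kernel flags:

/*
 * Illustrative sketch of the five cases enumerated above.
 */
static int setup_append_write(struct kiocb *ki, rwf_t flags, size_t count,
			      size_t append_limit)
{
	if (!(ki->ki_flags & IOCB_APPEND))
		return 0;	/* case 1: regular write at the given offset */

	if (flags & RWF_ATOMIC_APPEND) {
		/* cases 4 and 5: fail oversized aios, no short writes */
		if (count > append_limit)
			return -EINVAL;
		ki->ki_flags |= IOCB_ZONE_APPEND;
	}

	/* cases 3 and 5 (RWF_ATOMIC_APPEND may imply this): report offset */
	if (flags & (RWF_REPORT_OFFSET | RWF_ATOMIC_APPEND))
		ki->ki_flags |= IOCB_REPORT_OFFSET;

	return 0;		/* case 2: O_APPEND alone stays as today */
}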
On Wed, Aug 05, 2020 at 07:35:28AM +0000, Damien Le Moal wrote:
> > the write pointer. The only interesting addition is that we also want
> > to report where we wrote. So I'd rather have RWF_REPORT_OFFSET or so.
>
> That works for me. But that rules out having the same interface for raw block
> devices since O_APPEND has no meaning in that case. So for raw block devices, it
> will have to be through zonefs. That works for me, and I think it was your idea
> all along. Can you confirm please ?

Yes. I don't think raw syscall level access to the zone append
primitive makes sense. Either use zonefs for a file-like API, or
use the NVMe pass through interface for 100% raw access.

> > - take the exclusive per-inode (zone) lock and just issue either normal
> > writes or zone append at your choice, relying on the lock to
> > serialize other writers. For the async case this means we need a
> > lock that can be released in a different context than it was acquired,
> > which is a little ugly but can be done.
>
> Yes, that would be possible. But likely, this will also need calls to
> inode_dio_wait() to avoid ending up with a mix of regular write and zone append
> writes in flight (which likely would result in the regular write failing as the
> zone append writes would go straight to the device without waiting for the zone
> write lock like regular writes do).

inode_dio_wait is a really bad implementation of almost a lock. I've
started some work that I need to finish to just replace it with proper
non-owner rwsems (or even the range locks Dave has been looking into).

> This all sound sensible to me. One last point though, specific to zonefs: if the
> user opens a zone file with O_APPEND, I do want to have that necessarily mean
> "use zone append". And same for the "RWF_REPORT_OFFSET". The point here is that
> both O_APPEND and RWF_REPORT_OFFSET can be used with both regular writes and
> zone append writes, but none of them actually clearly specify if the
> application/user tolerates writing data to disk in a different order than the
> issuing order... So another flag to indicate "atomic out-of-order writes" (==
> zone append) ?

O_APPEND pretty much implies out of order, as there is no way for an
application to know which thread wins the race to write the next chunk.
On 2020/08/14 17:14, hch@infradead.org wrote:
> On Wed, Aug 05, 2020 at 07:35:28AM +0000, Damien Le Moal wrote:
>>> the write pointer. The only interesting addition is that we also want
>>> to report where we wrote. So I'd rather have RWF_REPORT_OFFSET or so.
>>
>> That works for me. But that rules out having the same interface for raw block
>> devices since O_APPEND has no meaning in that case. So for raw block devices, it
>> will have to be through zonefs. That works for me, and I think it was your idea
>> all along. Can you confirm please ?
>
> Yes. I don't think raw syscall level access to the zone append
> primitive makes sense. Either use zonefs for a file-like API, or
> use the NVMe pass through interface for 100% raw access.
>
>>> - take the exclusive per-inode (zone) lock and just issue either normal
>>> writes or zone append at your choice, relying on the lock to
>>> serialize other writers. For the async case this means we need a
>>> lock that can be released in a different context than it was acquired,
>>> which is a little ugly but can be done.
>>
>> Yes, that would be possible. But likely, this will also need calls to
>> inode_dio_wait() to avoid ending up with a mix of regular write and zone append
>> writes in flight (which likely would result in the regular write failing as the
>> zone append writes would go straight to the device without waiting for the zone
>> write lock like regular writes do).
>
> inode_dio_wait is a really bad implementation of almost a lock. I've
> started some work that I need to finish to just replace it with proper
> non-owner rwsems (or even the range locks Dave has been looking into).

OK.

>> This all sound sensible to me. One last point though, specific to zonefs: if the
>> user opens a zone file with O_APPEND, I do want to have that necessarily mean
>> "use zone append". And same for the "RWF_REPORT_OFFSET". The point here is that
>> both O_APPEND and RWF_REPORT_OFFSET can be used with both regular writes and
>> zone append writes, but none of them actually clearly specify if the
>> application/user tolerates writing data to disk in a different order than the
>> issuing order... So another flag to indicate "atomic out-of-order writes" (==
>> zone append) ?
>
> O_APPEND pretty much implies out of order, as there is no way for an
> application to know which thread wins the race to write the next chunk.

Yes and no. If the application threads do not synchronize their calls to
io_submit(), then yes indeed, things can get out of order. But if the
application threads are synchronized, then the offset set for each append AIO
will be in sequence of submission, so the user will not see its writes
completing at different write offsets than these implied offsets.

If O_APPEND is the sole flag that triggers the use of zone append, then we
lose this current implied and predictable positioning of the writes. Even for
a single thread by the way. Hence my thinking to preserve this, meaning that
O_APPEND alone will see zonefs using regular writes. O_APPEND or RWF_APPEND +
RWF_SOME_NICELY_NAMED_FLAG_for_ZA would trigger the use of zone append, with
the implied effect that writes may not end up in the same order as they are
submitted. So the flag name could be: RWF_RELAXED_ORDER or something like that
(I am bad at naming...).

And we also have RWF_REPORT_OFFSET which applies to all cases of append
writes, regardless of the command used.

Does this make sense ?
On Fri, Aug 14, 2020 at 08:27:13AM +0000, Damien Le Moal wrote:
>>
>> O_APPEND pretty much implies out of order, as there is no way for an
>> application to know which thread wins the race to write the next chunk.
>
> Yes and no. If the application threads do not synchronize their calls to
> io_submit(), then yes indeed, things can get out of order. But if the
> application threads are synchronized, then the offset set for each append AIO
> will be in sequence of submission, so the user will not see its writes
> completing at different write offsets than these implied offsets.

Nothing guarantees any kind of ordering for two separate io_submit calls.
The kernel may delay one of them for any reason.

Now if you are doing two fully synchronous write calls on an
O_APPEND fd, yes they are serialized. But using Zone Append won't
change that.
On 2020/08/14 21:04, hch@infradead.org wrote:
> On Fri, Aug 14, 2020 at 08:27:13AM +0000, Damien Le Moal wrote:
>>>
>>> O_APPEND pretty much implies out of order, as there is no way for an
>>> application to know which thread wins the race to write the next chunk.
>>
>> Yes and no. If the application threads do not synchronize their calls to
>> io_submit(), then yes indeed, things can get out of order. But if the
>> application threads are synchronized, then the offset set for each append AIO
>> will be in sequence of submission, so the user will not see its writes
>> completing at different write offsets than these implied offsets.
>
> Nothing guarantees any kind of ordering for two separate io_submit calls.
> The kernel may delay one of them for any reason.

Ah. Yes. The inode locking is at the single aio issuing level, not the
io_submit syscall level... So yes, in the end, the aio offsets and their
execution order can be anything. I see it now. So O_APPEND implying zone
append is fine for zonefs.

> Now if you are doing two fully synchronous write calls on an
> O_APPEND fd, yes they are serialized. But using Zone Append won't
> change that.

Yep. That zonefs already does.

OK. So I think I will send a writeup of the semantics discussed so far. We
also still need a solution for the io_uring interface for the written-offset
report, and then we can implement.
On Fri, Aug 14, 2020 at 1:44 PM hch@infradead.org <hch@infradead.org> wrote:
>
> On Wed, Aug 05, 2020 at 07:35:28AM +0000, Damien Le Moal wrote:
> > > the write pointer. The only interesting addition is that we also want
> > > to report where we wrote. So I'd rather have RWF_REPORT_OFFSET or so.
> >
> > That works for me. But that rules out having the same interface for raw block
> > devices since O_APPEND has no meaning in that case. So for raw block devices, it
> > will have to be through zonefs. That works for me, and I think it was your idea
> > all along. Can you confirm please ?
>
> Yes. I don't think raw syscall level access to the zone append
> primitive makes sense. Either use zonefs for a file-like API, or
> use the NVMe pass through interface for 100% raw access.

But there are use-cases which benefit from supporting zone-append on the raw
block-dev path.
Certain user-space log-structured/CoW FS/DBs will use the device that way.
Aerospike is one example.
Pass-through is synchronous, and we lose the ability to use io_uring.

For async uring/aio to a block-dev, the file pointer will not be moved to EOF,
but that position was not very important anyway - with this interface we
expect many async appends outstanding, all issued with the zone start as the
offset.

Do you think RWF_APPEND | RWF_REPORT_OFFSET_DIRECT/INDIRECT is too bad for the
direct block-dev? Could you please suggest another way to go about it?

--
Kanchan
On Mon, Sep 07, 2020 at 12:31:42PM +0530, Kanchan Joshi wrote: > But there are use-cases which benefit from supporting zone-append on > raw block-dev path. > Certain user-space log-structured/cow FS/DB will use the device that > way. Aerospike is one example. > Pass-through is synchronous, and we lose the ability to use io-uring. So use zonefs, which is designed exactly for that use case.
On Tue, Sep 8, 2020 at 8:48 PM hch@infradead.org <hch@infradead.org> wrote:
>
> On Mon, Sep 07, 2020 at 12:31:42PM +0530, Kanchan Joshi wrote:
> > But there are use-cases which benefit from supporting zone-append on
> > the raw block-dev path.
> > Certain user-space log-structured/CoW FS/DBs will use the device that
> > way. Aerospike is one example.
> > Pass-through is synchronous, and we lose the ability to use io_uring.
>
> So use zonefs, which is designed exactly for that use case.

Not specific to zone-append, but in general it may not be good to lock new
features/interfaces to ZoneFS alone, given that the direct-block interface has
its own merits.
Mapping one file to one zone is good for some use-cases, but limiting for
others. Some user-space FS/DBs would be more efficient (less metadata, less
indirection) with the freedom to decide the file-to-zone mapping/placement.
- RocksDB and those LSM-style DBs would map an SSTable to a zone, but an
SSTable file may be too small (initially) and may become too large (after
compaction) for a zone.
- The internal parallelism of a single zone is a design choice, and depends on
the drive. Writing multiple zones in parallel (striped/RAID way) can give
better performance than writing on one. In that case one would want a file
that seamlessly combines multiple zones in a striped fashion.

Also it seems difficult (compared to the block dev) to fit the simple-copy TP
in ZoneFS. The new command needs: one NVMe drive, a list of source LBAs and
one destination LBA. In ZoneFS, we would deal with N+1 file descriptors (N
source zone files, and one destination zone file) for that. While with the
block interface, we do not need more than one file descriptor representing the
entire device. With more zone files, we face open/close overhead too.
On 2020/09/25 2:20, Kanchan Joshi wrote:
> On Tue, Sep 8, 2020 at 8:48 PM hch@infradead.org <hch@infradead.org> wrote:
>>
>> On Mon, Sep 07, 2020 at 12:31:42PM +0530, Kanchan Joshi wrote:
>>> But there are use-cases which benefit from supporting zone-append on
>>> raw block-dev path.
>>> Certain user-space log-structured/cow FS/DB will use the device that
>>> way. Aerospike is one example.
>>> Pass-through is synchronous, and we lose the ability to use io-uring.
>>
>> So use zonefs, which is designed exactly for that use case.
>
> Not specific to zone-append, but in general it may not be good to lock
> new features/interfaces to ZoneFS alone, given that the direct-block
> interface has its own merits.
> Mapping one file to one zone is good for some use-cases, but limiting
> for others. Some user-space FS/DBs would be more efficient (less
> metadata, less indirection) with the freedom to decide file-to-zone
> mapping/placement.

There is no metadata in zonefs. One file == one zone, and the mapping between
zonefs files and zones is static, determined at mount time simply using report
zones. Zonefs files cannot be renamed nor deleted in any way. Choosing a zonefs
file *is* the same as choosing a zone. Zonefs is *not* a POSIX file system doing
dynamic block allocation to files. The backing storage of files in zonefs is
static and fixed to the zone they represent. The difference between zonefs and a
raw zoned block device is the API that has to be used by the application, that
is, a file descriptor representing the entire disk for the raw disk vs a file
descriptor representing one zone in zonefs. Note that the latter has *a lot* of
advantages over the former: it enables O_APPEND use, protects against bugs with
user write offset mistakes, adds consistency of cached data against zone resets,
and more.

> - Rocksdb and those LSM style DBs would map an SSTable to a zone, but
> an SSTable file may be too small (initially) and may become too large
> (after compaction) for a zone.

You are contradicting yourself here. If an SSTable is mapped to a zone, then its
size cannot exceed the zone capacity, regardless of the interface used to access
the zones. And except for L0 tables which can be smaller (and are in memory
anyway), all levels' tables have the same maximum size, which for zoned drives
must be the zone capacity. In any case, solving any problem in this area does
not depend in any way on the zonefs vs raw disk interface. The implementation
will differ depending on the chosen interface, but what needs to be done to map
SSTables to zones is the same in both cases.

> - The internal parallelism of a single zone is a design choice, and
> depends on the drive. Writing multiple zones in parallel (striped/RAID
> way) can give better performance than writing to one. In that case one
> would want a file that seamlessly combines multiple zones in a striped
> fashion.

Then write a FS for that... Or have a library do it in user space. For the
library case, the implementation will differ for zonefs vs raw disk due to the
different API (regular file vs block device file), but the principles to follow
for striping zones into a single storage object remain the same.

> Also it seems difficult (compared to block dev) to fit the simple-copy TP
> in ZoneFS. The new
> command needs: one NVMe drive, a list of source LBAs and one destination
> LBA. In ZoneFS, we would deal with N+1 file-descriptors (N source zone
> files, and one destination zone file) for that. With the block
> interface, we do not need more than one file-descriptor representing
> the entire device. With more zone-files, we face open/close overhead too.

Are you expecting simple-copy to allow requests that are not zone aligned ? I do
not think that will ever happen. Otherwise, the gotcha cases for it would be far
too numerous. Simple-copy is essentially an optimized regular write command.
Similarly to that command, it will not allow copies over zone boundaries and
will need the destination LBA to be aligned to the destination zone WP. I have
not checked the TP though and given the NVMe NDA, I will stop the discussion here.

sendfile() could be used as the interface for simple-copy. Implementing that in
zonefs would not be that hard. What is your plan for a simple-copy interface for
the raw block device ? An ioctl ? sendfile() too ? As with any other user-level
API, we should not be restricted to a particular device type if we can avoid it,
so in-kernel emulation of the feature is needed for devices that do not have
simple-copy or SCSI extended copy. sendfile() seems to me like the best choice
since all of that is already implemented there.

As for the open()/close() overhead for zonefs, maybe some use cases may suffer
from it, but my tests with LevelDB+zonefs did not show any significant
difference. zonefs open()/close() operations are way faster than for a regular
file system since there is no metadata and all inodes always exist in-memory.
And zonefs now supports MAR/MOR limits for O_WRONLY open(). That can simplify
things for the user.
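For reference, the O_APPEND usage on zonefs described above looks
roughly like this. A minimal sketch, assuming zonefs is mounted at
/mnt/zonefs (path illustrative), a single writer, and direct, aligned
I/O as required for sequential zone files:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	ssize_t ret;
	/* One zonefs file == one zone; seq/0 is the first sequential zone. */
	int fd = open("/mnt/zonefs/seq/0", O_WRONLY | O_APPEND | O_DIRECT);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0, 4096);

	/* O_APPEND places the write at the zone write pointer. */
	ret = write(fd, buf, 4096);
	if (ret < 0)
		perror("write");
	else	/* single-writer case: the data landed at EOF - ret */
		printf("wrote at offset %lld\n",
		       (long long)(lseek(fd, 0, SEEK_END) - ret));

	free(buf);
	close(fd);
	return 0;
}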
On Fri, Sep 25, 2020 at 8:22 AM Damien Le Moal <Damien.LeMoal@wdc.com> wrote:
>
> On 2020/09/25 2:20, Kanchan Joshi wrote:
> > On Tue, Sep 8, 2020 at 8:48 PM hch@infradead.org <hch@infradead.org> wrote:
> >>
> >> On Mon, Sep 07, 2020 at 12:31:42PM +0530, Kanchan Joshi wrote:
> >>> But there are use-cases which benefit from supporting zone-append on
> >>> raw block-dev path.
> >>> Certain user-space log-structured/cow FS/DB will use the device that
> >>> way. Aerospike is one example.
> >>> Pass-through is synchronous, and we lose the ability to use io-uring.
> >>
> >> So use zonefs, which is designed exactly for that use case.
> >
> > Not specific to zone-append, but in general it may not be good to lock
> > new features/interfaces to ZoneFS alone, given that the direct-block
> > interface has its own merits.
> > Mapping one file to one zone is good for some use-cases, but limiting
> > for others. Some user-space FS/DBs would be more efficient (less
> > metadata, less indirection) with the freedom to decide file-to-zone
> > mapping/placement.
>
> There is no metadata in zonefs. One file == one zone, and the mapping between
> zonefs files and zones is static, determined at mount time simply using report
> zones. Zonefs files cannot be renamed nor deleted in any way. Choosing a zonefs
> file *is* the same as choosing a zone. Zonefs is *not* a POSIX file system doing
> dynamic block allocation to files. The backing storage of files in zonefs is
> static and fixed to the zone they represent. The difference between zonefs and a
> raw zoned block device is the API that has to be used by the application, that
> is, a file descriptor representing the entire disk for the raw disk vs a file
> descriptor representing one zone in zonefs. Note that the latter has *a lot* of
> advantages over the former: it enables O_APPEND use, protects against bugs with
> user write offset mistakes, adds consistency of cached data against zone resets,
> and more.
>
> > - Rocksdb and those LSM style DBs would map an SSTable to a zone, but
> > an SSTable file may be too small (initially) and may become too large
> > (after compaction) for a zone.
>
> You are contradicting yourself here. If an SSTable is mapped to a zone, then its
> size cannot exceed the zone capacity, regardless of the interface used to access
> the zones. And except for L0 tables which can be smaller (and are in memory
> anyway), all levels' tables have the same maximum size, which for zoned drives
> must be the zone capacity. In any case, solving any problem in this area does
> not depend in any way on the zonefs vs raw disk interface. The implementation
> will differ depending on the chosen interface, but what needs to be done to map
> SSTables to zones is the same in both cases.
>
> > - The internal parallelism of a single zone is a design choice, and
> > depends on the drive. Writing multiple zones in parallel (striped/RAID
> > way) can give better performance than writing to one. In that case one
> > would want a file that seamlessly combines multiple zones in a striped
> > fashion.
>
> Then write a FS for that... Or have a library do it in user space. For the
> library case, the implementation will differ for zonefs vs raw disk due to the
> different API (regular file vs block device file), but the principles to follow
> for striping zones into a single storage object remain the same.

ZoneFS is better when it is about dealing at single-zone granularity,
and direct-block seems better when it is about grouping zones (in
various ways including striping). The latter case (i.e. grouping
zones) requires more involved mapping, and I agree that it can be left
to the application (for both ZoneFS and raw-block backends).
But when an application tries that on ZoneFS, apart from the mapping
there would be the additional cost of indirection/fd-management (due to
file-on-files). And if new features (zone-append for now) are
available only on ZoneFS, it forces the application to use something
that may not be the most optimal for its need.

Coming to the original problem of plumbing append - I think divergence
started because RWF_APPEND did not have any meaning for block devices.
Did I miss any other reason?

How about write-anywhere semantics (an RWF_RELAXED_WRITE or
RWF_ANONYMOUS_WRITE flag) on the block-dev?
Zone-append works a lot like write-anywhere on a block-dev (or on any
other file that combines multiple zones, in non-sequential fashion).

> > Also it seems difficult (compared to block dev) to fit the simple-copy TP
> > in ZoneFS. The new
> > command needs: one NVMe drive, a list of source LBAs and one destination
> > LBA. In ZoneFS, we would deal with N+1 file-descriptors (N source zone
> > files, and one destination zone file) for that. With the block
> > interface, we do not need more than one file-descriptor representing
> > the entire device. With more zone-files, we face open/close overhead too.
>
> Are you expecting simple-copy to allow requests that are not zone aligned ? I do
> not think that will ever happen. Otherwise, the gotcha cases for it would be far
> too numerous. Simple-copy is essentially an optimized regular write command.
> Similarly to that command, it will not allow copies over zone boundaries and
> will need the destination LBA to be aligned to the destination zone WP. I have
> not checked the TP though and given the NVMe NDA, I will stop the discussion here.

TP is ratified, if that is the problem you are referring to.

> sendfile() could be used as the interface for simple-copy. Implementing that in
> zonefs would not be that hard. What is your plan for a simple-copy interface for
> the raw block device ? An ioctl ? sendfile() too ? As with any other user-level
> API, we should not be restricted to a particular device type if we can avoid it,
> so in-kernel emulation of the feature is needed for devices that do not have
> simple-copy or SCSI extended copy. sendfile() seems to me like the best choice
> since all of that is already implemented there.

At this moment, an ioctl as the sync interface and io-uring for async.
sendfile() and copy_file_range() take two fds... with that we can
represent a copy from one source zone to another zone.
But it does not fit representing a larger copy (from N source zones to
one destination zone).
Not sure if I am clear, perhaps sending an RFC would be better for
discussion on simple-copy.

> As for the open()/close() overhead for zonefs, maybe some use cases may suffer
> from it, but my tests with LevelDB+zonefs did not show any significant
> difference. zonefs open()/close() operations are way faster than for a regular
> file system since there is no metadata and all inodes always exist in-memory.
> And zonefs now supports MAR/MOR limits for O_WRONLY open(). That can simplify
> things for the user.
>
>
> --
> Damien Le Moal
> Western Digital Research
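To make the N-sources-to-one-destination point concrete, a purely
hypothetical shape for such a raw block device ioctl payload could be
the following. Nothing like this is defined upstream and every name
is invented; it only sketches the "list of source LBAs, one
destination LBA" layout of the TP:

#include <linux/types.h>

struct blk_copy_range {
	__u64 src_sector;	/* source start LBA */
	__u64 nr_sectors;	/* length of this source range */
};

struct blk_simple_copy {
	__u64 dst_sector;	/* must be aligned to the destination zone WP */
	__u32 nr_ranges;	/* N source ranges follow */
	__u32 reserved;
	struct blk_copy_range ranges[];	/* flexible array of sources */
};

/* A matching ioctl number would also be needed, e.g. (hypothetical):
 * #define BLKSIMPLECOPY _IOW(0x12, 137, struct blk_simple_copy)
 */

A single fd on the whole device plus one such payload expresses the
N-to-1 copy that sendfile()/copy_file_range(), with their two-fd
model, cannot.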
On 2020/09/29 3:58, Kanchan Joshi wrote:
[...]
> ZoneFS is better when it is about dealing at single-zone granularity,
> and direct-block seems better when it is about grouping zones (in
> various ways including striping). The latter case (i.e. grouping
> zones) requires more involved mapping, and I agree that it can be left
> to the application (for both ZoneFS and raw-block backends).
> But when an application tries that on ZoneFS, apart from the mapping
> there would be the additional cost of indirection/fd-management (due to
> file-on-files).

There is no indirection in zonefs. fd-to-struct file/inode conversion is very
fast and happens for every system call anyway, regardless of what the fd
represents. So I really do not understand what your worry is here. If you are
worried about overhead/performance, then please show numbers. If something is
wrong, we can work on fixing it.

> And if new features (zone-append for now) are available only on
> ZoneFS, it forces the application to use something that may not be the
> most optimal for its need.

"may" is not enough to convince me...

> Coming to the original problem of plumbing append - I think divergence
> started because RWF_APPEND did not have any meaning for block devices.
> Did I miss any other reason?

Correct.

> How about write-anywhere semantics (an RWF_RELAXED_WRITE or
> RWF_ANONYMOUS_WRITE flag) on the block-dev?

"write-anywhere" ? What do you mean ? That is not possible on zoned devices,
even with zone append, since you at least need to guarantee that zones have
enough unwritten space to accept an append command.

> Zone-append works a lot like write-anywhere on a block-dev (or on any
> other file that combines multiple zones, in non-sequential fashion).

That is an over-simplification that is not helpful at all. Zone append is not
"write anywhere" at all. And "write anywhere" is not a concept that exists on
regular block devices anyway. Writes only go to the offset that the user
decided, through lseek(), pwrite() or aio->aio_offset. It is not like the block
layer decides where the writes land. The same constraint applies to zone append:
the user decides the target zone. That is not "anywhere". Please be precise with
wording and implied/desired semantics. Narrow down the scope of your concept
names for clarity.

And talking about a "file that combines multiple zones" would mean that we are
now back in FS land, not raw block device file accesses anymore. So which one
are we talking about ? It looks like you are confusing what the application does
and how it uses whatever usable interface to the device with what that interface
actually is. It is very confusing.

>>> Also it seems difficult (compared to block dev) to fit the simple-copy TP
>>> in ZoneFS. The new
>>> command needs: one NVMe drive, a list of source LBAs and one destination
>>> LBA. In ZoneFS, we would deal with N+1 file-descriptors (N source zone
>>> files, and one destination zone file) for that. With the block
>>> interface, we do not need more than one file-descriptor representing
>>> the entire device. With more zone-files, we face open/close overhead too.
>>
>> Are you expecting simple-copy to allow requests that are not zone aligned ? I do
>> not think that will ever happen. Otherwise, the gotcha cases for it would be far
>> too numerous. Simple-copy is essentially an optimized regular write command.
>> Similarly to that command, it will not allow copies over zone boundaries and
>> will need the destination LBA to be aligned to the destination zone WP. I have
>> not checked the TP though and given the NVMe NDA, I will stop the discussion here.
>
> TP is ratified, if that is the problem you are referring to.

Ah. Yes. Got confused with ZRWA. Simple-copy is a different story anyway. Let's
not mix it into the zone append user interface please.

>
>> sendfile() could be used as the interface for simple-copy. Implementing that in
>> zonefs would not be that hard. What is your plan for a simple-copy interface for
>> the raw block device ? An ioctl ? sendfile() too ? As with any other user-level
>> API, we should not be restricted to a particular device type if we can avoid it,
>> so in-kernel emulation of the feature is needed for devices that do not have
>> simple-copy or SCSI extended copy. sendfile() seems to me like the best choice
>> since all of that is already implemented there.
>
> At this moment, an ioctl as the sync interface and io-uring for async.
> sendfile() and copy_file_range() take two fds... with that we can
> represent a copy from one source zone to another zone.
> But it does not fit representing a larger copy (from N source zones to
> one destination zone).

nvme passthrough ? If that does not fit your use case, then think of an
interface, its definition/semantics and propose it. But again, use a different
thread. This is mixing up zone-append and simple-copy, which I do not think are
directly related.

> Not sure if I am clear, perhaps sending an RFC would be better for
> discussion on simple-copy.

Separate this discussion from zone append please. Mixing up 2 problems in one
thread is not helpful to make progress.
On Tue, Sep 29, 2020 at 6:54 AM Damien Le Moal <Damien.LeMoal@wdc.com> wrote:
>
> On 2020/09/29 3:58, Kanchan Joshi wrote:
> [...]
> > ZoneFS is better when it is about dealing at single-zone granularity,
> > and direct-block seems better when it is about grouping zones (in
> > various ways including striping). The latter case (i.e. grouping
> > zones) requires more involved mapping, and I agree that it can be left
> > to the application (for both ZoneFS and raw-block backends).
> > But when an application tries that on ZoneFS, apart from the mapping
> > there would be the additional cost of indirection/fd-management (due to
> > file-on-files).
>
> There is no indirection in zonefs. fd-to-struct file/inode conversion is very
> fast and happens for every system call anyway, regardless of what the fd
> represents. So I really do not understand what your worry is here.

File-over-files... if the application (user-space FS/DB) has to place a
file-abstraction over zonefs again, to group/map the zones in a
different manner (larger files, striped mapping etc.).
Imagine a file over, say, 4 zones: with the zonefs backend, the
application will invoke the kernel at least 4 times to open the fds.
With the raw-block backend, these system calls won't be required in
the first place.

> If you are
> worried about overhead/performance, then please show numbers. If something is
> wrong, we can work on fixing it.
>
> > And if new features (zone-append for now) are available only on
> > ZoneFS, it forces the application to use something that may not be the
> > most optimal for its need.
>
> "may" is not enough to convince me...
>
> > Coming to the original problem of plumbing append - I think divergence
> > started because RWF_APPEND did not have any meaning for block devices.
> > Did I miss any other reason?
>
> Correct.
>
> > How about write-anywhere semantics (an RWF_RELAXED_WRITE or
> > RWF_ANONYMOUS_WRITE flag) on the block-dev?
>
> "write-anywhere" ? What do you mean ? That is not possible on zoned devices,
> even with zone append, since you at least need to guarantee that zones have
> enough unwritten space to accept an append command.
>
> > Zone-append works a lot like write-anywhere on a block-dev (or on any
> > other file that combines multiple zones, in non-sequential fashion).
>
> That is an over-simplification that is not helpful at all. Zone append is not
> "write anywhere" at all. And "write anywhere" is not a concept that exists on
> regular block devices anyway. Writes only go to the offset that the user
> decided, through lseek(), pwrite() or aio->aio_offset. It is not like the block
> layer decides where the writes land. The same constraint applies to zone append:
> the user decides the target zone. That is not "anywhere". Please be precise with
> wording and implied/desired semantics. Narrow down the scope of your concept
> names for clarity.

This -
<start>
Issue a write on offset X with no flag: it happens at X.
Issue a write on offset X with the "anywhere" flag: it happens
anywhere... and the application comes to know where on completion.
</start>
The above is fairly generic as far as the high-level interface is
concerned. The returned offset can be the file-pointer, the supplied
offset, or something completely different. If the anywhere-flag is
passed, the caller should not assume anything and must explicitly
check where the write happened.
The "anywhere" part can be decided by the component that implements
the interface. For a zoned block dev, this "anywhere" can come by
issuing zone-append underneath. Some other components are free to
implement "anywhere" in another way, or do nothing... in that case
the write continues to happen at X.

For zoned raw-block, we have got an address-space that contains N
zones placed sequentially.
Write on offset O1 with the anywhere flag:
=> O1 is rounded down to the zone-start (say Z1) by the direct-io
code, zone-append is issued on Z1, and completion-offset O2 is
returned.
Write-anywhere on another offset O3 of the address space:
=> the zone-start could be Z2, and completion-offset O4 is returned.
The application picks completion offsets O2 and O4 and goes about its
business, not needing to know about Z1 or Z2.

> And talking about a "file that combines multiple zones" would mean that we are
> now back in FS land, not raw block device file accesses anymore. So which one
> are we talking about ?

About user-space FS/DB/SDS using zoned-storage, aiming at optimized
placement. In all this discussion, I have been referring to ZoneFS and
raw-block as backends for a higher-level abstraction residing in
user-space.

> It looks like you are confusing what the application does and
> how it uses whatever usable interface to the device with what that interface
> actually is. It is very confusing.
>
> >>> Also it seems difficult (compared to block dev) to fit the simple-copy TP
> >>> in ZoneFS. The new
> >>> command needs: one NVMe drive, a list of source LBAs and one destination
> >>> LBA. In ZoneFS, we would deal with N+1 file-descriptors (N source zone
> >>> files, and one destination zone file) for that. With the block
> >>> interface, we do not need more than one file-descriptor representing
> >>> the entire device. With more zone-files, we face open/close overhead too.
> >>
> >> Are you expecting simple-copy to allow requests that are not zone aligned ? I do
> >> not think that will ever happen. Otherwise, the gotcha cases for it would be far
> >> too numerous. Simple-copy is essentially an optimized regular write command.
> >> Similarly to that command, it will not allow copies over zone boundaries and
> >> will need the destination LBA to be aligned to the destination zone WP. I have
> >> not checked the TP though and given the NVMe NDA, I will stop the discussion here.
> >
> > TP is ratified, if that is the problem you are referring to.
>
> Ah. Yes. Got confused with ZRWA. Simple-copy is a different story anyway. Let's
> not mix it into the zone append user interface please.
>
> >
> >> sendfile() could be used as the interface for simple-copy. Implementing that in
> >> zonefs would not be that hard. What is your plan for a simple-copy interface for
> >> the raw block device ? An ioctl ? sendfile() too ? As with any other user-level
> >> API, we should not be restricted to a particular device type if we can avoid it,
> >> so in-kernel emulation of the feature is needed for devices that do not have
> >> simple-copy or SCSI extended copy. sendfile() seems to me like the best choice
> >> since all of that is already implemented there.
> >
> > At this moment, an ioctl as the sync interface and io-uring for async.
> > sendfile() and copy_file_range() take two fds... with that we can
> > represent a copy from one source zone to another zone.
> > But it does not fit representing a larger copy (from N source zones to
> > one destination zone).
>
> nvme passthrough ? If that does not fit your use case, then think of an
> interface, its definition/semantics and propose it. But again, use a different
> thread. This is mixing up zone-append and simple-copy, which I do not think are
> directly related.
>
> > Not sure if I am clear, perhaps sending an RFC would be better for
> > discussion on simple-copy.
>
> Separate this discussion from zone append please. Mixing up 2 problems in one
> thread is not helpful to make progress.

Fine.
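Restating the O1/O2 flow above as code. A minimal sketch of the
proposed round-down-and-report-back semantics; the "anywhere" flag
(RWF_RELAXED_WRITE/RWF_ANONYMOUS_WRITE) is only a suggestion in this
thread, not an existing kernel interface, and the zone-size handling
is an assumption:

#include <stdint.h>

/* The supplied offset only selects a zone: round down to zone start. */
static uint64_t zone_start(uint64_t offset, uint64_t zone_size)
{
	return offset - (offset % zone_size);
}

/*
 * Proposed flow for one write issued with the "anywhere" flag:
 *   Z1 = zone_start(O1, zsz);  the caller's O1 only picks the zone
 *   the device appends at Z1's write pointer
 *   the completion reports O2, with Z1 <= O2 < Z1 + zsz
 * The application must index its data by O2, not O1, which is why
 * the completion path (the cqe->res64 field in the diff below) has
 * to carry an offset back to user space.
 */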
On Tue, Sep 08, 2020 at 04:18:01PM +0100, hch@infradead.org wrote:
> On Mon, Sep 07, 2020 at 12:31:42PM +0530, Kanchan Joshi wrote:
> > But there are use-cases which benefit from supporting zone-append on
> > raw block-dev path.
> > Certain user-space log-structured/cow FS/DB will use the device that
> > way. Aerospike is one example.
> > Pass-through is synchronous, and we lose the ability to use io-uring.
>
> So use zonefs, which is designed exactly for that use case.

Using zonefs to test append alone introduces a slight VFS overhead if
we want to do something such as testing just the hot path between
append and the block layer. If we want to live with that, that's fine!
Just saying.

  Luis
On Fri, Jul 31, 2020 at 02:08:02PM +0100, hch@infradead.org wrote:
> And FYI, this is what I'd do for a hacky aio-only prototype (untested):
So... are we OK with an aio-only approach (instead of using io-uring)
for raw access to append?
Luis
On Fri, Jul 31, 2020 at 07:45:26AM +0100, hch@infradead.org wrote:
> On Fri, Jul 31, 2020 at 06:42:10AM +0000, Damien Le Moal wrote:
> > > - We may not be able to use RWF_APPEND, and need exposing a new
> > >   type/flag (RWF_INDIRECT_OFFSET etc.) to user-space. Not sure if this
> > >   sounds outrageous, but is it OK to have a uring-only flag which can be
> > >   combined with RWF_APPEND?
> >
> > Why ? Where is the problem ? O_APPEND/RWF_APPEND is currently meaningless for
> > raw block device accesses. We could certainly define a meaning for these in the
> > context of zoned block devices.
>
> We can't just add a meaning for O_APPEND on block devices now,

Makes sense. Is a new system call for nameless writes called for
instead then? Then there is no baggage. Or is this completely stupid?

> as it was previously silently ignored. I also really don't think any
> of these semantics even fit the block device to start with. If you
> want to work on raw zones use zonefs, that's what it exists for.

Using zonefs adds a slight VFS overhead. Fine if we want to live with
that, but I have a feeling that if we want to do something like testing
just the hot paths, to compare apples to apples, we'd want something
more fine-grained.

  Luis
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7809ab2..6510cf5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -401,7 +401,14 @@ struct io_rw {
 	/* NOTE: kiocb has the file as the first member, so don't do it here */
 	struct kiocb			kiocb;
 	u64				addr;
-	u64				len;
+	union {
+		/*
+		 * len is used only during submission.
+		 * append_offset is used only during completion.
+		 */
+		u64			len;
+		u64			append_offset;
+	};
 };
 
 struct io_connect {
@@ -541,6 +548,7 @@ enum {
 	REQ_F_NO_FILE_TABLE_BIT,
 	REQ_F_QUEUE_TIMEOUT_BIT,
 	REQ_F_WORK_INITIALIZED_BIT,
+	REQ_F_ZONE_APPEND_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -598,6 +606,8 @@ enum {
 	REQ_F_QUEUE_TIMEOUT	= BIT(REQ_F_QUEUE_TIMEOUT_BIT),
 	/* io_wq_work is initialized */
 	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
+	/* to return zone append offset */
+	REQ_F_ZONE_APPEND	= BIT(REQ_F_ZONE_APPEND_BIT),
 };
 
 struct async_poll {
@@ -1244,8 +1254,15 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		req->flags &= ~REQ_F_OVERFLOW;
 		if (cqe) {
 			WRITE_ONCE(cqe->user_data, req->user_data);
-			WRITE_ONCE(cqe->res, req->result);
-			WRITE_ONCE(cqe->flags, req->cflags);
+			if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
+				if (likely(req->result > 0))
+					WRITE_ONCE(cqe->res64, req->rw.append_offset);
+				else
+					WRITE_ONCE(cqe->res64, req->result);
+			} else {
+				WRITE_ONCE(cqe->res, req->result);
+				WRITE_ONCE(cqe->flags, req->cflags);
+			}
 		} else {
 			WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1284,8 +1301,15 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 	cqe = io_get_cqring(ctx);
 	if (likely(cqe)) {
 		WRITE_ONCE(cqe->user_data, req->user_data);
-		WRITE_ONCE(cqe->res, res);
-		WRITE_ONCE(cqe->flags, cflags);
+		if (unlikely(req->flags & REQ_F_ZONE_APPEND)) {
+			if (likely(res > 0))
+				WRITE_ONCE(cqe->res64, req->rw.append_offset);
+			else
+				WRITE_ONCE(cqe->res64, res);
+		} else {
+			WRITE_ONCE(cqe->res, res);
+			WRITE_ONCE(cqe->flags, cflags);
+		}
 	} else if (ctx->cq_overflow_flushed) {
 		WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1943,7 +1967,7 @@ static inline void req_set_fail_links(struct io_kiocb *req)
 		req->flags |= REQ_F_FAIL_LINK;
 }
 
-static void io_complete_rw_common(struct kiocb *kiocb, long res)
+static void io_complete_rw_common(struct kiocb *kiocb, long res, long long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 	int cflags = 0;
@@ -1955,6 +1979,9 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
 		req_set_fail_links(req);
 	if (req->flags & REQ_F_BUFFER_SELECTED)
 		cflags = io_put_kbuf(req);
+	if (req->flags & REQ_F_ZONE_APPEND)
+		req->rw.append_offset = res2;
+
 	__io_cqring_add_event(req, res, cflags);
 }
 
@@ -1962,7 +1989,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
-	io_complete_rw_common(kiocb, res);
+	io_complete_rw_common(kiocb, res, res2);
 	io_put_req(req);
 }
 
@@ -1976,8 +2003,11 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long long res2)
 	if (res != req->result)
 		req_set_fail_links(req);
 	req->result = res;
-	if (res != -EAGAIN)
+	if (res != -EAGAIN) {
+		if (req->flags & REQ_F_ZONE_APPEND)
+			req->rw.append_offset = res2;
 		WRITE_ONCE(req->iopoll_completed, 1);
+	}
 }
 
 /*
@@ -2739,6 +2769,9 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 					SB_FREEZE_WRITE);
 	}
 	kiocb->ki_flags |= IOCB_WRITE;
+	/* zone-append requires few extra steps during completion */
+	if (kiocb->ki_flags & IOCB_ZONE_APPEND)
+		req->flags |= REQ_F_ZONE_APPEND;
 
 	if (!force_nonblock)
 		current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 92c2269..2580d93 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -156,8 +156,13 @@ enum {
  */
 struct io_uring_cqe {
 	__u64	user_data;	/* sqe->data submission passed back */
-	__s32	res;		/* result code for this event */
-	__u32	flags;
+	union {
+		struct {
+			__s32	res;	/* result code for this event */
+			__u32	flags;
+		};
+		__s64	res64;	/* appending offset for zone append */
+	};
 };
 
 /*
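On the consuming side, an application built against this proposal
might pick up the appended offset as sketched below. This assumes the
res64 union above is applied; res64 is not part of the upstream
io_uring ABI. The tag scheme is invented: as noted earlier in the
thread, user_data is the only way to tell a zone-append CQE apart
from the others.

#include <stdint.h>
#include <stdio.h>
#include <linux/io_uring.h>	/* assumes the res64 union above is applied */

#define ZA_TAG	0x5a00000000000000ULL	/* app-chosen tag bit in user_data */

static void handle_cqe(const struct io_uring_cqe *cqe)
{
	if (cqe->user_data & ZA_TAG) {
		/* zone-append CQE: negative res64 is an error code,
		 * otherwise it is the offset the write landed at */
		if (cqe->res64 < 0)
			fprintf(stderr, "append failed: %lld\n",
				(long long)cqe->res64);
		else
			printf("data landed at offset %llu\n",
			       (unsigned long long)cqe->res64);
	} else {
		/* regular CQE: res/flags layout is unchanged */
		if (cqe->res < 0)
			fprintf(stderr, "io failed: %d\n", cqe->res);
	}
}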