Message ID | 20210603051836.2614535-3-dkadashev@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | io_uring: add mkdir, [sym]linkat and mknodat support | expand |
On 6/3/21 6:18 AM, Dmitry Kadashev wrote: > IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags > and arguments. > > Signed-off-by: Dmitry Kadashev <dkadashev@gmail.com> > --- > fs/io_uring.c | 55 +++++++++++++++++++++++++++++++++++ > include/uapi/linux/io_uring.h | 1 + > 2 files changed, 56 insertions(+) > > diff --git a/fs/io_uring.c b/fs/io_uring.c > index a1ca6badff36..8ab4eb559520 100644 > --- a/fs/io_uring.c > +++ b/fs/io_uring.c > @@ -665,6 +665,13 @@ struct io_unlink { > struct filename *filename; > }; > > +struct io_mkdir { > + struct file *file; > + int dfd; > + umode_t mode; > + struct filename *filename; > +}; > + > struct io_completion { > struct file *file; > struct list_head list; > @@ -809,6 +816,7 @@ struct io_kiocb { > struct io_shutdown shutdown; > struct io_rename rename; > struct io_unlink unlink; > + struct io_mkdir mkdir; > /* use only after cleaning per-op data, see io_clean_op() */ > struct io_completion compl; > }; > @@ -1021,6 +1029,7 @@ static const struct io_op_def io_op_defs[] = { > }, > [IORING_OP_RENAMEAT] = {}, > [IORING_OP_UNLINKAT] = {}, > + [IORING_OP_MKDIRAT] = {}, > }; > > static bool io_disarm_next(struct io_kiocb *req); > @@ -3530,6 +3539,44 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) > return 0; > } > > +static int io_mkdirat_prep(struct io_kiocb *req, > + const struct io_uring_sqe *sqe) > +{ > + struct io_mkdir *mkd = &req->mkdir; > + const char __user *fname; > + > + if (unlikely(req->flags & REQ_F_FIXED_FILE)) > + return -EBADF; > + > + mkd->dfd = READ_ONCE(sqe->fd); > + mkd->mode = READ_ONCE(sqe->len); > + > + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); > + mkd->filename = getname(fname); > + if (IS_ERR(mkd->filename)) > + return PTR_ERR(mkd->filename); We have to check unused fields, e.g. buf_index and off, to be able to use them in the future if needed. if (sqe->buf_index || sqe->off) return -EINVAL; Please double check what fields are not used, and same goes for all other opcodes. > + > + req->flags |= REQ_F_NEED_CLEANUP; > + return 0; > +} > + > +static int io_mkdirat(struct io_kiocb *req, int issue_flags) > +{ > + struct io_mkdir *mkd = &req->mkdir; > + int ret; > + > + if (issue_flags & IO_URING_F_NONBLOCK) > + return -EAGAIN; > + > + ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); > + > + req->flags &= ~REQ_F_NEED_CLEANUP; > + if (ret < 0) > + req_set_fail_links(req); > + io_req_complete(req, ret); > + return 0; > +} > + > static int io_shutdown_prep(struct io_kiocb *req, > const struct io_uring_sqe *sqe) > { > @@ -5936,6 +5983,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > return io_renameat_prep(req, sqe); > case IORING_OP_UNLINKAT: > return io_unlinkat_prep(req, sqe); > + case IORING_OP_MKDIRAT: > + return io_mkdirat_prep(req, sqe); > } > > printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", > @@ -6077,6 +6126,9 @@ static void io_clean_op(struct io_kiocb *req) > case IORING_OP_UNLINKAT: > putname(req->unlink.filename); > break; > + case IORING_OP_MKDIRAT: > + putname(req->mkdir.filename); > + break; > } > req->flags &= ~REQ_F_NEED_CLEANUP; > } > @@ -6203,6 +6255,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) > case IORING_OP_UNLINKAT: > ret = io_unlinkat(req, issue_flags); > break; > + case IORING_OP_MKDIRAT: > + ret = io_mkdirat(req, issue_flags); > + break; > default: > ret = -EINVAL; > break; > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index e1ae46683301..bf9d720d371f 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -137,6 +137,7 @@ enum { > IORING_OP_SHUTDOWN, > IORING_OP_RENAMEAT, > IORING_OP_UNLINKAT, > + IORING_OP_MKDIRAT, > > /* this goes last, obviously */ > IORING_OP_LAST, >
On 6/22/21 12:41 PM, Pavel Begunkov wrote: > On 6/3/21 6:18 AM, Dmitry Kadashev wrote: >> IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags >> and arguments. >> >> Signed-off-by: Dmitry Kadashev <dkadashev@gmail.com> >> --- >> fs/io_uring.c | 55 +++++++++++++++++++++++++++++++++++ >> include/uapi/linux/io_uring.h | 1 + >> 2 files changed, 56 insertions(+) >> >> diff --git a/fs/io_uring.c b/fs/io_uring.c >> index a1ca6badff36..8ab4eb559520 100644 >> --- a/fs/io_uring.c >> +++ b/fs/io_uring.c >> @@ -665,6 +665,13 @@ struct io_unlink { >> struct filename *filename; >> }; >> >> +struct io_mkdir { >> + struct file *file; >> + int dfd; >> + umode_t mode; >> + struct filename *filename; >> +}; >> + >> struct io_completion { >> struct file *file; >> struct list_head list; >> @@ -809,6 +816,7 @@ struct io_kiocb { >> struct io_shutdown shutdown; >> struct io_rename rename; >> struct io_unlink unlink; >> + struct io_mkdir mkdir; >> /* use only after cleaning per-op data, see io_clean_op() */ >> struct io_completion compl; >> }; >> @@ -1021,6 +1029,7 @@ static const struct io_op_def io_op_defs[] = { >> }, >> [IORING_OP_RENAMEAT] = {}, >> [IORING_OP_UNLINKAT] = {}, >> + [IORING_OP_MKDIRAT] = {}, >> }; >> >> static bool io_disarm_next(struct io_kiocb *req); >> @@ -3530,6 +3539,44 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) >> return 0; >> } >> >> +static int io_mkdirat_prep(struct io_kiocb *req, >> + const struct io_uring_sqe *sqe) >> +{ >> + struct io_mkdir *mkd = &req->mkdir; >> + const char __user *fname; >> + >> + if (unlikely(req->flags & REQ_F_FIXED_FILE)) >> + return -EBADF; >> + >> + mkd->dfd = READ_ONCE(sqe->fd); >> + mkd->mode = READ_ONCE(sqe->len); >> + >> + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); >> + mkd->filename = getname(fname); >> + if (IS_ERR(mkd->filename)) >> + return PTR_ERR(mkd->filename); > > We have to check unused fields, e.g. buf_index and off, > to be able to use them in the future if needed. > > if (sqe->buf_index || sqe->off) > return -EINVAL; > > Please double check what fields are not used, and > same goes for all other opcodes. + opcode specific flags, e.g. if (sqe->rw_flags) return -EINVAL;
On 6/3/21 6:18 AM, Dmitry Kadashev wrote: > IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags > and arguments. Jens, a fold-in er discussed, and it will get you a conflict at 8/10 diff --git a/fs/io_uring.c b/fs/io_uring.c index 4b215e0f8dd8..c0e469ebd22d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3589,7 +3589,7 @@ static int io_mkdirat(struct io_kiocb *req, int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } > Signed-off-by: Dmitry Kadashev <dkadashev@gmail.com> > --- > fs/io_uring.c | 55 +++++++++++++++++++++++++++++++++++ > include/uapi/linux/io_uring.h | 1 + > 2 files changed, 56 insertions(+) > > diff --git a/fs/io_uring.c b/fs/io_uring.c > index a1ca6badff36..8ab4eb559520 100644 > --- a/fs/io_uring.c > +++ b/fs/io_uring.c > @@ -665,6 +665,13 @@ struct io_unlink { > struct filename *filename; > }; > > +struct io_mkdir { > + struct file *file; > + int dfd; > + umode_t mode; > + struct filename *filename; > +}; > + > struct io_completion { > struct file *file; > struct list_head list; > @@ -809,6 +816,7 @@ struct io_kiocb { > struct io_shutdown shutdown; > struct io_rename rename; > struct io_unlink unlink; > + struct io_mkdir mkdir; > /* use only after cleaning per-op data, see io_clean_op() */ > struct io_completion compl; > }; > @@ -1021,6 +1029,7 @@ static const struct io_op_def io_op_defs[] = { > }, > [IORING_OP_RENAMEAT] = {}, > [IORING_OP_UNLINKAT] = {}, > + [IORING_OP_MKDIRAT] = {}, > }; > > static bool io_disarm_next(struct io_kiocb *req); > @@ -3530,6 +3539,44 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) > return 0; > } > > +static int io_mkdirat_prep(struct io_kiocb *req, > + const struct io_uring_sqe *sqe) > +{ > + struct io_mkdir *mkd = &req->mkdir; > + const char __user *fname; > + > + if (unlikely(req->flags & REQ_F_FIXED_FILE)) > + return -EBADF; > + > + mkd->dfd = READ_ONCE(sqe->fd); > + mkd->mode = READ_ONCE(sqe->len); > + > + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); > + mkd->filename = getname(fname); > + if (IS_ERR(mkd->filename)) > + return PTR_ERR(mkd->filename); > + > + req->flags |= REQ_F_NEED_CLEANUP; > + return 0; > +} > + > +static int io_mkdirat(struct io_kiocb *req, int issue_flags) > +{ > + struct io_mkdir *mkd = &req->mkdir; > + int ret; > + > + if (issue_flags & IO_URING_F_NONBLOCK) > + return -EAGAIN; > + > + ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); > + > + req->flags &= ~REQ_F_NEED_CLEANUP; > + if (ret < 0) > + req_set_fail_links(req); > + io_req_complete(req, ret); > + return 0; > +} > + > static int io_shutdown_prep(struct io_kiocb *req, > const struct io_uring_sqe *sqe) > { > @@ -5936,6 +5983,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > return io_renameat_prep(req, sqe); > case IORING_OP_UNLINKAT: > return io_unlinkat_prep(req, sqe); > + case IORING_OP_MKDIRAT: > + return io_mkdirat_prep(req, sqe); > } > > printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", > @@ -6077,6 +6126,9 @@ static void io_clean_op(struct io_kiocb *req) > case IORING_OP_UNLINKAT: > putname(req->unlink.filename); > break; > + case IORING_OP_MKDIRAT: > + putname(req->mkdir.filename); > + break; > } > req->flags &= ~REQ_F_NEED_CLEANUP; > } > @@ -6203,6 +6255,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) > case IORING_OP_UNLINKAT: > ret = io_unlinkat(req, issue_flags); > break; > + case IORING_OP_MKDIRAT: > + ret = io_mkdirat(req, issue_flags); > + break; > default: > ret = -EINVAL; > break; > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index e1ae46683301..bf9d720d371f 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -137,6 +137,7 @@ enum { > IORING_OP_SHUTDOWN, > IORING_OP_RENAMEAT, > IORING_OP_UNLINKAT, > + IORING_OP_MKDIRAT, > > /* this goes last, obviously */ > IORING_OP_LAST, >
On 6/22/21 11:41 AM, Pavel Begunkov wrote: > On 6/3/21 6:18 AM, Dmitry Kadashev wrote: >> IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags >> and arguments. > > Jens, a fold-in er discussed, and it will get you > a conflict at 8/10 Folded in, thanks.
On Wed, Jun 23, 2021 at 12:41 AM Pavel Begunkov <asml.silence@gmail.com> wrote: > > On 6/3/21 6:18 AM, Dmitry Kadashev wrote: > > IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags > > and arguments. > > Jens, a fold-in er discussed, and it will get you > a conflict at 8/10 Thanks, Pavel
On Tue, Jun 22, 2021 at 6:50 PM Pavel Begunkov <asml.silence@gmail.com> wrote: > > On 6/22/21 12:41 PM, Pavel Begunkov wrote: > > On 6/3/21 6:18 AM, Dmitry Kadashev wrote: > >> IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags > >> and arguments. > >> > >> Signed-off-by: Dmitry Kadashev <dkadashev@gmail.com> > >> --- > >> fs/io_uring.c | 55 +++++++++++++++++++++++++++++++++++ > >> include/uapi/linux/io_uring.h | 1 + > >> 2 files changed, 56 insertions(+) > >> > >> diff --git a/fs/io_uring.c b/fs/io_uring.c > >> index a1ca6badff36..8ab4eb559520 100644 > >> --- a/fs/io_uring.c > >> +++ b/fs/io_uring.c > >> @@ -665,6 +665,13 @@ struct io_unlink { > >> struct filename *filename; > >> }; > >> > >> +struct io_mkdir { > >> + struct file *file; > >> + int dfd; > >> + umode_t mode; > >> + struct filename *filename; > >> +}; > >> + > >> struct io_completion { > >> struct file *file; > >> struct list_head list; > >> @@ -809,6 +816,7 @@ struct io_kiocb { > >> struct io_shutdown shutdown; > >> struct io_rename rename; > >> struct io_unlink unlink; > >> + struct io_mkdir mkdir; > >> /* use only after cleaning per-op data, see io_clean_op() */ > >> struct io_completion compl; > >> }; > >> @@ -1021,6 +1029,7 @@ static const struct io_op_def io_op_defs[] = { > >> }, > >> [IORING_OP_RENAMEAT] = {}, > >> [IORING_OP_UNLINKAT] = {}, > >> + [IORING_OP_MKDIRAT] = {}, > >> }; > >> > >> static bool io_disarm_next(struct io_kiocb *req); > >> @@ -3530,6 +3539,44 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) > >> return 0; > >> } > >> > >> +static int io_mkdirat_prep(struct io_kiocb *req, > >> + const struct io_uring_sqe *sqe) > >> +{ > >> + struct io_mkdir *mkd = &req->mkdir; > >> + const char __user *fname; > >> + > >> + if (unlikely(req->flags & REQ_F_FIXED_FILE)) > >> + return -EBADF; > >> + > >> + mkd->dfd = READ_ONCE(sqe->fd); > >> + mkd->mode = READ_ONCE(sqe->len); > >> + > >> + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); > >> + mkd->filename = getname(fname); > >> + if (IS_ERR(mkd->filename)) > >> + return PTR_ERR(mkd->filename); > > > > We have to check unused fields, e.g. buf_index and off, > > to be able to use them in the future if needed. > > > > if (sqe->buf_index || sqe->off) > > return -EINVAL; > > > > Please double check what fields are not used, and > > same goes for all other opcodes. This changeset is based on some other ops that were added a while ago (renameat, unlinkat), which lack the check as well. I guess I'll just go over all of them and add the checks in a single patch if that's OK. I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of the existing checks like this lack it too btw. I suppose I can fix those in a separate commit if that makes sense. > > + opcode specific flags, e.g. > > if (sqe->rw_flags) > return -EINVAL;
On 6/23/21 7:41 AM, Dmitry Kadashev wrote: > On Tue, Jun 22, 2021 at 6:50 PM Pavel Begunkov <asml.silence@gmail.com> wrote: >> >> On 6/22/21 12:41 PM, Pavel Begunkov wrote: >>> On 6/3/21 6:18 AM, Dmitry Kadashev wrote: >>>> IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags >>>> and arguments. >>>> >>>> Signed-off-by: Dmitry Kadashev <dkadashev@gmail.com> >>>> --- [...] >>> We have to check unused fields, e.g. buf_index and off, >>> to be able to use them in the future if needed. >>> >>> if (sqe->buf_index || sqe->off) >>> return -EINVAL; >>> >>> Please double check what fields are not used, and >>> same goes for all other opcodes. > > This changeset is based on some other ops that were added a while ago > (renameat, unlinkat), which lack the check as well. I guess I'll just go over That's not great... Thanks for letting know > all of them and add the checks in a single patch if that's OK. For newly added opcodes a single patch on top is ok, rename and unlink should be patched separately. > I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of > the existing checks like this lack it too btw. I suppose I can fix those in a > separate commit if that makes sense. When we really use a field there should be a READ_ONCE(), but I wouldn't care about those we check for compatibility reasons, but that's only my opinion. >> + opcode specific flags, e.g. >> >> if (sqe->rw_flags) >> return -EINVAL; >
On Wed, Jun 23, 2021 at 6:54 PM Pavel Begunkov <asml.silence@gmail.com> wrote: > > On 6/23/21 7:41 AM, Dmitry Kadashev wrote: > > I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of > > the existing checks like this lack it too btw. I suppose I can fix those in a > > separate commit if that makes sense. > > When we really use a field there should be a READ_ONCE(), > but I wouldn't care about those we check for compatibility > reasons, but that's only my opinion. I'm not sure how the compatibility check reads are special. The code is either correct or not. If a compatibility check has correctness problems then it's pretty much as bad as any other part of the code having such problems, no? That said, I'll just go ahead and use the approach that the rest of the code (or rather most of it) uses (no READ_ONCE). If it needs fixing then the whole bunch can probably be fixed in one go (either a single patch or a series). Thanks for your help, Pavel!
On 6/24/21 12:11 PM, Dmitry Kadashev wrote: > On Wed, Jun 23, 2021 at 6:54 PM Pavel Begunkov <asml.silence@gmail.com> wrote: >> >> On 6/23/21 7:41 AM, Dmitry Kadashev wrote: >>> I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of >>> the existing checks like this lack it too btw. I suppose I can fix those in a >>> separate commit if that makes sense. >> >> When we really use a field there should be a READ_ONCE(), >> but I wouldn't care about those we check for compatibility >> reasons, but that's only my opinion. > > I'm not sure how the compatibility check reads are special. The code is > either correct or not. If a compatibility check has correctness problems > then it's pretty much as bad as any other part of the code having such > problems, no? If it reads and verifies a values first, e.g. index into some internal array, and then compiler plays a joke and reloads it, we might be absolutely screwed expecting 'segfaults', kernel data leakages and all the fun stuff. If that's a compatibility check, whether it's loaded earlier or later, or whatever, it's not a big deal, the userspace can in any case change the memory at any moment it wishes, even tightly around the moment we're reading it. > That said, I'll just go ahead and use the approach that the rest of the > code (or rather most of it) uses (no READ_ONCE). If it needs fixing then > the whole bunch can probably be fixed in one go (either a single patch > or a series). > > Thanks for your help, Pavel! >
On Thu, Jun 24, 2021 at 7:22 PM Pavel Begunkov <asml.silence@gmail.com> wrote: > > On 6/24/21 12:11 PM, Dmitry Kadashev wrote: > > On Wed, Jun 23, 2021 at 6:54 PM Pavel Begunkov <asml.silence@gmail.com> wrote: > >> > >> On 6/23/21 7:41 AM, Dmitry Kadashev wrote: > >>> I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of > >>> the existing checks like this lack it too btw. I suppose I can fix those in a > >>> separate commit if that makes sense. > >> > >> When we really use a field there should be a READ_ONCE(), > >> but I wouldn't care about those we check for compatibility > >> reasons, but that's only my opinion. > > > > I'm not sure how the compatibility check reads are special. The code is > > either correct or not. If a compatibility check has correctness problems > > then it's pretty much as bad as any other part of the code having such > > problems, no? > > If it reads and verifies a values first, e.g. index into some internal > array, and then compiler plays a joke and reloads it, we might be > absolutely screwed expecting 'segfaults', kernel data leakages and all > the fun stuff. > > If that's a compatibility check, whether it's loaded earlier or later, > or whatever, it's not a big deal, the userspace can in any case change > the memory at any moment it wishes, even tightly around the moment > we're reading it. Sorry for the slow reply, I have to balance this with my actual job that is not directly related to the kernel development :) I'm no kernel concurrency expert (actually I'm not any kind of kernel expert), but my understanding is READ_ONCE does not just mean "do not read more than once", but rather "read exactly once" (and more than that), and if it's not applied then the compiler is within its rights to optimize the read out, so the compatibility check can effectively be disabled. I don't think it's likely to happen, but "bad things do not happen in practice" and "it is technically correct" are two different things :) FWIW I'm not arguing it has to be changed, I just want to understand things better (and if it helps to spot a bug at some point then great). So if my reasoning is wrong then please point out where. And if it's just the simplicity / clarity of the code that is the goal here and any negative effects are considered to be unlikely then it's OK, I can understand that.
On 6/28/21 9:17 AM, Dmitry Kadashev wrote: > On Thu, Jun 24, 2021 at 7:22 PM Pavel Begunkov <asml.silence@gmail.com> wrote: >> >> On 6/24/21 12:11 PM, Dmitry Kadashev wrote: >>> On Wed, Jun 23, 2021 at 6:54 PM Pavel Begunkov <asml.silence@gmail.com> wrote: >>>> >>>> On 6/23/21 7:41 AM, Dmitry Kadashev wrote: >>>>> I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of >>>>> the existing checks like this lack it too btw. I suppose I can fix those in a >>>>> separate commit if that makes sense. >>>> >>>> When we really use a field there should be a READ_ONCE(), >>>> but I wouldn't care about those we check for compatibility >>>> reasons, but that's only my opinion. >>> >>> I'm not sure how the compatibility check reads are special. The code is >>> either correct or not. If a compatibility check has correctness problems >>> then it's pretty much as bad as any other part of the code having such >>> problems, no? >> >> If it reads and verifies a values first, e.g. index into some internal >> array, and then compiler plays a joke and reloads it, we might be >> absolutely screwed expecting 'segfaults', kernel data leakages and all >> the fun stuff. >> >> If that's a compatibility check, whether it's loaded earlier or later, >> or whatever, it's not a big deal, the userspace can in any case change >> the memory at any moment it wishes, even tightly around the moment >> we're reading it. > > Sorry for the slow reply, I have to balance this with my actual job that > is not directly related to the kernel development :) > > I'm no kernel concurrency expert (actually I'm not any kind of kernel > expert), but my understanding is READ_ONCE does not just mean "do not > read more than once", but rather "read exactly once" (and more than > that), and if it's not applied then the compiler is within its rights to > optimize the read out, so the compatibility check can effectively be > disabled. Yep, as they say it's about all the "inventive" transformations compilers can do, double read is just one of those that may turn very nasty for us. One big difference for me is whether it have a potential to crash the kernel or not, though it's just one side. Compilers can't drop the check just because, it first should be proven to be safe to do, and there are all sorts barriers around and limitations on how CQEs and SQEs are used, making impossible to alias memory. E.g. CQEs and SQEs can't be reused in a single syscall, they're only written and read respectively, and so on. Maybe, the only one I'd worry about is the call to io_commit_sqring(), i.e. for SQE reads not happening after it, but we need to take a look whether it's theoretically possible. > I don't think it's likely to happen, but "bad things do not happen in > practice" and "it is technically correct" are two different things :) > > FWIW I'm not arguing it has to be changed, I just want to understand > things better (and if it helps to spot a bug at some point then great). > So if my reasoning is wrong then please point out where. And if it's > just the simplicity / clarity of the code that is the goal here and any > negative effects are considered to be unlikely then it's OK, I can > understand that.
On Wed, Jul 7, 2021 at 9:06 PM Pavel Begunkov <asml.silence@gmail.com> wrote: > > On 6/28/21 9:17 AM, Dmitry Kadashev wrote: > > On Thu, Jun 24, 2021 at 7:22 PM Pavel Begunkov <asml.silence@gmail.com> wrote: > >> > >> On 6/24/21 12:11 PM, Dmitry Kadashev wrote: > >>> On Wed, Jun 23, 2021 at 6:54 PM Pavel Begunkov <asml.silence@gmail.com> wrote: > >>>> > >>>> On 6/23/21 7:41 AM, Dmitry Kadashev wrote: > >>>>> I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of > >>>>> the existing checks like this lack it too btw. I suppose I can fix those in a > >>>>> separate commit if that makes sense. > >>>> > >>>> When we really use a field there should be a READ_ONCE(), > >>>> but I wouldn't care about those we check for compatibility > >>>> reasons, but that's only my opinion. > >>> > >>> I'm not sure how the compatibility check reads are special. The code is > >>> either correct or not. If a compatibility check has correctness problems > >>> then it's pretty much as bad as any other part of the code having such > >>> problems, no? > >> > >> If it reads and verifies a values first, e.g. index into some internal > >> array, and then compiler plays a joke and reloads it, we might be > >> absolutely screwed expecting 'segfaults', kernel data leakages and all > >> the fun stuff. > >> > >> If that's a compatibility check, whether it's loaded earlier or later, > >> or whatever, it's not a big deal, the userspace can in any case change > >> the memory at any moment it wishes, even tightly around the moment > >> we're reading it. > > > > Sorry for the slow reply, I have to balance this with my actual job that > > is not directly related to the kernel development :) > > > > I'm no kernel concurrency expert (actually I'm not any kind of kernel > > expert), but my understanding is READ_ONCE does not just mean "do not > > read more than once", but rather "read exactly once" (and more than > > that), and if it's not applied then the compiler is within its rights to > > optimize the read out, so the compatibility check can effectively be > > disabled. > > Yep, as they say it's about all the "inventive" transformations > compilers can do, double read is just one of those that may turn very > nasty for us. > > One big difference for me is whether it have a potential to crash the > kernel or not, though it's just one side. Ah, that makes sense. > Compilers can't drop the check just because, it first should be proven > to be safe to do, and there are all sorts barriers around and > limitations on how CQEs and SQEs are used, making impossible to alias > memory. E.g. CQEs and SQEs can't be reused in a single syscall, they're > only written and read respectively, and so on. Maybe, the only one I'd > worry about is the call to io_commit_sqring(), i.e. for SQE reads not > happening after it, but we need to take a look whether it's > theoretically possible. Thanks for the explanation, Pavel!
On 7/12/21 1:44 PM, Dmitry Kadashev wrote: > On Wed, Jul 7, 2021 at 9:06 PM Pavel Begunkov <asml.silence@gmail.com> wrote: >> On 6/28/21 9:17 AM, Dmitry Kadashev wrote: >>> On Thu, Jun 24, 2021 at 7:22 PM Pavel Begunkov <asml.silence@gmail.com> wrote: >>>> On 6/24/21 12:11 PM, Dmitry Kadashev wrote: >>>>> On Wed, Jun 23, 2021 at 6:54 PM Pavel Begunkov <asml.silence@gmail.com> wrote: >>>>>> On 6/23/21 7:41 AM, Dmitry Kadashev wrote: >>>>>>> I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of >>>>>>> the existing checks like this lack it too btw. I suppose I can fix those in a >>>>>>> separate commit if that makes sense. >>>>>> >>>>>> When we really use a field there should be a READ_ONCE(), >>>>>> but I wouldn't care about those we check for compatibility >>>>>> reasons, but that's only my opinion. >>>>> >>>>> I'm not sure how the compatibility check reads are special. The code is >>>>> either correct or not. If a compatibility check has correctness problems >>>>> then it's pretty much as bad as any other part of the code having such >>>>> problems, no? >>>> >>>> If it reads and verifies a values first, e.g. index into some internal >>>> array, and then compiler plays a joke and reloads it, we might be >>>> absolutely screwed expecting 'segfaults', kernel data leakages and all >>>> the fun stuff. >>>> >>>> If that's a compatibility check, whether it's loaded earlier or later, >>>> or whatever, it's not a big deal, the userspace can in any case change >>>> the memory at any moment it wishes, even tightly around the moment >>>> we're reading it. >>> >>> Sorry for the slow reply, I have to balance this with my actual job that >>> is not directly related to the kernel development :) >>> >>> I'm no kernel concurrency expert (actually I'm not any kind of kernel >>> expert), but my understanding is READ_ONCE does not just mean "do not >>> read more than once", but rather "read exactly once" (and more than >>> that), and if it's not applied then the compiler is within its rights to >>> optimize the read out, so the compatibility check can effectively be >>> disabled. >> >> Yep, as they say it's about all the "inventive" transformations >> compilers can do, double read is just one of those that may turn very >> nasty for us. >> >> One big difference for me is whether it have a potential to crash the >> kernel or not, though it's just one side. > > Ah, that makes sense. > >> Compilers can't drop the check just because, it first should be proven >> to be safe to do, and there are all sorts barriers around and >> limitations on how CQEs and SQEs are used, making impossible to alias >> memory. E.g. CQEs and SQEs can't be reused in a single syscall, they're >> only written and read respectively, and so on. Maybe, the only one I'd >> worry about is the call to io_commit_sqring(), i.e. for SQE reads not >> happening after it, but we need to take a look whether it's >> theoretically possible. > > Thanks for the explanation, Pavel! btw, that was for why we were rather reluctant about that. It could be a good idea to add READ_ONCE as you mentioned, at least would be less confusing.
diff --git a/fs/io_uring.c b/fs/io_uring.c index a1ca6badff36..8ab4eb559520 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -665,6 +665,13 @@ struct io_unlink { struct filename *filename; }; +struct io_mkdir { + struct file *file; + int dfd; + umode_t mode; + struct filename *filename; +}; + struct io_completion { struct file *file; struct list_head list; @@ -809,6 +816,7 @@ struct io_kiocb { struct io_shutdown shutdown; struct io_rename rename; struct io_unlink unlink; + struct io_mkdir mkdir; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -1021,6 +1029,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_RENAMEAT] = {}, [IORING_OP_UNLINKAT] = {}, + [IORING_OP_MKDIRAT] = {}, }; static bool io_disarm_next(struct io_kiocb *req); @@ -3530,6 +3539,44 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_mkdirat_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_mkdir *mkd = &req->mkdir; + const char __user *fname; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + mkd->dfd = READ_ONCE(sqe->fd); + mkd->mode = READ_ONCE(sqe->len); + + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + mkd->filename = getname(fname); + if (IS_ERR(mkd->filename)) + return PTR_ERR(mkd->filename); + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_mkdirat(struct io_kiocb *req, int issue_flags) +{ + struct io_mkdir *mkd = &req->mkdir; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail_links(req); + io_req_complete(req, ret); + return 0; +} + static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -5936,6 +5983,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_renameat_prep(req, sqe); case IORING_OP_UNLINKAT: return io_unlinkat_prep(req, sqe); + case IORING_OP_MKDIRAT: + return io_mkdirat_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6077,6 +6126,9 @@ static void io_clean_op(struct io_kiocb *req) case IORING_OP_UNLINKAT: putname(req->unlink.filename); break; + case IORING_OP_MKDIRAT: + putname(req->mkdir.filename); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; } @@ -6203,6 +6255,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_UNLINKAT: ret = io_unlinkat(req, issue_flags); break; + case IORING_OP_MKDIRAT: + ret = io_mkdirat(req, issue_flags); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e1ae46683301..bf9d720d371f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -137,6 +137,7 @@ enum { IORING_OP_SHUTDOWN, IORING_OP_RENAMEAT, IORING_OP_UNLINKAT, + IORING_OP_MKDIRAT, /* this goes last, obviously */ IORING_OP_LAST,
IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags and arguments. Signed-off-by: Dmitry Kadashev <dkadashev@gmail.com> --- fs/io_uring.c | 55 +++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 56 insertions(+)