Message ID | df0e24ff-f3a0-4818-8282-2a4e03b7b5a6@kernel.dk (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | io_uring/openclose: add support for IORING_OP_FIXED_FD_INSTALL | expand |
Here's a sample test case showing the various transformations and testing validity. /* SPDX-License-Identifier: MIT */ /* * Description: test installing a direct descriptor into the regular * file table * */ #include <errno.h> #include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <string.h> #include <fcntl.h> #include "liburing.h" #include "helpers.h" static int no_fd_install; static void io_uring_prep_fixed_fd_install(struct io_uring_sqe *sqe, int fd) { memset(sqe, 0, sizeof(*sqe)); sqe->opcode = IORING_OP_FIXED_FD_INSTALL; sqe->fd = fd; } int main(int argc, char *argv[]) { struct io_uring_sqe *sqe; struct io_uring_cqe *cqe; struct io_uring ring; int ret, fds[2]; char buf[32]; if (argc > 1) return T_EXIT_SKIP; ret = io_uring_queue_init(1, &ring, 0); if (ret) { fprintf(stderr, "ring setup failed: %d\n", ret); return T_EXIT_FAIL; } if (pipe(fds) < 0) { perror("pipe"); return T_EXIT_FAIL; } /* register read side */ ret = io_uring_register_files(&ring, &fds[0], 1); if (ret) { fprintf(stderr, "failed register files %d\n", ret); return T_EXIT_FAIL; } /* close normal descriptor */ close(fds[0]); /* normal read should fail */ ret = read(fds[0], buf, 1); if (ret != -1) { fprintf(stderr, "unexpected read ret %d\n", ret); return T_EXIT_FAIL; } if (errno != EBADF) { fprintf(stderr, "unexpected read failure %d\n", errno); return T_EXIT_FAIL; } /* verify we can read the data */ sqe = io_uring_get_sqe(&ring); io_uring_prep_read(sqe, 0, buf, sizeof(buf), 0); sqe->flags = IOSQE_FIXED_FILE; io_uring_submit(&ring); /* put some data in the pipe */ write(fds[1], "Hello", 5); ret = io_uring_wait_cqe(&ring, &cqe); if (ret) { fprintf(stderr, "wait cqe %d\n", ret); return T_EXIT_FAIL; } if (cqe->res != 5) { fprintf(stderr, "weird pipe read ret %d\n", cqe->res); return T_EXIT_FAIL; } io_uring_cqe_seen(&ring, cqe); /* fixed pipe read worked, now re-install as a regular fd */ sqe = io_uring_get_sqe(&ring); io_uring_prep_fixed_fd_install(sqe, 0); sqe->flags = IOSQE_FIXED_FILE; io_uring_submit(&ring); ret = io_uring_wait_cqe(&ring, &cqe); if (ret) { fprintf(stderr, "wait cqe %d\n", ret); return T_EXIT_FAIL; } if (cqe->res == -EINVAL) { no_fd_install = 1; return T_EXIT_SKIP; } if (cqe->res < 0) { fprintf(stderr, "failed install fd: %d\n", cqe->res); return T_EXIT_FAIL; } /* stash new pipe read side fd in old spot */ fds[0] = cqe->res; io_uring_cqe_seen(&ring, cqe); write(fds[1], "Hello", 5); /* normal pipe read should now work with new fd */ ret = read(fds[0], buf, sizeof(buf)); if (ret != 5) { fprintf(stderr, "unexpected read ret %d\n", ret); return T_EXIT_FAIL; } /* close fixed file */ sqe = io_uring_get_sqe(&ring); io_uring_prep_close_direct(sqe, 0); io_uring_submit(&ring); ret = io_uring_wait_cqe(&ring, &cqe); if (ret) { fprintf(stderr, "wait cqe %d\n", ret); return T_EXIT_FAIL; } if (cqe->res) { fprintf(stderr, "close fixed fd %d\n", cqe->res); return T_EXIT_FAIL; } io_uring_cqe_seen(&ring, cqe); write(fds[1], "Hello", 5); /* normal pipe read should still work with new fd */ ret = read(fds[0], buf, sizeof(buf)); if (ret != 5) { fprintf(stderr, "unexpected read ret %d\n", ret); return T_EXIT_FAIL; } /* fixed fd pipe read should now fail */ sqe = io_uring_get_sqe(&ring); io_uring_prep_read(sqe, 0, buf, sizeof(buf), 0); sqe->flags = IOSQE_FIXED_FILE; io_uring_submit(&ring); /* put some data in the pipe */ write(fds[1], "Hello", 5); ret = io_uring_wait_cqe(&ring, &cqe); if (ret) { fprintf(stderr, "wait cqe %d\n", ret); return T_EXIT_FAIL; } if (cqe->res != -EBADF) { fprintf(stderr, "weird pipe read ret %d\n", cqe->res); return T_EXIT_FAIL; } io_uring_cqe_seen(&ring, cqe); return T_EXIT_PASS; }
On Thu, Dec 07, 2023 at 08:11:45PM -0700, Jens Axboe wrote: > io_uring can currently open/close regular files or fixed/direct > descriptors. Or you can instantiate a fixed descriptor from a regular > one, and then close the regular descriptor. But you currently can't turn > a purely fixed/direct descriptor into a regular file descriptor. > > IORING_OP_FIXED_FD_INSTALL adds support for installing a direct > descriptor into the normal file table, just like receiving a file > descriptor or opening a new file would do. This is all nicely abstracted > into receive_fd(), and hence adding support for this is truly trivial. > > Since direct descriptors are only usable within io_uring itself, it can > be useful to turn them into real file descriptors if they ever need to > be accessed via normal syscalls. This can either be a transitory thing, > or just a permanent transition for a given direct descriptor. > > Signed-off-by: Jens Axboe <axboe@kernel.dk> > --- Would you mind giving me a Suggested-by? So this interesting. Usually we allow to receive fds if the task we're taking a reference to a file from: (1) is cooperating: SCM_RIGHTS (2) is the same task that's taking an addition reference: dup*(), seccomp notify addfd Both cases ensure that the file->f_count == and fdtable->count == 1 optimizations continue to work correctly in the regular system call fdget*() routines. I guess that this is a variant of (2). Effectively a dup*() from a private file table into the regular fdtable. So I just want to make sure we don't break this here because we broke that on accident before (pidfd_getfd()): A fixed file always has file->f_count == 1. The private io_uring fdtable is: * shared between all io workers? * accessible to all processes that have mapped the io_uring ring? And fixed files can only be used from within io_uring itself. So multiple caller might issue fixed file rquests to different io workers for concurrent operations. The file->f_count stays at 1 for all of them. Someone now requests a regular fd for that fixed file. So now an fixed-to-fd requests comes in. The file->f_count is bumped > 1 and that fd is installed into the fdtable of the caller through one of the io worker threads spawned for that caller? Assume caller now issues a readdir() operation on that fd. file->f_count will be > 1 for as long as it's used as a fixed file. So regular system call fdget*() locking rules should work even with that new extension added. I think that's sound. But I want to share some doubts as well. I think the concept of fixed files is pretty neat. But it poses a new semantic challenge to userspace. io_uring_setup() gives you an fd back that you use to mmap() the io_uring. You make that io_uring_setup() fd a fixed file. You close the regular fd but now the io_uring instance stays around. You create and register a bunch of fixed files. So now you've set up an io_uring instance that is completely made up of fixed files including the io_uring file itself. So a process that assumes that once it has closed all fds in its fdtable of all threads that all resources have been released. But that's not true now. At least for some pretty obvious cases such as filesystems. For example, io_uring could just pin random filesystems in there. Say it holds open a file in a btrfs mount and the service manager having closed all fds now tries unmount it and fails inexplicably. So effectively, fixed files let you completely hide files and that's potentially a problem. So cool idea but this might cause us some trouble later down the road. Reviewed-by: Christian Brauner <brauner@kernel.org> Two comments below. > include/uapi/linux/io_uring.h | 2 ++ > io_uring/opdef.c | 9 +++++++++ > io_uring/openclose.c | 37 +++++++++++++++++++++++++++++++++++ > io_uring/openclose.h | 3 +++ > 4 files changed, 51 insertions(+) > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index f1c16f817742..af82aab9e632 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -71,6 +71,7 @@ struct io_uring_sqe { > __u32 uring_cmd_flags; > __u32 waitid_flags; > __u32 futex_flags; > + __u32 install_fd_flags; > }; > __u64 user_data; /* data to be passed back at completion time */ > /* pack this to avoid bogus arm OABI complaints */ > @@ -253,6 +254,7 @@ enum io_uring_op { > IORING_OP_FUTEX_WAIT, > IORING_OP_FUTEX_WAKE, > IORING_OP_FUTEX_WAITV, > + IORING_OP_FIXED_FD_INSTALL, > > /* this goes last, obviously */ > IORING_OP_LAST, > diff --git a/io_uring/opdef.c b/io_uring/opdef.c > index 799db44283c7..6705634e5f52 100644 > --- a/io_uring/opdef.c > +++ b/io_uring/opdef.c > @@ -469,6 +469,12 @@ const struct io_issue_def io_issue_defs[] = { > .prep = io_eopnotsupp_prep, > #endif > }, > + [IORING_OP_FIXED_FD_INSTALL] = { > + .needs_file = 1, > + .audit_skip = 1, > + .prep = io_install_fixed_fd_prep, > + .issue = io_install_fixed_fd, > + }, > }; > > const struct io_cold_def io_cold_defs[] = { > @@ -704,6 +710,9 @@ const struct io_cold_def io_cold_defs[] = { > [IORING_OP_FUTEX_WAITV] = { > .name = "FUTEX_WAITV", > }, > + [IORING_OP_FIXED_FD_INSTALL] = { > + .name = "FIXED_FD_INSTALL", > + }, > }; > > const char *io_uring_get_opcode(u8 opcode) > diff --git a/io_uring/openclose.c b/io_uring/openclose.c > index fb73adb89067..5b8f79edef26 100644 > --- a/io_uring/openclose.c > +++ b/io_uring/openclose.c > @@ -31,6 +31,11 @@ struct io_close { > u32 file_slot; > }; > > +struct io_fixed_install { > + struct file *file; > + int flags; > +}; > + > static bool io_openat_force_async(struct io_open *open) > { > /* > @@ -254,3 +259,35 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags) > io_req_set_res(req, ret, 0); > return IOU_OK; > } > + > +int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > +{ > + struct io_fixed_install *ifi; > + > + if (sqe->off || sqe->addr || sqe->len || sqe->buf_index || > + sqe->splice_fd_in || sqe->addr3) > + return -EINVAL; > + > + /* must be a fixed file */ > + if (!(req->flags & REQ_F_FIXED_FILE)) > + return -EBADF; > + > + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); > + > + /* really just O_CLOEXEC or not */ > + ifi->flags = READ_ONCE(sqe->install_fd_flags); I'm a big big fan of having all new fd returning apis return their fds O_CLOEXEC by default and forcing userspace to explicitly turn this off via fcntl(). pidfds are cloexec by default, so are seccomp notifier fds. > + return 0; > +} > + > +int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) > +{ > + struct io_fixed_install *ifi; > + int ret; > + > + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); > + ret = receive_fd(req->file, ifi->flags); After changes currently in vfs.misc the helper is now: receive_fd(req->file, NULL, ifi->flags) So fyi, this will cause a build failure in -next. > + if (ret < 0) > + req_set_fail(req); > + io_req_set_res(req, ret, 0); > + return IOU_OK; > +} > diff --git a/io_uring/openclose.h b/io_uring/openclose.h > index 4b1c28d3a66c..8a93c98ad0ad 100644 > --- a/io_uring/openclose.h > +++ b/io_uring/openclose.h > @@ -12,3 +12,6 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags); > > int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); > int io_close(struct io_kiocb *req, unsigned int issue_flags); > + > +int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); > +int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags); > -- > 2.42.0 > > -- > Jens Axboe >
On 12/8/23 2:14 PM, Christian Brauner wrote: > On Thu, Dec 07, 2023 at 08:11:45PM -0700, Jens Axboe wrote: >> io_uring can currently open/close regular files or fixed/direct >> descriptors. Or you can instantiate a fixed descriptor from a regular >> one, and then close the regular descriptor. But you currently can't turn >> a purely fixed/direct descriptor into a regular file descriptor. >> >> IORING_OP_FIXED_FD_INSTALL adds support for installing a direct >> descriptor into the normal file table, just like receiving a file >> descriptor or opening a new file would do. This is all nicely abstracted >> into receive_fd(), and hence adding support for this is truly trivial. >> >> Since direct descriptors are only usable within io_uring itself, it can >> be useful to turn them into real file descriptors if they ever need to >> be accessed via normal syscalls. This can either be a transitory thing, >> or just a permanent transition for a given direct descriptor. >> >> Signed-off-by: Jens Axboe <axboe@kernel.dk> >> --- > > Would you mind giving me a Suggested-by? Sure, I can do that. I'll send out a v2, added a few comments and also changed the flags location. But no real changes otherwise. > So this interesting. Usually we allow to receive fds if the task we're > taking a reference to a file from: > > (1) is cooperating: SCM_RIGHTS > (2) is the same task that's taking an addition reference: dup*(), > seccomp notify addfd > > Both cases ensure that the file->f_count == and fdtable->count == 1 > optimizations continue to work correctly in the regular system call > fdget*() routines. > > I guess that this is a variant of (2). Effectively a dup*() from a > private file table into the regular fdtable. Right. > So I just want to make sure we don't break this here because we broke > that on accident before (pidfd_getfd()): > > A fixed file always has file->f_count == 1. The private io_uring fdtable > is: It doesn't necessarily always have f_count of 1. One way to instantiate a direct descriptor is to open the file normally, then register it. If the task doesn't close the normal file descriptor, then it'd have an elevated count of 2. But normally you'd expect the normal file to be closed, or the file opened/instantiated as a direct descriptor upfront. In that case, it should have a ref of 1. > * shared between all io workers? Yep, they are just like threads in that sense and share the file table. > * accessible to all processes that have mapped the io_uring ring? The direct descriptors are just like a file_table, except it's private to the ring itself. If you can invoke io_uring_enter(2) on that ring, then you have access to that direct descriptor table. > And fixed files can only be used from within io_uring itself. Correct, the index only makes sense within a specific ring. > So multiple caller might issue fixed file rquests to different io > workers for concurrent operations. The file->f_count stays at 1 for all > of them. Someone now requests a regular fd for that fixed file. > > So now an fixed-to-fd requests comes in. The file->f_count is bumped > 1 > and that fd is installed into the fdtable of the caller through one of > the io worker threads spawned for that caller? No worker thread is involved for this, unless you asked for that by eg setting IOSQE_ASYNC in the sqe before it is submitted. The default would be that the task that does io_uring_enter(2) and submits that sqe would be the one that installs the file X with direct descriptor Y into its normal file table. > Assume caller now issues a readdir() operation on that fd. file->f_count > will be > 1 for as long as it's used as a fixed file. So regular system > call fdget*() locking rules should work even with that new extension > added. > > I think that's sound. Right, I don't think this changes fget/fdget rules. > But I want to share some doubts as well. I think the concept of fixed > files is pretty neat. But it poses a new semantic challenge to > userspace. > > io_uring_setup() gives you an fd back that you use to mmap() the > io_uring. You make that io_uring_setup() fd a fixed file. You close the > regular fd but now the io_uring instance stays around. You create and > register a bunch of fixed files. > > So now you've set up an io_uring instance that is completely made up of > fixed files including the io_uring file itself. > > So a process that assumes that once it has closed all fds in its fdtable > of all threads that all resources have been released. > > But that's not true now. At least for some pretty obvious cases such as > filesystems. For example, io_uring could just pin random filesystems in > there. Say it holds open a file in a btrfs mount and the service > manager having closed all fds now tries unmount it and fails > inexplicably. > > So effectively, fixed files let you completely hide files and that's > potentially a problem. So cool idea but this might cause us some trouble > later down the road. That is true, you'd have to explicitly or implicitly cancel (this happens at exec time too, for example) requests and then unregister the fixed files to release those resources. Or teardown the ring, which would do both. Or just not close the ring fd itself even if it's registered, then it would continue to reside in the normal file table as well. > Reviewed-by: Christian Brauner <brauner@kernel.org> Thanks! > Two comments below. > >> include/uapi/linux/io_uring.h | 2 ++ >> io_uring/opdef.c | 9 +++++++++ >> io_uring/openclose.c | 37 +++++++++++++++++++++++++++++++++++ >> io_uring/openclose.h | 3 +++ >> 4 files changed, 51 insertions(+) >> >> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >> index f1c16f817742..af82aab9e632 100644 >> --- a/include/uapi/linux/io_uring.h >> +++ b/include/uapi/linux/io_uring.h >> @@ -71,6 +71,7 @@ struct io_uring_sqe { >> __u32 uring_cmd_flags; >> __u32 waitid_flags; >> __u32 futex_flags; >> + __u32 install_fd_flags; >> }; >> __u64 user_data; /* data to be passed back at completion time */ >> /* pack this to avoid bogus arm OABI complaints */ >> @@ -253,6 +254,7 @@ enum io_uring_op { >> IORING_OP_FUTEX_WAIT, >> IORING_OP_FUTEX_WAKE, >> IORING_OP_FUTEX_WAITV, >> + IORING_OP_FIXED_FD_INSTALL, >> >> /* this goes last, obviously */ >> IORING_OP_LAST, >> diff --git a/io_uring/opdef.c b/io_uring/opdef.c >> index 799db44283c7..6705634e5f52 100644 >> --- a/io_uring/opdef.c >> +++ b/io_uring/opdef.c >> @@ -469,6 +469,12 @@ const struct io_issue_def io_issue_defs[] = { >> .prep = io_eopnotsupp_prep, >> #endif >> }, >> + [IORING_OP_FIXED_FD_INSTALL] = { >> + .needs_file = 1, >> + .audit_skip = 1, >> + .prep = io_install_fixed_fd_prep, >> + .issue = io_install_fixed_fd, >> + }, >> }; >> >> const struct io_cold_def io_cold_defs[] = { >> @@ -704,6 +710,9 @@ const struct io_cold_def io_cold_defs[] = { >> [IORING_OP_FUTEX_WAITV] = { >> .name = "FUTEX_WAITV", >> }, >> + [IORING_OP_FIXED_FD_INSTALL] = { >> + .name = "FIXED_FD_INSTALL", >> + }, >> }; >> >> const char *io_uring_get_opcode(u8 opcode) >> diff --git a/io_uring/openclose.c b/io_uring/openclose.c >> index fb73adb89067..5b8f79edef26 100644 >> --- a/io_uring/openclose.c >> +++ b/io_uring/openclose.c >> @@ -31,6 +31,11 @@ struct io_close { >> u32 file_slot; >> }; >> >> +struct io_fixed_install { >> + struct file *file; >> + int flags; >> +}; >> + >> static bool io_openat_force_async(struct io_open *open) >> { >> /* >> @@ -254,3 +259,35 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags) >> io_req_set_res(req, ret, 0); >> return IOU_OK; >> } >> + >> +int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) >> +{ >> + struct io_fixed_install *ifi; >> + >> + if (sqe->off || sqe->addr || sqe->len || sqe->buf_index || >> + sqe->splice_fd_in || sqe->addr3) >> + return -EINVAL; >> + >> + /* must be a fixed file */ >> + if (!(req->flags & REQ_F_FIXED_FILE)) >> + return -EBADF; >> + >> + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); >> + >> + /* really just O_CLOEXEC or not */ >> + ifi->flags = READ_ONCE(sqe->install_fd_flags); > > I'm a big big fan of having all new fd returning apis return their fds > O_CLOEXEC by default and forcing userspace to explicitly turn this off > via fcntl(). pidfds are cloexec by default, so are seccomp notifier fds. io_uring fd itself is also O_CLOEXEC by default. We can certainly make this tweak here, but it is directly configurable by the task that issues the sqe. If you want O_CLOEXEC, then you should set it. That said, not opposed to making this the default. But it does mean I'd have to define a private opcode flag for this, so it can be turned off. At least that seems saner than needing to do fcntl() after the fact. This isn't a huge issue and we can certainly do that. Let me know what you prefer! >> + return 0; >> +} >> + >> +int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) >> +{ >> + struct io_fixed_install *ifi; >> + int ret; >> + >> + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); >> + ret = receive_fd(req->file, ifi->flags); > > After changes currently in vfs.misc the helper is now: > > receive_fd(req->file, NULL, ifi->flags) > > So fyi, this will cause a build failure in -next. Ah, I should probably pull that in first then. Thanks for the headsup.
On Fri, Dec 08, 2023 at 02:49:37PM -0700, Jens Axboe wrote: > On 12/8/23 2:14 PM, Christian Brauner wrote: > > On Thu, Dec 07, 2023 at 08:11:45PM -0700, Jens Axboe wrote: > >> io_uring can currently open/close regular files or fixed/direct > >> descriptors. Or you can instantiate a fixed descriptor from a regular > >> one, and then close the regular descriptor. But you currently can't turn > >> a purely fixed/direct descriptor into a regular file descriptor. > >> > >> IORING_OP_FIXED_FD_INSTALL adds support for installing a direct > >> descriptor into the normal file table, just like receiving a file > >> descriptor or opening a new file would do. This is all nicely abstracted > >> into receive_fd(), and hence adding support for this is truly trivial. > >> > >> Since direct descriptors are only usable within io_uring itself, it can > >> be useful to turn them into real file descriptors if they ever need to > >> be accessed via normal syscalls. This can either be a transitory thing, > >> or just a permanent transition for a given direct descriptor. > >> > >> Signed-off-by: Jens Axboe <axboe@kernel.dk> > >> --- > > > > Would you mind giving me a Suggested-by? > > Sure, I can do that. I'll send out a v2, added a few comments and also > changed the flags location. But no real changes otherwise. > > > So this interesting. Usually we allow to receive fds if the task we're > > taking a reference to a file from: > > > > (1) is cooperating: SCM_RIGHTS > > (2) is the same task that's taking an addition reference: dup*(), > > seccomp notify addfd > > > > Both cases ensure that the file->f_count == and fdtable->count == 1 > > optimizations continue to work correctly in the regular system call > > fdget*() routines. > > > > I guess that this is a variant of (2). Effectively a dup*() from a > > private file table into the regular fdtable. > > Right. > > > So I just want to make sure we don't break this here because we broke > > that on accident before (pidfd_getfd()): > > > > A fixed file always has file->f_count == 1. The private io_uring fdtable > > is: > > It doesn't necessarily always have f_count of 1. One way to instantiate > a direct descriptor is to open the file normally, then register it. If > the task doesn't close the normal file descriptor, then it'd have an > elevated count of 2. But normally you'd expect the normal file to be > closed, or the file opened/instantiated as a direct descriptor upfront. > In that case, it should have a ref of 1. Yes, of course. I was thinking of the two common cases: (1) direct-to-fixed (2) open-regular-fd + egister-as-fixed + close-fd > > > * shared between all io workers? > > Yep, they are just like threads in that sense and share the file table. > > > * accessible to all processes that have mapped the io_uring ring? > > The direct descriptors are just like a file_table, except it's private > to the ring itself. If you can invoke io_uring_enter(2) on that ring, > then you have access to that direct descriptor table. > > > And fixed files can only be used from within io_uring itself. > > Correct, the index only makes sense within a specific ring. > > > So multiple caller might issue fixed file rquests to different io > > workers for concurrent operations. The file->f_count stays at 1 for all > > of them. Someone now requests a regular fd for that fixed file. > > > > So now an fixed-to-fd requests comes in. The file->f_count is bumped > 1 > > and that fd is installed into the fdtable of the caller through one of > > the io worker threads spawned for that caller? > > No worker thread is involved for this, unless you asked for that by eg > setting IOSQE_ASYNC in the sqe before it is submitted. The default would But it is possible, that's what I wondered. But that's ok because the io worker is just like a regular thread of that process. > be that the task that does io_uring_enter(2) and submits that sqe would > be the one that installs the file X with direct descriptor Y into its > normal file table. Ok, cool. > > > Assume caller now issues a readdir() operation on that fd. file->f_count > > will be > 1 for as long as it's used as a fixed file. So regular system > > call fdget*() locking rules should work even with that new extension > > added. > > > > I think that's sound. > > Right, I don't think this changes fget/fdget rules. Great. > > > But I want to share some doubts as well. I think the concept of fixed > > files is pretty neat. But it poses a new semantic challenge to > > userspace. > > > > io_uring_setup() gives you an fd back that you use to mmap() the > > io_uring. You make that io_uring_setup() fd a fixed file. You close the > > regular fd but now the io_uring instance stays around. You create and > > register a bunch of fixed files. > > > > So now you've set up an io_uring instance that is completely made up of > > fixed files including the io_uring file itself. > > > > So a process that assumes that once it has closed all fds in its fdtable > > of all threads that all resources have been released. > > > > But that's not true now. At least for some pretty obvious cases such as > > filesystems. For example, io_uring could just pin random filesystems in > > there. Say it holds open a file in a btrfs mount and the service > > manager having closed all fds now tries unmount it and fails > > inexplicably. > > > > So effectively, fixed files let you completely hide files and that's > > potentially a problem. So cool idea but this might cause us some trouble > > later down the road. > > That is true, you'd have to explicitly or implicitly cancel (this > happens at exec time too, for example) requests and then unregister the > fixed files to release those resources. Or teardown the ring, which > would do both. Or just not close the ring fd itself even if it's > registered, then it would continue to reside in the normal file table as > well. > > > Reviewed-by: Christian Brauner <brauner@kernel.org> > > Thanks! > > > Two comments below. > > > >> include/uapi/linux/io_uring.h | 2 ++ > >> io_uring/opdef.c | 9 +++++++++ > >> io_uring/openclose.c | 37 +++++++++++++++++++++++++++++++++++ > >> io_uring/openclose.h | 3 +++ > >> 4 files changed, 51 insertions(+) > >> > >> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > >> index f1c16f817742..af82aab9e632 100644 > >> --- a/include/uapi/linux/io_uring.h > >> +++ b/include/uapi/linux/io_uring.h > >> @@ -71,6 +71,7 @@ struct io_uring_sqe { > >> __u32 uring_cmd_flags; > >> __u32 waitid_flags; > >> __u32 futex_flags; > >> + __u32 install_fd_flags; > >> }; > >> __u64 user_data; /* data to be passed back at completion time */ > >> /* pack this to avoid bogus arm OABI complaints */ > >> @@ -253,6 +254,7 @@ enum io_uring_op { > >> IORING_OP_FUTEX_WAIT, > >> IORING_OP_FUTEX_WAKE, > >> IORING_OP_FUTEX_WAITV, > >> + IORING_OP_FIXED_FD_INSTALL, > >> > >> /* this goes last, obviously */ > >> IORING_OP_LAST, > >> diff --git a/io_uring/opdef.c b/io_uring/opdef.c > >> index 799db44283c7..6705634e5f52 100644 > >> --- a/io_uring/opdef.c > >> +++ b/io_uring/opdef.c > >> @@ -469,6 +469,12 @@ const struct io_issue_def io_issue_defs[] = { > >> .prep = io_eopnotsupp_prep, > >> #endif > >> }, > >> + [IORING_OP_FIXED_FD_INSTALL] = { > >> + .needs_file = 1, > >> + .audit_skip = 1, > >> + .prep = io_install_fixed_fd_prep, > >> + .issue = io_install_fixed_fd, > >> + }, > >> }; > >> > >> const struct io_cold_def io_cold_defs[] = { > >> @@ -704,6 +710,9 @@ const struct io_cold_def io_cold_defs[] = { > >> [IORING_OP_FUTEX_WAITV] = { > >> .name = "FUTEX_WAITV", > >> }, > >> + [IORING_OP_FIXED_FD_INSTALL] = { > >> + .name = "FIXED_FD_INSTALL", > >> + }, > >> }; > >> > >> const char *io_uring_get_opcode(u8 opcode) > >> diff --git a/io_uring/openclose.c b/io_uring/openclose.c > >> index fb73adb89067..5b8f79edef26 100644 > >> --- a/io_uring/openclose.c > >> +++ b/io_uring/openclose.c > >> @@ -31,6 +31,11 @@ struct io_close { > >> u32 file_slot; > >> }; > >> > >> +struct io_fixed_install { > >> + struct file *file; > >> + int flags; > >> +}; > >> + > >> static bool io_openat_force_async(struct io_open *open) > >> { > >> /* > >> @@ -254,3 +259,35 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags) > >> io_req_set_res(req, ret, 0); > >> return IOU_OK; > >> } > >> + > >> +int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > >> +{ > >> + struct io_fixed_install *ifi; > >> + > >> + if (sqe->off || sqe->addr || sqe->len || sqe->buf_index || > >> + sqe->splice_fd_in || sqe->addr3) > >> + return -EINVAL; > >> + > >> + /* must be a fixed file */ > >> + if (!(req->flags & REQ_F_FIXED_FILE)) > >> + return -EBADF; > >> + > >> + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); > >> + > >> + /* really just O_CLOEXEC or not */ > >> + ifi->flags = READ_ONCE(sqe->install_fd_flags); > > > > I'm a big big fan of having all new fd returning apis return their fds > > O_CLOEXEC by default and forcing userspace to explicitly turn this off > > via fcntl(). pidfds are cloexec by default, so are seccomp notifier fds. > > io_uring fd itself is also O_CLOEXEC by default. We can certainly make > this tweak here, but it is directly configurable by the task that issues > the sqe. If you want O_CLOEXEC, then you should set it. > > That said, not opposed to making this the default. But it does mean I'd > have to define a private opcode flag for this, so it can be turned off. > At least that seems saner than needing to do fcntl() after the fact. > This isn't a huge issue and we can certainly do that. Let me know what > you prefer! Meh, new opcode would suck. Don't deviate from the standard apis then.
On 12/8/23 2:56 PM, Christian Brauner wrote: > On Fri, Dec 08, 2023 at 02:49:37PM -0700, Jens Axboe wrote: >> On 12/8/23 2:14 PM, Christian Brauner wrote: >>> On Thu, Dec 07, 2023 at 08:11:45PM -0700, Jens Axboe wrote: >>>> io_uring can currently open/close regular files or fixed/direct >>>> descriptors. Or you can instantiate a fixed descriptor from a regular >>>> one, and then close the regular descriptor. But you currently can't turn >>>> a purely fixed/direct descriptor into a regular file descriptor. >>>> >>>> IORING_OP_FIXED_FD_INSTALL adds support for installing a direct >>>> descriptor into the normal file table, just like receiving a file >>>> descriptor or opening a new file would do. This is all nicely abstracted >>>> into receive_fd(), and hence adding support for this is truly trivial. >>>> >>>> Since direct descriptors are only usable within io_uring itself, it can >>>> be useful to turn them into real file descriptors if they ever need to >>>> be accessed via normal syscalls. This can either be a transitory thing, >>>> or just a permanent transition for a given direct descriptor. >>>> >>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>>> --- >>> >>> Would you mind giving me a Suggested-by? >> >> Sure, I can do that. I'll send out a v2, added a few comments and also >> changed the flags location. But no real changes otherwise. >> >>> So this interesting. Usually we allow to receive fds if the task we're >>> taking a reference to a file from: >>> >>> (1) is cooperating: SCM_RIGHTS >>> (2) is the same task that's taking an addition reference: dup*(), >>> seccomp notify addfd >>> >>> Both cases ensure that the file->f_count == and fdtable->count == 1 >>> optimizations continue to work correctly in the regular system call >>> fdget*() routines. >>> >>> I guess that this is a variant of (2). Effectively a dup*() from a >>> private file table into the regular fdtable. >> >> Right. >> >>> So I just want to make sure we don't break this here because we broke >>> that on accident before (pidfd_getfd()): >>> >>> A fixed file always has file->f_count == 1. The private io_uring fdtable >>> is: >> >> It doesn't necessarily always have f_count of 1. One way to instantiate >> a direct descriptor is to open the file normally, then register it. If >> the task doesn't close the normal file descriptor, then it'd have an >> elevated count of 2. But normally you'd expect the normal file to be >> closed, or the file opened/instantiated as a direct descriptor upfront. >> In that case, it should have a ref of 1. > > Yes, of course. I was thinking of the two common cases: > > (1) direct-to-fixed > (2) open-regular-fd + egister-as-fixed + close-fd > >> >>> * shared between all io workers? >> >> Yep, they are just like threads in that sense and share the file table. >> >>> * accessible to all processes that have mapped the io_uring ring? >> >> The direct descriptors are just like a file_table, except it's private >> to the ring itself. If you can invoke io_uring_enter(2) on that ring, >> then you have access to that direct descriptor table. >> >>> And fixed files can only be used from within io_uring itself. >> >> Correct, the index only makes sense within a specific ring. >> >>> So multiple caller might issue fixed file rquests to different io >>> workers for concurrent operations. The file->f_count stays at 1 for all >>> of them. Someone now requests a regular fd for that fixed file. >>> >>> So now an fixed-to-fd requests comes in. The file->f_count is bumped > 1 >>> and that fd is installed into the fdtable of the caller through one of >>> the io worker threads spawned for that caller? >> >> No worker thread is involved for this, unless you asked for that by eg >> setting IOSQE_ASYNC in the sqe before it is submitted. The default would > > But it is possible, that's what I wondered. But that's ok because the io > worker is just like a regular thread of that process. It's certainly possible, you are correct. It's just not the default. >>>> +int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) >>>> +{ >>>> + struct io_fixed_install *ifi; >>>> + >>>> + if (sqe->off || sqe->addr || sqe->len || sqe->buf_index || >>>> + sqe->splice_fd_in || sqe->addr3) >>>> + return -EINVAL; >>>> + >>>> + /* must be a fixed file */ >>>> + if (!(req->flags & REQ_F_FIXED_FILE)) >>>> + return -EBADF; >>>> + >>>> + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); >>>> + >>>> + /* really just O_CLOEXEC or not */ >>>> + ifi->flags = READ_ONCE(sqe->install_fd_flags); >>> >>> I'm a big big fan of having all new fd returning apis return their fds >>> O_CLOEXEC by default and forcing userspace to explicitly turn this off >>> via fcntl(). pidfds are cloexec by default, so are seccomp notifier fds. >> >> io_uring fd itself is also O_CLOEXEC by default. We can certainly make >> this tweak here, but it is directly configurable by the task that issues >> the sqe. If you want O_CLOEXEC, then you should set it. >> >> That said, not opposed to making this the default. But it does mean I'd >> have to define a private opcode flag for this, so it can be turned off. >> At least that seems saner than needing to do fcntl() after the fact. >> This isn't a huge issue and we can certainly do that. Let me know what >> you prefer! > > Meh, new opcode would suck. Don't deviate from the standard apis then. Not a new opcode, it'd just be a flag for that opcode. We default to O_CLOEXEC is nothing is given, and you can do: io_uring_prep_fixed_fd_install(sqe, fixed_index, IORING_FIXED_FD_NO_CLOEXEC); to simply set that flag to turn it off. Only reason I bring it up as a bit annoying is that it'd be cleaner to have it be part of the O_* namespace as O_NOCLOEXEC, but it's not a huge deal. It retains the part you cared about, which is making O_CLOEXEC the default, but retains the option of turning it off rather than needing to do an fcntl() to retrieve flags, mask it, then another fcntl().
> > Meh, new opcode would suck. Don't deviate from the standard apis then. > > Not a new opcode, it'd just be a flag for that opcode. We default to > O_CLOEXEC is nothing is given, and you can do: > > io_uring_prep_fixed_fd_install(sqe, fixed_index, IORING_FIXED_FD_NO_CLOEXEC); > > to simply set that flag to turn it off. Only reason I bring it up as a > bit annoying is that it'd be cleaner to have it be part of the O_* > namespace as O_NOCLOEXEC, but it's not a huge deal. > > It retains the part you cared about, which is making O_CLOEXEC the > default, but retains the option of turning it off rather than needing to > do an fcntl() to retrieve flags, mask it, then another fcntl(). Ok, sounds good.
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f1c16f817742..af82aab9e632 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -71,6 +71,7 @@ struct io_uring_sqe { __u32 uring_cmd_flags; __u32 waitid_flags; __u32 futex_flags; + __u32 install_fd_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -253,6 +254,7 @@ enum io_uring_op { IORING_OP_FUTEX_WAIT, IORING_OP_FUTEX_WAKE, IORING_OP_FUTEX_WAITV, + IORING_OP_FIXED_FD_INSTALL, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 799db44283c7..6705634e5f52 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -469,6 +469,12 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_FIXED_FD_INSTALL] = { + .needs_file = 1, + .audit_skip = 1, + .prep = io_install_fixed_fd_prep, + .issue = io_install_fixed_fd, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -704,6 +710,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_FUTEX_WAITV] = { .name = "FUTEX_WAITV", }, + [IORING_OP_FIXED_FD_INSTALL] = { + .name = "FIXED_FD_INSTALL", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/openclose.c b/io_uring/openclose.c index fb73adb89067..5b8f79edef26 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -31,6 +31,11 @@ struct io_close { u32 file_slot; }; +struct io_fixed_install { + struct file *file; + int flags; +}; + static bool io_openat_force_async(struct io_open *open) { /* @@ -254,3 +259,35 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_fixed_install *ifi; + + if (sqe->off || sqe->addr || sqe->len || sqe->buf_index || + sqe->splice_fd_in || sqe->addr3) + return -EINVAL; + + /* must be a fixed file */ + if (!(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); + + /* really just O_CLOEXEC or not */ + ifi->flags = READ_ONCE(sqe->install_fd_flags); + return 0; +} + +int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_fixed_install *ifi; + int ret; + + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); + ret = receive_fd(req->file, ifi->flags); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 4b1c28d3a66c..8a93c98ad0ad 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -12,3 +12,6 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags); int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_close(struct io_kiocb *req, unsigned int issue_flags); + +int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags);
io_uring can currently open/close regular files or fixed/direct descriptors. Or you can instantiate a fixed descriptor from a regular one, and then close the regular descriptor. But you currently can't turn a purely fixed/direct descriptor into a regular file descriptor. IORING_OP_FIXED_FD_INSTALL adds support for installing a direct descriptor into the normal file table, just like receiving a file descriptor or opening a new file would do. This is all nicely abstracted into receive_fd(), and hence adding support for this is truly trivial. Since direct descriptors are only usable within io_uring itself, it can be useful to turn them into real file descriptors if they ever need to be accessed via normal syscalls. This can either be a transitory thing, or just a permanent transition for a given direct descriptor. Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/uapi/linux/io_uring.h | 2 ++ io_uring/opdef.c | 9 +++++++++ io_uring/openclose.c | 37 +++++++++++++++++++++++++++++++++++ io_uring/openclose.h | 3 +++ 4 files changed, 51 insertions(+)