Message ID | 20241014171838.304953-6-maharmstone@fb.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | btrfs: encoded reads via io_uring | expand |
On Mon, Oct 14, 2024 at 06:18:27PM +0100, Mark Harmstone wrote: > Adds an io_uring command for encoded reads, using the same interface as Add ... > the existing BTRFS_IOC_ENCODED_READ ioctl. This is probably a good summary in a changelog but the patch is quite long so it feels like this should be described in a more detail how it's done. Connecting two interfaces can be done in various ways, so at least mention that it's a simple pass through, or if there are any complications regardign locking, object lifetime and such. > Signed-off-by: Mark Harmstone <maharmstone@fb.com> > --- > fs/btrfs/file.c | 1 + > fs/btrfs/ioctl.c | 283 +++++++++++++++++++++++++++++++++++++++++++++++ > fs/btrfs/ioctl.h | 1 + > 3 files changed, 285 insertions(+) > > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c > index 2aeb8116549c..e33ca73fef8c 100644 > --- a/fs/btrfs/file.c > +++ b/fs/btrfs/file.c > @@ -3774,6 +3774,7 @@ const struct file_operations btrfs_file_operations = { > .compat_ioctl = btrfs_compat_ioctl, > #endif > .remap_file_range = btrfs_remap_file_range, > + .uring_cmd = btrfs_uring_cmd, > .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, > }; > > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c > index 8c9ff4898ab0..c0393575cf5e 100644 > --- a/fs/btrfs/ioctl.c > +++ b/fs/btrfs/ioctl.c > @@ -29,6 +29,7 @@ > #include <linux/fileattr.h> > #include <linux/fsverity.h> > #include <linux/sched/xacct.h> > +#include <linux/io_uring/cmd.h> > #include "ctree.h" > #include "disk-io.h" > #include "export.h" > @@ -4723,6 +4724,288 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool > return ret; > } > > +struct btrfs_uring_priv { > + struct io_uring_cmd *cmd; > + struct page **pages; > + unsigned long nr_pages; > + struct kiocb iocb; > + struct iovec *iov; > + struct iov_iter iter; > + struct extent_state *cached_state; > + u64 count; > + bool compressed; This leaves a 7 byte hole. 
> + u64 start; > + u64 lockend; > +}; The whole structure should be documented and the members too if it's not obvious what they are used for. > + > +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, > + unsigned int issue_flags) > +{ > + struct btrfs_uring_priv *priv = (struct btrfs_uring_priv *)*(uintptr_t*)cmd->pdu; > + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); > + struct extent_io_tree *io_tree = &inode->io_tree; > + unsigned long i; Why is this long? > + u64 cur; > + size_t page_offset; > + ssize_t ret; > + > + if (priv->compressed) { > + i = 0; > + page_offset = 0; > + } else { > + i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; > + page_offset = (priv->iocb.ki_pos - priv->start) & (PAGE_SIZE - 1); Please don't open code page_offset() > + } > + cur = 0; > + while (cur < priv->count) { > + size_t bytes = min_t(size_t, priv->count - cur, > + PAGE_SIZE - page_offset); > + > + if (copy_page_to_iter(priv->pages[i], page_offset, bytes, > + &priv->iter) != bytes) { > + ret = -EFAULT; > + goto out; > + } > + > + i++; > + cur += bytes; > + page_offset = 0; > + } > + ret = priv->count; > + > +out: > + unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); > + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); > + > + io_uring_cmd_done(cmd, ret, 0, issue_flags); > + add_rchar(current, ret); > + > + for (unsigned long i = 0; i < priv->nr_pages; i++) { Shadowing 'i' of the same type as is declared in the function. Maybe don't call it just 'i' but index as it's used outside of a loop. > + __free_page(priv->pages[i]); > + } Please drop the outer { } for a single statement block. > + > + kfree(priv->pages); > + kfree(priv->iov); > + kfree(priv); > +} > + > +static void btrfs_uring_read_extent_cb(void *ctx, int err) > +{ > + struct btrfs_uring_priv *priv = ctx; > + > + *(uintptr_t*)priv->cmd->pdu = (uintptr_t)priv; Isn't there a helper for that? Type casting should be done in justified cases and as an exception. 
> + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); > +} > + > +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, > + u64 start, u64 lockend, > + struct extent_state *cached_state, > + u64 disk_bytenr, u64 disk_io_size, > + size_t count, bool compressed, > + struct iovec *iov, > + struct io_uring_cmd *cmd) > +{ > + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); > + struct extent_io_tree *io_tree = &inode->io_tree; > + struct page **pages; > + struct btrfs_uring_priv *priv = NULL; > + unsigned long nr_pages; > + int ret; > + > + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); > + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); > + if (!pages) > + return -ENOMEM; > + ret = btrfs_alloc_page_array(nr_pages, pages, 0); The allocation sizes are derived from disk_io_size that comes from the outside, potentially making large allocatoins. Or is there some inherent limit on the maximu size? > + if (ret) { > + ret = -ENOMEM; > + goto fail; > + } > + > + priv = kmalloc(sizeof(*priv), GFP_NOFS); > + if (!priv) { > + ret = -ENOMEM; > + goto fail; > + } > + > + priv->iocb = *iocb; > + priv->iov = iov; > + priv->iter = *iter; > + priv->count = count; > + priv->cmd = cmd; > + priv->cached_state = cached_state; > + priv->compressed = compressed; > + priv->nr_pages = nr_pages; > + priv->pages = pages; > + priv->start = start; > + priv->lockend = lockend; > + > + ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, > + disk_io_size, pages, > + btrfs_uring_read_extent_cb, > + priv); > + if (ret) > + goto fail; > + > + return -EIOCBQUEUED; > + > +fail: > + unlock_extent(io_tree, start, lockend, &cached_state); > + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); > + kfree(priv); Does this leak pages and priv->pages? 
> + return ret; > +} > + > +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, > + unsigned int issue_flags) > +{ > + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, > + flags); > + size_t copy_end; > + struct btrfs_ioctl_encoded_io_args args = {0}; = { 0 } > + int ret; > + u64 disk_bytenr, disk_io_size; > + struct file *file = cmd->file; > + struct btrfs_inode *inode = BTRFS_I(file->f_inode); > + struct btrfs_fs_info *fs_info = inode->root->fs_info; > + struct extent_io_tree *io_tree = &inode->io_tree; > + struct iovec iovstack[UIO_FASTIOV]; > + struct iovec *iov = iovstack; > + struct iov_iter iter; > + loff_t pos; > + struct kiocb kiocb; > + struct extent_state *cached_state = NULL; > + u64 start, lockend; The stack consumption looks quite high. > + > + if (!capable(CAP_SYS_ADMIN)) { > + ret = -EPERM; > + goto out_acct; > + } > + > + if (issue_flags & IO_URING_F_COMPAT) { > +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) > + struct btrfs_ioctl_encoded_io_args_32 args32; > + > + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, > + flags); > + if (copy_from_user(&args32, (const void *)cmd->sqe->addr, > + copy_end)) { > + ret = -EFAULT; > + goto out_acct; > + } > + args.iov = compat_ptr(args32.iov); > + args.iovcnt = args32.iovcnt; > + args.offset = args32.offset; > + args.flags = args32.flags; > +#else > + return -ENOTTY; > +#endif > + } else { > + copy_end = copy_end_kernel; > + if (copy_from_user(&args, (const void *)cmd->sqe->addr, > + copy_end)) { > + ret = -EFAULT; > + goto out_acct; > + } > + } > + > + if (args.flags != 0) > + return -EINVAL; > + > + ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), > + &iov, &iter); > + if (ret < 0) > + goto out_acct; > + > + if (iov_iter_count(&iter) == 0) { > + ret = 0; > + goto out_free; > + } > + > + pos = args.offset; > + ret = rw_verify_area(READ, file, &pos, args.len); > + if (ret < 0) > + goto out_free; > + > + init_sync_kiocb(&kiocb, 
file); > + kiocb.ki_pos = pos; > + > + start = ALIGN_DOWN(pos, fs_info->sectorsize); > + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; > + > + ret = btrfs_encoded_read(&kiocb, &iter, &args, > + issue_flags & IO_URING_F_NONBLOCK, > + &cached_state, &disk_bytenr, &disk_io_size); > + if (ret < 0 && ret != -EIOCBQUEUED) > + goto out_free; > + > + file_accessed(file); > + > + if (copy_to_user((void*)(uintptr_t)cmd->sqe->addr + copy_end, > + (char *)&args + copy_end_kernel, So many type casts again > + sizeof(args) - copy_end_kernel)) { > + if (ret == -EIOCBQUEUED) { > + unlock_extent(io_tree, start, lockend, &cached_state); > + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); > + } > + ret = -EFAULT; > + goto out_free; > + } > + > + if (ret == -EIOCBQUEUED) { > + u64 count; > + > + /* If we've optimized things by storing the iovecs on the stack, > + * undo this. */ This is not proper comment formatting. > + if (!iov) { > + iov = kmalloc(sizeof(struct iovec) * args.iovcnt, > + GFP_NOFS); > + if (!iov) { > + unlock_extent(io_tree, start, lockend, > + &cached_state); > + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); > + ret = -ENOMEM; > + goto out_acct; > + } > + > + memcpy(iov, iovstack, > + sizeof(struct iovec) * args.iovcnt); > + } > + > + count = min_t(u64, iov_iter_count(&iter), disk_io_size); > + > + ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, > + cached_state, disk_bytenr, > + disk_io_size, count, > + args.compression, iov, cmd); > + > + goto out_acct; > + } > + > +out_free: > + kfree(iov); > + > +out_acct: > + if (ret > 0) > + add_rchar(current, ret); > + inc_syscr(current); > + > + return ret; > +} > + > +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) > +{ > + switch (cmd->cmd_op) { > + case BTRFS_IOC_ENCODED_READ: > +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) > + case BTRFS_IOC_ENCODED_READ_32: > +#endif > + return btrfs_uring_encoded_read(cmd, issue_flags); > + } > + > + return -EINVAL; > +} > + > long 
btrfs_ioctl(struct file *file, unsigned int > cmd, unsigned long arg) > {
On 10/21/24 14:50, David Sterba wrote: > On Mon, Oct 14, 2024 at 06:18:27PM +0100, Mark Harmstone wrote: >> Adds an io_uring command for encoded reads, using the same interface as > > Add ... > >> the existing BTRFS_IOC_ENCODED_READ ioctl. > > This is probably a good summary in a changelog but the patch is quite > long so it feels like this should be described in a more detail how it's > done. Connecting two interfaces can be done in various ways, so at least > mention that it's a simple pass through, or if there are any > complications regardign locking, object lifetime and such. > >> Signed-off-by: Mark Harmstone <maharmstone@fb.com> ... >> + >> + kfree(priv->pages); >> + kfree(priv->iov); >> + kfree(priv); >> +} >> + >> +static void btrfs_uring_read_extent_cb(void *ctx, int err) >> +{ >> + struct btrfs_uring_priv *priv = ctx; >> + >> + *(uintptr_t*)priv->cmd->pdu = (uintptr_t)priv; > > Isn't there a helper for that? Type casting should be done in justified > cases and as an exception. FWIW, I haven't taken a look yet, but for this one, please use io_uring_cmd_to_pdu(cmd, type), it'll check the size for you. And there in no need to cast it to uintptr, would be much nicer to struct btrfs_cmd { struct btrfs_uring_priv *priv; }; struct btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct btrfs_cmd); bc->priv = priv; You get more type checking this way. You can also wrap around io_uring_cmd_to_pdu() into a static inline helper, if that looks better. 
...>> + >> + start = ALIGN_DOWN(pos, fs_info->sectorsize); >> + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; >> + >> + ret = btrfs_encoded_read(&kiocb, &iter, &args, >> + issue_flags & IO_URING_F_NONBLOCK, >> + &cached_state, &disk_bytenr, &disk_io_size); >> + if (ret < 0 && ret != -EIOCBQUEUED) >> + goto out_free; >> + >> + file_accessed(file); >> + >> + if (copy_to_user((void*)(uintptr_t)cmd->sqe->addr + copy_end, >> + (char *)&args + copy_end_kernel, Be aware, SQE data is not stable, you should assume that the user space can change it at any moment. It should be a READ_ONCE, and likely you don't want it to be read twice, unless you handle it / verify values / etc. I'd recommend to save it early in the callback and stash somewhere, e.g. into struct btrfs_cmd I mentioned above. > > So many type casts again >
Thanks David. On 21/10/24 14:50, David Sterba wrote: >> +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, >> + u64 start, u64 lockend, >> + struct extent_state *cached_state, >> + u64 disk_bytenr, u64 disk_io_size, >> + size_t count, bool compressed, >> + struct iovec *iov, >> + struct io_uring_cmd *cmd) >> +{ >> + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); >> + struct extent_io_tree *io_tree = &inode->io_tree; >> + struct page **pages; >> + struct btrfs_uring_priv *priv = NULL; >> + unsigned long nr_pages; >> + int ret; >> + >> + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); >> + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); >> + if (!pages) >> + return -ENOMEM; >> + ret = btrfs_alloc_page_array(nr_pages, pages, 0); > > The allocation sizes are derived from disk_io_size that comes from the > outside, potentially making large allocatoins. Or is there some inherent > limit on the maximu size? Yes. It comes from btrfs_encoded_read, where it's limited to BTRFS_MAX_UNCOMPRESSED (i.e. 128KB). >> + if (ret) { >> + ret = -ENOMEM; >> + goto fail; >> + } >> + >> + priv = kmalloc(sizeof(*priv), GFP_NOFS); >> + if (!priv) { >> + ret = -ENOMEM; >> + goto fail; >> + } >> + >> + priv->iocb = *iocb; >> + priv->iov = iov; >> + priv->iter = *iter; >> + priv->count = count; >> + priv->cmd = cmd; >> + priv->cached_state = cached_state; >> + priv->compressed = compressed; >> + priv->nr_pages = nr_pages; >> + priv->pages = pages; >> + priv->start = start; >> + priv->lockend = lockend; >> + >> + ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, >> + disk_io_size, pages, >> + btrfs_uring_read_extent_cb, >> + priv); >> + if (ret) >> + goto fail; >> + >> + return -EIOCBQUEUED; >> + >> +fail: >> + unlock_extent(io_tree, start, lockend, &cached_state); >> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); >> + kfree(priv); > > Does this leak pages and priv->pages? 
No, they get freed in btrfs_uring_read_finished. >> + return ret; >> +} >> + >> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, >> + unsigned int issue_flags) >> +{ >> + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, >> + flags); >> + size_t copy_end; >> + struct btrfs_ioctl_encoded_io_args args = {0}; > = { 0 } >> + int ret; >> + u64 disk_bytenr, disk_io_size; >> + struct file *file = cmd->file; >> + struct btrfs_inode *inode = BTRFS_I(file->f_inode); >> + struct btrfs_fs_info *fs_info = inode->root->fs_info; >> + struct extent_io_tree *io_tree = &inode->io_tree; >> + struct iovec iovstack[UIO_FASTIOV]; >> + struct iovec *iov = iovstack; >> + struct iov_iter iter; >> + loff_t pos; >> + struct kiocb kiocb; >> + struct extent_state *cached_state = NULL; >> + u64 start, lockend; > > The stack consumption looks quite high. 696 bytes, compared to 672 in btrfs_ioctl_encoded_read. btrfs_ioctl_encoded write is pretty big too. Probably the easiest thing here would be to allocate btrfs_uring_priv early and pass that around, I think. Do you have a recommendation for what the maximum stack size of a function should be? Mark
On Mon, Oct 21, 2024 at 05:05:20PM +0000, Mark Harmstone wrote: > >> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, > >> + unsigned int issue_flags) > >> +{ > >> + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, > >> + flags); > >> + size_t copy_end; > >> + struct btrfs_ioctl_encoded_io_args args = {0}; > > = { 0 } > >> + int ret; > >> + u64 disk_bytenr, disk_io_size; > >> + struct file *file = cmd->file; > >> + struct btrfs_inode *inode = BTRFS_I(file->f_inode); > >> + struct btrfs_fs_info *fs_info = inode->root->fs_info; > >> + struct extent_io_tree *io_tree = &inode->io_tree; > >> + struct iovec iovstack[UIO_FASTIOV]; > >> + struct iovec *iov = iovstack; > >> + struct iov_iter iter; > >> + loff_t pos; > >> + struct kiocb kiocb; > >> + struct extent_state *cached_state = NULL; > >> + u64 start, lockend; > > > > The stack consumption looks quite high. > > 696 bytes, compared to 672 in btrfs_ioctl_encoded_read. > btrfs_ioctl_encoded write is pretty big too. Probably the easiest thing > here would be to allocate btrfs_uring_priv early and pass that around, I > think. > > Do you have a recommendation for what the maximum stack size of a > function should be? It depends from where the function is called. For ioctl callbacks, like btrfs_ioctl_encoded_read it's the first function using kernel stack leaving enough for any deep IO stacks (DM/NFS/iSCSI/...). If something similar applies to the io_uring callbacks then it's probably fine. Using a separate off-stack structure works but it's a penalty as it needs the allocation. The io_uring is meant for high performance so if the on-stack allocation is safe then keep it like that. 
I've checked on a release config the stack consumption and the encoded ioctl functions are not the worst: tree-log.c:btrfs_sync_log 728 static scrub.c:scrub_verify_one_metadata 552 dynamic,bounded inode.c:print_data_reloc_error 544 dynamic,bounded uuid-tree.c:btrfs_uuid_scan_kthread 520 static tree-checker.c:check_root_item 504 static file-item.c:btrfs_csum_one_bio 496 static inode.c:btrfs_start_delalloc_roots 488 static scrub.c:scrub_raid56_parity_stripe 464 dynamic,bounded disk-io.c:write_dev_supers 464 static ioctl.c:btrfs_ioctl_encoded_write 456 dynamic,bounded ioctl.c:btrfs_ioctl_encoded_read 456 dynamic,bounded
On 21/10/24 19:23, David Sterba wrote: > > > On Mon, Oct 21, 2024 at 05:05:20PM +0000, Mark Harmstone wrote: >>>> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, >>>> + unsigned int issue_flags) >>>> +{ >>>> + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, >>>> + flags); >>>> + size_t copy_end; >>>> + struct btrfs_ioctl_encoded_io_args args = {0}; >>> = { 0 } >>>> + int ret; >>>> + u64 disk_bytenr, disk_io_size; >>>> + struct file *file = cmd->file; >>>> + struct btrfs_inode *inode = BTRFS_I(file->f_inode); >>>> + struct btrfs_fs_info *fs_info = inode->root->fs_info; >>>> + struct extent_io_tree *io_tree = &inode->io_tree; >>>> + struct iovec iovstack[UIO_FASTIOV]; >>>> + struct iovec *iov = iovstack; >>>> + struct iov_iter iter; >>>> + loff_t pos; >>>> + struct kiocb kiocb; >>>> + struct extent_state *cached_state = NULL; >>>> + u64 start, lockend; >>> >>> The stack consumption looks quite high. >> >> 696 bytes, compared to 672 in btrfs_ioctl_encoded_read. >> btrfs_ioctl_encoded write is pretty big too. Probably the easiest thing >> here would be to allocate btrfs_uring_priv early and pass that around, I >> think. >> >> Do you have a recommendation for what the maximum stack size of a >> function should be? > > It depends from where the function is called. For ioctl callbacks, like > btrfs_ioctl_encoded_read it's the first function using kernel stack > leaving enough for any deep IO stacks (DM/NFS/iSCSI/...). If something > similar applies to the io_uring callbacks then it's probably fine. Thanks. Yes, the two should functions should be broadly equivalent. > Using a separate off-stack structure works but it's a penalty as it > needs the allcation. The io_uring is meant for high performance so if > the on-stack allocation is safe then keep it like that. Okay, I'll leave this bit as it is, then. I can revisit it if we start getting a spike of stack overflow crashes mentioning btrfs_uring_encoded_read. 
> > I've checked on a release config the stack consumption and the encoded > ioctl functions are not the worst: > > tree-log.c:btrfs_sync_log 728 static > scrub.c:scrub_verify_one_metadata 552 dynamic,bounded > inode.c:print_data_reloc_error 544 dynamic,bounded > uuid-tree.c:btrfs_uuid_scan_kthread 520 static > tree-checker.c:check_root_item 504 static > file-item.c:btrfs_csum_one_bio 496 static > inode.c:btrfs_start_delalloc_roots 488 static > scrub.c:scrub_raid56_parity_stripe 464 dynamic,bounded > disk-io.c:write_dev_supers 464 static > ioctl.c:btrfs_ioctl_encoded_write 456 dynamic,bounded > ioctl.c:btrfs_ioctl_encoded_read 456 dynamic,bounded
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2aeb8116549c..e33ca73fef8c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3774,6 +3774,7 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_compat_ioctl, #endif .remap_file_range = btrfs_remap_file_range, + .uring_cmd = btrfs_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8c9ff4898ab0..c0393575cf5e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -29,6 +29,7 @@ #include <linux/fileattr.h> #include <linux/fsverity.h> #include <linux/sched/xacct.h> +#include <linux/io_uring/cmd.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -4723,6 +4724,288 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool return ret; } +struct btrfs_uring_priv { + struct io_uring_cmd *cmd; + struct page **pages; + unsigned long nr_pages; + struct kiocb iocb; + struct iovec *iov; + struct iov_iter iter; + struct extent_state *cached_state; + u64 count; + bool compressed; + u64 start; + u64 lockend; +}; + +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct btrfs_uring_priv *priv = (struct btrfs_uring_priv *)*(uintptr_t*)cmd->pdu; + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + unsigned long i; + u64 cur; + size_t page_offset; + ssize_t ret; + + if (priv->compressed) { + i = 0; + page_offset = 0; + } else { + i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; + page_offset = (priv->iocb.ki_pos - priv->start) & (PAGE_SIZE - 1); + } + cur = 0; + while (cur < priv->count) { + size_t bytes = min_t(size_t, priv->count - cur, + PAGE_SIZE - page_offset); + + if (copy_page_to_iter(priv->pages[i], page_offset, bytes, + &priv->iter) != bytes) { + ret = -EFAULT; + goto out; + } + + i++; + cur += bytes; + page_offset = 0; + } + ret = priv->count; + +out: + 
unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); + add_rchar(current, ret); + + for (unsigned long i = 0; i < priv->nr_pages; i++) { + __free_page(priv->pages[i]); + } + + kfree(priv->pages); + kfree(priv->iov); + kfree(priv); +} + +static void btrfs_uring_read_extent_cb(void *ctx, int err) +{ + struct btrfs_uring_priv *priv = ctx; + + *(uintptr_t*)priv->cmd->pdu = (uintptr_t)priv; + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); +} + +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state *cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, + struct iovec *iov, + struct io_uring_cmd *cmd) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + struct page **pages; + struct btrfs_uring_priv *priv = NULL; + unsigned long nr_pages; + int ret; + + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + ret = btrfs_alloc_page_array(nr_pages, pages, 0); + if (ret) { + ret = -ENOMEM; + goto fail; + } + + priv = kmalloc(sizeof(*priv), GFP_NOFS); + if (!priv) { + ret = -ENOMEM; + goto fail; + } + + priv->iocb = *iocb; + priv->iov = iov; + priv->iter = *iter; + priv->count = count; + priv->cmd = cmd; + priv->cached_state = cached_state; + priv->compressed = compressed; + priv->nr_pages = nr_pages; + priv->pages = pages; + priv->start = start; + priv->lockend = lockend; + + ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, + disk_io_size, pages, + btrfs_uring_read_extent_cb, + priv); + if (ret) + goto fail; + + return -EIOCBQUEUED; + +fail: + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + 
kfree(priv); + return ret; +} + +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, + flags); + size_t copy_end; + struct btrfs_ioctl_encoded_io_args args = {0}; + int ret; + u64 disk_bytenr, disk_io_size; + struct file *file = cmd->file; + struct btrfs_inode *inode = BTRFS_I(file->f_inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + struct extent_state *cached_state = NULL; + u64 start, lockend; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, + flags); + if (copy_from_user(&args32, (const void *)cmd->sqe->addr, + copy_end)) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + if (copy_from_user(&args, (const void *)cmd->sqe->addr, + copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (args.flags != 0) + return -EINVAL; + + ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_free; + } + + pos = args.offset; + ret = rw_verify_area(READ, file, &pos, args.len); + if (ret < 0) + goto out_free; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + start = ALIGN_DOWN(pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + ret = btrfs_encoded_read(&kiocb, &iter, &args, + 
issue_flags & IO_URING_F_NONBLOCK, + &cached_state, &disk_bytenr, &disk_io_size); + if (ret < 0 && ret != -EIOCBQUEUED) + goto out_free; + + file_accessed(file); + + if (copy_to_user((void*)(uintptr_t)cmd->sqe->addr + copy_end, + (char *)&args + copy_end_kernel, + sizeof(args) - copy_end_kernel)) { + if (ret == -EIOCBQUEUED) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + ret = -EFAULT; + goto out_free; + } + + if (ret == -EIOCBQUEUED) { + u64 count; + + /* If we've optimized things by storing the iovecs on the stack, + * undo this. */ + if (!iov) { + iov = kmalloc(sizeof(struct iovec) * args.iovcnt, + GFP_NOFS); + if (!iov) { + unlock_extent(io_tree, start, lockend, + &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + ret = -ENOMEM; + goto out_acct; + } + + memcpy(iov, iovstack, + sizeof(struct iovec) * args.iovcnt); + } + + count = min_t(u64, iov_iter_count(&iter), disk_io_size); + + ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, + cached_state, disk_bytenr, + disk_io_size, count, + args.compression, iov, cmd); + + goto out_acct; + } + +out_free: + kfree(iov); + +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + + return ret; +} + +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + switch (cmd->cmd_op) { + case BTRFS_IOC_ENCODED_READ: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: +#endif + return btrfs_uring_encoded_read(cmd, issue_flags); + } + + return -EINVAL; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 19cd26b0244a..288f4f5c4409 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -22,5 +22,6 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(const u8 *uuid); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct 
btrfs_ioctl_balance_args *bargs); +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); #endif
Add an io_uring command for encoded reads, using the same interface as the existing BTRFS_IOC_ENCODED_READ ioctl. Signed-off-by: Mark Harmstone <maharmstone@fb.com> --- fs/btrfs/file.c | 1 + fs/btrfs/ioctl.c | 283 +++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.h | 1 + 3 files changed, 285 insertions(+)