diff mbox series

[5/5] btrfs: add io_uring command for encoded reads

Message ID 20241022145024.1046883-6-maharmstone@fb.com (mailing list archive)
State New
Headers show
Series btrfs: io_uring interface for encoded reads | expand

Commit Message

Mark Harmstone Oct. 22, 2024, 2:50 p.m. UTC
Adds an io_uring command for encoded reads, using the same interface as
the existing BTRFS_IOC_ENCODED_READ ioctl.

btrfs_uring_encoded_read is an io_uring version of
btrfs_ioctl_encoded_read, which validates the user input and calls
btrfs_encoded_read to read the appropriate metadata. If we determine
that we need to read an extent from disk, we call
btrfs_encoded_read_regular_fill_pages through btrfs_uring_read_extent to
prepare the bio.

The existing btrfs_encoded_read_regular_fill_pages is changed so that if
it is passed a uring_ctx value, rather than waking up any waiting
threads it calls btrfs_uring_read_extent_endio. This in turn copies the
read data back to userspace, and calls io_uring_cmd_done to complete the
io_uring command.

Because we're potentially doing a non-blocking read,
btrfs_uring_read_extent doesn't clean up after itself if it returns
-EIOCBQUEUED. Instead, it allocates a priv struct, populates the fields
there that we will need to unlock the inode and free our allocations,
and defers this to the btrfs_uring_read_finished that gets called when
the bio completes.

Signed-off-by: Mark Harmstone <maharmstone@fb.com>
---
 fs/btrfs/btrfs_inode.h |   3 +-
 fs/btrfs/file.c        |   1 +
 fs/btrfs/inode.c       |  43 ++++--
 fs/btrfs/ioctl.c       | 309 +++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ioctl.h       |   2 +
 fs/btrfs/send.c        |   3 +-
 6 files changed, 349 insertions(+), 12 deletions(-)

Comments

David Sterba Oct. 29, 2024, 9:51 p.m. UTC | #1
On Tue, Oct 22, 2024 at 03:50:20PM +0100, Mark Harmstone wrote:
> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
> +				    unsigned int issue_flags)
> +{
> +	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
> +					     flags);
> +	size_t copy_end;
> +	struct btrfs_ioctl_encoded_io_args args = { 0 };
> +	int ret;
> +	u64 disk_bytenr, disk_io_size;
> +	struct file *file = cmd->file;
> +	struct btrfs_inode *inode = BTRFS_I(file->f_inode);
> +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> +	struct extent_io_tree *io_tree = &inode->io_tree;
> +	struct iovec iovstack[UIO_FASTIOV];
> +	struct iovec *iov = iovstack;
> +	struct iov_iter iter;
> +	loff_t pos;
> +	struct kiocb kiocb;
> +	struct extent_state *cached_state = NULL;
> +	u64 start, lockend;
> +	void __user *sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
> +
> +	if (!capable(CAP_SYS_ADMIN)) {
> +		ret = -EPERM;
> +		goto out_acct;
> +	}

Access level check must be done first before any data are read, in this
case cmd->file and sqe_addr. Fixed.
Pavel Begunkov Oct. 30, 2024, 12:59 a.m. UTC | #2
On 10/22/24 15:50, Mark Harmstone wrote:
...
> +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd,
> +				      unsigned int issue_flags)
> +{
> +	struct btrfs_uring_priv *priv =
> +		*io_uring_cmd_to_pdu(cmd, struct btrfs_uring_priv *);
> +	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
> +	struct extent_io_tree *io_tree = &inode->io_tree;
> +	unsigned long i;
> +	u64 cur;
> +	size_t page_offset;
> +	ssize_t ret;
> +
> +	if (priv->err) {
> +		ret = priv->err;
> +		goto out;
> +	}
> +
> +	if (priv->compressed) {
> +		i = 0;
> +		page_offset = 0;
> +	} else {
> +		i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
> +		page_offset = offset_in_page(priv->iocb.ki_pos - priv->start);
> +	}
> +	cur = 0;
> +	while (cur < priv->count) {
> +		size_t bytes = min_t(size_t, priv->count - cur,
> +				     PAGE_SIZE - page_offset);
> +
> +		if (copy_page_to_iter(priv->pages[i], page_offset, bytes,
> +				      &priv->iter) != bytes) {

If that's an iovec backed iter that might fail, you'd need to
steal this patch

https://lore.kernel.org/all/20241016-fuse-uring-for-6-10-rfc4-v4-12-9739c753666e@ddn.com/

and fail if "issue_flags & IO_URING_F_TASK_DEAD", see

https://lore.kernel.org/all/20241016-fuse-uring-for-6-10-rfc4-v4-13-9739c753666e@ddn.com/


> +			ret = -EFAULT;
> +			goto out;
> +		}
> +
> +		i++;
> +		cur += bytes;
> +		page_offset = 0;
> +	}
> +	ret = priv->count;
> +
> +out:
> +	unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
> +	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);

When called via io_uring_cmd_complete_in_task() this function might
not get run in any reasonable amount of time. Even worse, a
misbehaving user can block it until the task dies.

I don't remember if rwsem allows unlock being executed in a different
task than the pairing lock, but blocking it for that long could be a
problem. I might not remember it right but I think Boris meantioned
that the O_DIRECT path drops the inode lock right after submission
without waiting for bios to complete. Is that right? Can we do it
here as well?

> +
> +	io_uring_cmd_done(cmd, ret, 0, issue_flags);
> +	add_rchar(current, ret);
> +
> +	for (unsigned long index = 0; index < priv->nr_pages; index++)
> +		__free_page(priv->pages[index]);
> +
> +	kfree(priv->pages);
> +	kfree(priv->iov);
> +	kfree(priv);
> +}
> +
> +void btrfs_uring_read_extent_endio(void *ctx, int err)
> +{
> +	struct btrfs_uring_priv *priv = ctx;
> +
> +	priv->err = err;
> +
> +	*io_uring_cmd_to_pdu(priv->cmd, struct btrfs_uring_priv *) = priv;

a nit, I'd suggest to create a temp var, should be easier to
read. It'd even be nicer if you wrap it into a structure
as suggested last time.

struct io_btrfs_cmd {
	struct btrfs_uring_priv *priv;
};

struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
bc->priv = priv;

> +	io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
> +}
> +
> +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
> +				   u64 start, u64 lockend,
> +				   struct extent_state *cached_state,
> +				   u64 disk_bytenr, u64 disk_io_size,
> +				   size_t count, bool compressed,
> +				   struct iovec *iov,
> +				   struct io_uring_cmd *cmd)
> +{
> +	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
> +	struct extent_io_tree *io_tree = &inode->io_tree;
> +	struct page **pages;
> +	struct btrfs_uring_priv *priv = NULL;
> +	unsigned long nr_pages;
> +	int ret;
> +
> +	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
> +	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> +	if (!pages)
> +		return -ENOMEM;
> +	ret = btrfs_alloc_page_array(nr_pages, pages, 0);
> +	if (ret) {
> +		ret = -ENOMEM;
> +		goto fail;
> +	}
> +
> +	priv = kmalloc(sizeof(*priv), GFP_NOFS);
> +	if (!priv) {
> +		ret = -ENOMEM;
> +		goto fail;
> +	}
> +
> +	priv->iocb = *iocb;
> +	priv->iov = iov;
> +	priv->iter = *iter;
> +	priv->count = count;
> +	priv->cmd = cmd;
> +	priv->cached_state = cached_state;
> +	priv->compressed = compressed;
> +	priv->nr_pages = nr_pages;
> +	priv->pages = pages;
> +	priv->start = start;
> +	priv->lockend = lockend;
> +	priv->err = 0;
> +
> +	ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
> +						    disk_io_size, pages,
> +						    priv);
> +	if (ret && ret != -EIOCBQUEUED)
> +		goto fail;

Turning both into return EIOCBQUEUED is a bit suspicious, but
I lack context to say. Might make sense to return ret and let
the caller handle it.

> +
> +	/*
> +	 * If we return -EIOCBQUEUED, we're deferring the cleanup to
> +	 * btrfs_uring_read_finished, which will handle unlocking the extent
> +	 * and inode and freeing the allocations.
> +	 */
> +
> +	return -EIOCBQUEUED;
> +
> +fail:
> +	unlock_extent(io_tree, start, lockend, &cached_state);
> +	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> +	kfree(priv);
> +	return ret;
> +}
> +
> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
> +				    unsigned int issue_flags)
> +{
> +	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
> +					     flags);
> +	size_t copy_end;
> +	struct btrfs_ioctl_encoded_io_args args = { 0 };
> +	int ret;
> +	u64 disk_bytenr, disk_io_size;
> +	struct file *file = cmd->file;
> +	struct btrfs_inode *inode = BTRFS_I(file->f_inode);
> +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> +	struct extent_io_tree *io_tree = &inode->io_tree;
> +	struct iovec iovstack[UIO_FASTIOV];
> +	struct iovec *iov = iovstack;
> +	struct iov_iter iter;
> +	loff_t pos;
> +	struct kiocb kiocb;
> +	struct extent_state *cached_state = NULL;
> +	u64 start, lockend;
> +	void __user *sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));

Let's rename it, I was taken aback for a brief second why
you're copy_from_user() from an SQE / the ring, which turns
out to be a user pointer to a btrfs structure.

...
> +	ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
> +				 &disk_bytenr, &disk_io_size);
> +	if (ret < 0 && ret != -EIOCBQUEUED)
> +		goto out_free;
> +
> +	file_accessed(file);
> +
> +	if (copy_to_user(sqe_addr + copy_end, (char *)&args + copy_end_kernel,
> +			 sizeof(args) - copy_end_kernel)) {
> +		if (ret == -EIOCBQUEUED) {
> +			unlock_extent(io_tree, start, lockend, &cached_state);
> +			btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> +		}> +		ret = -EFAULT;
> +		goto out_free;

It seems we're saving iov in the priv structure, who can access the iovec
after the request is submitted? -EIOCBQUEUED in general means that the
request is submitted and will get completed async, e.g. via callback, and
if the bio callback can use the iov maybe via the iter, this goto will be
a use after free.

Also, you're returning -EFAULT back to io_uring, which will kill the
io_uring request / cmd while there might still be in flight bios that
can try to access it.

Can you inject errors into the copy and test please?

> +	}
> +
> +	if (ret == -EIOCBQUEUED) {
> +		u64 count;
> +
> +		/*
> +		 * If we've optimized things by storing the iovecs on the stack,
> +		 * undo this.
> +		 */> +		if (!iov) {
> +			iov = kmalloc(sizeof(struct iovec) * args.iovcnt,
> +				      GFP_NOFS);
> +			if (!iov) {
> +				unlock_extent(io_tree, start, lockend,
> +					      &cached_state);
> +				btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> +				ret = -ENOMEM;
> +				goto out_acct;
> +			}
> +
> +			memcpy(iov, iovstack,
> +			       sizeof(struct iovec) * args.iovcnt);
> +		}
> +
> +		count = min_t(u64, iov_iter_count(&iter), disk_io_size);
> +
> +		/* Match ioctl by not returning past EOF if uncompressed */
> +		if (!args.compression)
> +			count = min_t(u64, count, args.len);
> +
> +		ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
> +					      cached_state, disk_bytenr,
> +					      disk_io_size, count,
> +					      args.compression, iov, cmd);
> +
> +		goto out_acct;
> +	}
> +
> +out_free:
> +	kfree(iov);
> +
> +out_acct:
> +	if (ret > 0)
> +		add_rchar(current, ret);
> +	inc_syscr(current);
> +
> +	return ret;
> +}
David Sterba Oct. 30, 2024, 1:24 a.m. UTC | #3
On Wed, Oct 30, 2024 at 12:59:33AM +0000, Pavel Begunkov wrote:
> On 10/22/24 15:50, Mark Harmstone wrote:
> ...
> > +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd,
> > +				      unsigned int issue_flags)
> > +{
> > +	struct btrfs_uring_priv *priv =
> > +		*io_uring_cmd_to_pdu(cmd, struct btrfs_uring_priv *);
> > +	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
> > +	struct extent_io_tree *io_tree = &inode->io_tree;
> > +	unsigned long i;
> > +	u64 cur;
> > +	size_t page_offset;
> > +	ssize_t ret;
> > +
> > +	if (priv->err) {
> > +		ret = priv->err;
> > +		goto out;
> > +	}
> > +
> > +	if (priv->compressed) {
> > +		i = 0;
> > +		page_offset = 0;
> > +	} else {
> > +		i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
> > +		page_offset = offset_in_page(priv->iocb.ki_pos - priv->start);
> > +	}
> > +	cur = 0;
> > +	while (cur < priv->count) {
> > +		size_t bytes = min_t(size_t, priv->count - cur,
> > +				     PAGE_SIZE - page_offset);
> > +
> > +		if (copy_page_to_iter(priv->pages[i], page_offset, bytes,
> > +				      &priv->iter) != bytes) {
> 
> If that's an iovec backed iter that might fail, you'd need to
> steal this patch
> 
> https://lore.kernel.org/all/20241016-fuse-uring-for-6-10-rfc4-v4-12-9739c753666e@ddn.com/
> 
> and fail if "issue_flags & IO_URING_F_TASK_DEAD", see
> 
> https://lore.kernel.org/all/20241016-fuse-uring-for-6-10-rfc4-v4-13-9739c753666e@ddn.com/
> 
> 
> > +			ret = -EFAULT;
> > +			goto out;
> > +		}
> > +
> > +		i++;
> > +		cur += bytes;
> > +		page_offset = 0;
> > +	}
> > +	ret = priv->count;
> > +
> > +out:
> > +	unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
> > +	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> 
> When called via io_uring_cmd_complete_in_task() this function might
> not get run in any reasonable amount of time. Even worse, a
> misbehaving user can block it until the task dies.
> 
> I don't remember if rwsem allows unlock being executed in a different
> task than the pairing lock, but blocking it for that long could be a
> problem. I might not remember it right but I think Boris meantioned
> that the O_DIRECT path drops the inode lock right after submission
> without waiting for bios to complete. Is that right? Can we do it
> here as well?
> 
> > +
> > +	io_uring_cmd_done(cmd, ret, 0, issue_flags);
> > +	add_rchar(current, ret);
> > +
> > +	for (unsigned long index = 0; index < priv->nr_pages; index++)
> > +		__free_page(priv->pages[index]);
> > +
> > +	kfree(priv->pages);
> > +	kfree(priv->iov);
> > +	kfree(priv);
> > +}
> > +
> > +void btrfs_uring_read_extent_endio(void *ctx, int err)
> > +{
> > +	struct btrfs_uring_priv *priv = ctx;
> > +
> > +	priv->err = err;
> > +
> > +	*io_uring_cmd_to_pdu(priv->cmd, struct btrfs_uring_priv *) = priv;
> 
> a nit, I'd suggest to create a temp var, should be easier to
> read. It'd even be nicer if you wrap it into a structure
> as suggested last time.
> 
> struct io_btrfs_cmd {
> 	struct btrfs_uring_priv *priv;
> };
> 
> struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
> bc->priv = priv;
> 
> > +	io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
> > +}
> > +
> > +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
> > +				   u64 start, u64 lockend,
> > +				   struct extent_state *cached_state,
> > +				   u64 disk_bytenr, u64 disk_io_size,
> > +				   size_t count, bool compressed,
> > +				   struct iovec *iov,
> > +				   struct io_uring_cmd *cmd)
> > +{
> > +	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
> > +	struct extent_io_tree *io_tree = &inode->io_tree;
> > +	struct page **pages;
> > +	struct btrfs_uring_priv *priv = NULL;
> > +	unsigned long nr_pages;
> > +	int ret;
> > +
> > +	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
> > +	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> > +	if (!pages)
> > +		return -ENOMEM;
> > +	ret = btrfs_alloc_page_array(nr_pages, pages, 0);
> > +	if (ret) {
> > +		ret = -ENOMEM;
> > +		goto fail;
> > +	}
> > +
> > +	priv = kmalloc(sizeof(*priv), GFP_NOFS);
> > +	if (!priv) {
> > +		ret = -ENOMEM;
> > +		goto fail;
> > +	}
> > +
> > +	priv->iocb = *iocb;
> > +	priv->iov = iov;
> > +	priv->iter = *iter;
> > +	priv->count = count;
> > +	priv->cmd = cmd;
> > +	priv->cached_state = cached_state;
> > +	priv->compressed = compressed;
> > +	priv->nr_pages = nr_pages;
> > +	priv->pages = pages;
> > +	priv->start = start;
> > +	priv->lockend = lockend;
> > +	priv->err = 0;
> > +
> > +	ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
> > +						    disk_io_size, pages,
> > +						    priv);
> > +	if (ret && ret != -EIOCBQUEUED)
> > +		goto fail;
> 
> Turning both into return EIOCBQUEUED is a bit suspicious, but
> I lack context to say. Might make sense to return ret and let
> the caller handle it.
> 
> > +
> > +	/*
> > +	 * If we return -EIOCBQUEUED, we're deferring the cleanup to
> > +	 * btrfs_uring_read_finished, which will handle unlocking the extent
> > +	 * and inode and freeing the allocations.
> > +	 */
> > +
> > +	return -EIOCBQUEUED;
> > +
> > +fail:
> > +	unlock_extent(io_tree, start, lockend, &cached_state);
> > +	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> > +	kfree(priv);
> > +	return ret;
> > +}
> > +
> > +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
> > +				    unsigned int issue_flags)
> > +{
> > +	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
> > +					     flags);
> > +	size_t copy_end;
> > +	struct btrfs_ioctl_encoded_io_args args = { 0 };
> > +	int ret;
> > +	u64 disk_bytenr, disk_io_size;
> > +	struct file *file = cmd->file;
> > +	struct btrfs_inode *inode = BTRFS_I(file->f_inode);
> > +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> > +	struct extent_io_tree *io_tree = &inode->io_tree;
> > +	struct iovec iovstack[UIO_FASTIOV];
> > +	struct iovec *iov = iovstack;
> > +	struct iov_iter iter;
> > +	loff_t pos;
> > +	struct kiocb kiocb;
> > +	struct extent_state *cached_state = NULL;
> > +	u64 start, lockend;
> > +	void __user *sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
> 
> Let's rename it, I was taken aback for a brief second why
> you're copy_from_user() from an SQE / the ring, which turns
> out to be a user pointer to a btrfs structure.
> 
> ...
> > +	ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
> > +				 &disk_bytenr, &disk_io_size);
> > +	if (ret < 0 && ret != -EIOCBQUEUED)
> > +		goto out_free;
> > +
> > +	file_accessed(file);
> > +
> > +	if (copy_to_user(sqe_addr + copy_end, (char *)&args + copy_end_kernel,
> > +			 sizeof(args) - copy_end_kernel)) {
> > +		if (ret == -EIOCBQUEUED) {
> > +			unlock_extent(io_tree, start, lockend, &cached_state);
> > +			btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> > +		}> +		ret = -EFAULT;
> > +		goto out_free;
> 
> It seems we're saving iov in the priv structure, who can access the iovec
> after the request is submitted? -EIOCBQUEUED in general means that the
> request is submitted and will get completed async, e.g. via callback, and
> if the bio callback can use the iov maybe via the iter, this goto will be
> a use after free.
> 
> Also, you're returning -EFAULT back to io_uring, which will kill the
> io_uring request / cmd while there might still be in flight bios that
> can try to access it.
> 
> Can you inject errors into the copy and test please?

Thanks for the comments. I get the impression that there are known
problems on the io_uring side, so until that is resolved the btrfs part
may be insecure or with known runtime bugs, but in the end it does not
need any change. We just need to wait until it's resoved on the
interface level.

The patches you point to are from FUSE trying to wire up io_uring so
this looks like an interface problem. We recently have gained a config
option level gurard for experimental and unstable features so we can add
the code but don't have to expose users to the functionality unless they
konw there are risks or known problems. The io_uring and encoded read
has a performance benefit and I'd like to get the patches in for 6.13
but if there's something serious, one option is not add the code or at
least guard it (behind a config option).

I'm open to both and we have at least one -rc kernel to decide.
Pavel Begunkov Oct. 30, 2024, 2:32 a.m. UTC | #4
On 10/30/24 01:24, David Sterba wrote:
> On Wed, Oct 30, 2024 at 12:59:33AM +0000, Pavel Begunkov wrote:
>> On 10/22/24 15:50, Mark Harmstone wrote:
...
>> It seems we're saving iov in the priv structure, who can access the iovec
>> after the request is submitted? -EIOCBQUEUED in general means that the
>> request is submitted and will get completed async, e.g. via callback, and
>> if the bio callback can use the iov maybe via the iter, this goto will be
>> a use after free.
>>
>> Also, you're returning -EFAULT back to io_uring, which will kill the
>> io_uring request / cmd while there might still be in flight bios that
>> can try to access it.
>>
>> Can you inject errors into the copy and test please?
> 
> Thanks for the comments. I get the impression that there are known
> problems on the io_uring side, so until that is resolved the btrfs part
> may be insecure or with known runtime bugs, but in the end it does not
> need any change. We just need to wait until it's resoved on the
> interface level.

There is nothing wrong with io_uring, it's jumping from synchronous
to asynchronous that concerns me, or more specifically how this series
handles it and all races. Basic stuff like not freeing / changing
without protection memory that the async part might still be using.
That's up to this series to do it right.

> The patches you point to are from FUSE trying to wire up io_uring so
> this looks like an interface problem. We recently have gained a config

That's the easiest part of all, it can only happen when the
task dies and mm becomes unavaliable, sane userspace shouldn't
have problems like that. Mark just needs to include the referred
patch into the series and handle the request as mentioned.

> option level gurard for experimental and unstable features so we can add
> the code but don't have to expose users to the functionality unless they
> konw there are risks or known problems. The io_uring and encoded read
> has a performance benefit and

Good to hear that

> I'd like to get the patches in for 6.13
> but if there's something serious, one option is not add the code or at
> least guard it (behind a config option).

Let's see what Mark replies, I might be missing some things, and
you and other btrfs folks can help to answer the unlock question.

> I'm open to both and we have at least one -rc kernel to decide.
Mark Harmstone Oct. 31, 2024, 5:08 p.m. UTC | #5
Thanks Pavel.

On 30/10/24 00:59, Pavel Begunkov wrote:
> > 
> On 10/22/24 15:50, Mark Harmstone wrote:
> ...
>> +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd,
>> +                      unsigned int issue_flags)
>> +{
>> +    struct btrfs_uring_priv *priv =
>> +        *io_uring_cmd_to_pdu(cmd, struct btrfs_uring_priv *);
>> +    struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
>> +    struct extent_io_tree *io_tree = &inode->io_tree;
>> +    unsigned long i;
>> +    u64 cur;
>> +    size_t page_offset;
>> +    ssize_t ret;
>> +
>> +    if (priv->err) {
>> +        ret = priv->err;
>> +        goto out;
>> +    }
>> +
>> +    if (priv->compressed) {
>> +        i = 0;
>> +        page_offset = 0;
>> +    } else {
>> +        i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
>> +        page_offset = offset_in_page(priv->iocb.ki_pos - priv->start);
>> +    }
>> +    cur = 0;
>> +    while (cur < priv->count) {
>> +        size_t bytes = min_t(size_t, priv->count - cur,
>> +                     PAGE_SIZE - page_offset);
>> +
>> +        if (copy_page_to_iter(priv->pages[i], page_offset, bytes,
>> +                      &priv->iter) != bytes) {
> 
> If that's an iovec backed iter that might fail, you'd need to
> steal this patch
> 
> https://lore.kernel.org/all/20241016-fuse-uring-for-6-10-rfc4-v4-12-9739c753666e@ddn.com/
> 
> and fail if "issue_flags & IO_URING_F_TASK_DEAD", see
> 
> https://lore.kernel.org/all/20241016-fuse-uring-for-6-10-rfc4-v4-13-9739c753666e@ddn.com/

Thanks, I've sent a patchset including your patch. Does it make a 
difference, though? If the task has died, presumably copy_page_to_iter 
can't copy to another process' memory...?

>> +            ret = -EFAULT;
>> +            goto out;
>> +        }
>> +
>> +        i++;
>> +        cur += bytes;
>> +        page_offset = 0;
>> +    }
>> +    ret = priv->count;
>> +
>> +out:
>> +    unlock_extent(io_tree, priv->start, priv->lockend, 
>> &priv->cached_state);
>> +    btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> 
> When called via io_uring_cmd_complete_in_task() this function might
> not get run in any reasonable amount of time. Even worse, a
> misbehaving user can block it until the task dies.
> 
> I don't remember if rwsem allows unlock being executed in a different
> task than the pairing lock, but blocking it for that long could be a
> problem. I might not remember it right but I think Boris meantioned
> that the O_DIRECT path drops the inode lock right after submission
> without waiting for bios to complete. Is that right? Can we do it
> here as well?

We can't release the inode lock until we've released the extent lock. I 
do intend to look into reducing the amount of time we hold the extent 
lock, if we can, but it's not trivial to do this in a safe manner.
We could perhaps move the unlocking to btrfs_uring_read_extent_endio 
instead, but it looks unlocking an rwsem in a different context might 
cause problems with PREEMPT_RT(?).

>> +
>> +    io_uring_cmd_done(cmd, ret, 0, issue_flags);
>> +    add_rchar(current, ret);
>> +
>> +    for (unsigned long index = 0; index < priv->nr_pages; index++)
>> +        __free_page(priv->pages[index]);
>> +
>> +    kfree(priv->pages);
>> +    kfree(priv->iov);
>> +    kfree(priv);
>> +}
>> +
>> +void btrfs_uring_read_extent_endio(void *ctx, int err)
>> +{
>> +    struct btrfs_uring_priv *priv = ctx;
>> +
>> +    priv->err = err;
>> +
>> +    *io_uring_cmd_to_pdu(priv->cmd, struct btrfs_uring_priv *) = priv;
> 
> a nit, I'd suggest to create a temp var, should be easier to
> read. It'd even be nicer if you wrap it into a structure
> as suggested last time.
> 
> struct io_btrfs_cmd {
>      struct btrfs_uring_priv *priv;
> };
> 
> struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
> bc->priv = priv;

No problem, I've sent a patch for this.

>> +    io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
>> +}
>> +
>> +static int btrfs_uring_read_extent(struct kiocb *iocb, struct 
>> iov_iter *iter,
>> +                   u64 start, u64 lockend,
>> +                   struct extent_state *cached_state,
>> +                   u64 disk_bytenr, u64 disk_io_size,
>> +                   size_t count, bool compressed,
>> +                   struct iovec *iov,
>> +                   struct io_uring_cmd *cmd)
>> +{
>> +    struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
>> +    struct extent_io_tree *io_tree = &inode->io_tree;
>> +    struct page **pages;
>> +    struct btrfs_uring_priv *priv = NULL;
>> +    unsigned long nr_pages;
>> +    int ret;
>> +
>> +    nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
>> +    pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
>> +    if (!pages)
>> +        return -ENOMEM;
>> +    ret = btrfs_alloc_page_array(nr_pages, pages, 0);
>> +    if (ret) {
>> +        ret = -ENOMEM;
>> +        goto fail;
>> +    }
>> +
>> +    priv = kmalloc(sizeof(*priv), GFP_NOFS);
>> +    if (!priv) {
>> +        ret = -ENOMEM;
>> +        goto fail;
>> +    }
>> +
>> +    priv->iocb = *iocb;
>> +    priv->iov = iov;
>> +    priv->iter = *iter;
>> +    priv->count = count;
>> +    priv->cmd = cmd;
>> +    priv->cached_state = cached_state;
>> +    priv->compressed = compressed;
>> +    priv->nr_pages = nr_pages;
>> +    priv->pages = pages;
>> +    priv->start = start;
>> +    priv->lockend = lockend;
>> +    priv->err = 0;
>> +
>> +    ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
>> +                            disk_io_size, pages,
>> +                            priv);
>> +    if (ret && ret != -EIOCBQUEUED)
>> +        goto fail;
> 
> Turning both into return EIOCBQUEUED is a bit suspicious, but
> I lack context to say. Might make sense to return ret and let
> the caller handle it.

btrfs_encoded_read_regular_fill_pages returns 0 if the bio completes 
before the function can finish, -EIOCBQUEUED otherwise. In either case 
the behaviour of the calling function will be the same.

>> +
>> +    /*
>> +     * If we return -EIOCBQUEUED, we're deferring the cleanup to
>> +     * btrfs_uring_read_finished, which will handle unlocking the extent
>> +     * and inode and freeing the allocations.
>> +     */
>> +
>> +    return -EIOCBQUEUED;
>> +
>> +fail:
>> +    unlock_extent(io_tree, start, lockend, &cached_state);
>> +    btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
>> +    kfree(priv);
>> +    return ret;
>> +}
>> +
>> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
>> +                    unsigned int issue_flags)
>> +{
>> +    size_t copy_end_kernel = offsetofend(struct 
>> btrfs_ioctl_encoded_io_args,
>> +                         flags);
>> +    size_t copy_end;
>> +    struct btrfs_ioctl_encoded_io_args args = { 0 };
>> +    int ret;
>> +    u64 disk_bytenr, disk_io_size;
>> +    struct file *file = cmd->file;
>> +    struct btrfs_inode *inode = BTRFS_I(file->f_inode);
>> +    struct btrfs_fs_info *fs_info = inode->root->fs_info;
>> +    struct extent_io_tree *io_tree = &inode->io_tree;
>> +    struct iovec iovstack[UIO_FASTIOV];
>> +    struct iovec *iov = iovstack;
>> +    struct iov_iter iter;
>> +    loff_t pos;
>> +    struct kiocb kiocb;
>> +    struct extent_state *cached_state = NULL;
>> +    u64 start, lockend;
>> +    void __user *sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
> 
> Let's rename it, I was taken aback for a brief second why
> you're copy_from_user() from an SQE / the ring, which turns
> out to be a user pointer to a btrfs structure.

sqe_addr being the addr field in the SQE, not the address of the SQE. I 
can see how it might be misleading, though.

> ...
>> +    ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
>> +                 &disk_bytenr, &disk_io_size);
>> +    if (ret < 0 && ret != -EIOCBQUEUED)
>> +        goto out_free;
>> +
>> +    file_accessed(file);
>> +
>> +    if (copy_to_user(sqe_addr + copy_end, (char *)&args + 
>> copy_end_kernel,
>> +             sizeof(args) - copy_end_kernel)) {
>> +        if (ret == -EIOCBQUEUED) {
>> +            unlock_extent(io_tree, start, lockend, &cached_state);
>> +            btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
>> +        }
>> +        ret = -EFAULT;
>> +        goto out_free;
> 
> It seems we're saving iov in the priv structure, who can access the iovec
> after the request is submitted? -EIOCBQUEUED in general means that the
> request is submitted and will get completed async, e.g. via callback, and
> if the bio callback can use the iov maybe via the iter, this goto will be
> a use after free.
> 
> Also, you're returning -EFAULT back to io_uring, which will kill the
> io_uring request / cmd while there might still be in flight bios that
> can try to access it.
> 
> Can you inject errors into the copy and test please?

The bio hasn't been submitted at this point, that happens in 
btrfs_uring_read_extent. So far all we've done is read the metadata from 
the page cache. The copy_to_user here is copying the metadata info to 
the userspace structure.

> 
>> +    }
>> +
>> +    if (ret == -EIOCBQUEUED) {
>> +        u64 count;
>> +
>> +        /*
>> +         * If we've optimized things by storing the iovecs on the stack,
>> +         * undo this.
>> +         */> +        if (!iov) {
>> +            iov = kmalloc(sizeof(struct iovec) * args.iovcnt,
>> +                      GFP_NOFS);
>> +            if (!iov) {
>> +                unlock_extent(io_tree, start, lockend,
>> +                          &cached_state);
>> +                btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
>> +                ret = -ENOMEM;
>> +                goto out_acct;
>> +            }
>> +
>> +            memcpy(iov, iovstack,
>> +                   sizeof(struct iovec) * args.iovcnt);
>> +        }
>> +
>> +        count = min_t(u64, iov_iter_count(&iter), disk_io_size);
>> +
>> +        /* Match ioctl by not returning past EOF if uncompressed */
>> +        if (!args.compression)
>> +            count = min_t(u64, count, args.len);
>> +
>> +        ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
>> +                          cached_state, disk_bytenr,
>> +                          disk_io_size, count,
>> +                          args.compression, iov, cmd);
>> +
>> +        goto out_acct;
>> +    }
>> +
>> +out_free:
>> +    kfree(iov);
>> +
>> +out_acct:
>> +    if (ret > 0)
>> +        add_rchar(current, ret);
>> +    inc_syscr(current);
>> +
>> +    return ret;
>> +}
>
Pavel Begunkov Oct. 31, 2024, 6:26 p.m. UTC | #6
On 10/31/24 17:08, Mark Harmstone wrote:
> Thanks Pavel.
> 
...
>> If that's an iovec backed iter that might fail, you'd need to
>> steal this patch
>>
>> https://lore.kernel.org/all/20241016-fuse-uring-for-6-10-rfc4-v4-12-9739c753666e@ddn.com/
>>
>> and fail if "issue_flags & IO_URING_F_TASK_DEAD", see
>>
>> https://lore.kernel.org/all/20241016-fuse-uring-for-6-10-rfc4-v4-13-9739c753666e@ddn.com/
> 
> Thanks, I've sent a patchset including your patch. Does it make a
> difference, though? If the task has died, presumably copy_page_to_iter
> can't copy to another process' memory...?

IIRC copy_to_user will crash without mm set, not sure about
copy_page_to_iter(). Regardless, when the original task has dies
and it gets run from io_uring's fallback path, you shouldn't
make any assumptions about the current task.

>>> +            ret = -EFAULT;
>>> +            goto out;
>>> +        }
>>> +
>>> +        i++;
>>> +        cur += bytes;
>>> +        page_offset = 0;
>>> +    }
>>> +    ret = priv->count;
>>> +
>>> +out:
>>> +    unlock_extent(io_tree, priv->start, priv->lockend,
>>> &priv->cached_state);
>>> +    btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
>>
>> When called via io_uring_cmd_complete_in_task() this function might
>> not get run in any reasonable amount of time. Even worse, a
>> misbehaving user can block it until the task dies.
>>
>> I don't remember if rwsem allows unlock being executed in a different
>> task than the pairing lock, but blocking it for that long could be a
>> problem. I might not remember it right but I think Boris meantioned
>> that the O_DIRECT path drops the inode lock right after submission
>> without waiting for bios to complete. Is that right? Can we do it
>> here as well?
> 
> We can't release the inode lock until we've released the extent lock. I
> do intend to look into reducing the amount of time we hold the extent
> lock, if we can, but it's not trivial to do this in a safe manner.

I lack the btrfs knowledge, but sounds like it can be done the
same way the async dio path works.

> We could perhaps move the unlocking to btrfs_uring_read_extent_endio
> instead, but it looks unlocking an rwsem in a different context might
> cause problems with PREEMPT_RT(?).

At least from a quick glance it doesn't seem that locks in
__clear_extent_bit are [soft]irq protected. Would be a good
idea to give it a run with lockdep enabled.


...
>>> +    ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
>>> +                            disk_io_size, pages,
>>> +                            priv);
>>> +    if (ret && ret != -EIOCBQUEUED)
>>> +        goto fail;
>>
>> Turning both into return EIOCBQUEUED is a bit suspicious, but
>> I lack context to say. Might make sense to return ret and let
>> the caller handle it.
> 
> btrfs_encoded_read_regular_fill_pages returns 0 if the bio completes
> before the function can finish, -EIOCBQUEUED otherwise. In either case
> the behaviour of the calling function will be the same.

Ok

...
>>> +    if (copy_to_user(sqe_addr + copy_end, (char *)&args +
>>> copy_end_kernel,
>>> +             sizeof(args) - copy_end_kernel)) {
>>> +        if (ret == -EIOCBQUEUED) {
>>> +            unlock_extent(io_tree, start, lockend, &cached_state);
>>> +            btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
>>> +        }
>>> +        ret = -EFAULT;
>>> +        goto out_free;
>>
>> It seems we're saving iov in the priv structure, who can access the iovec
>> after the request is submitted? -EIOCBQUEUED in general means that the
>> request is submitted and will get completed async, e.g. via callback, and
>> if the bio callback can use the iov maybe via the iter, this goto will be
>> a use after free.
>>
>> Also, you're returning -EFAULT back to io_uring, which will kill the
>> io_uring request / cmd while there might still be in flight bios that
>> can try to access it.
>>
>> Can you inject errors into the copy and test please?
> 
> The bio hasn't been submitted at this point, that happens in
> btrfs_uring_read_extent. So far all we've done is read the metadata from
> the page cache. The copy_to_user here is copying the metadata info to
> the userspace structure.

I see, in this case it should be fine, but why is it -EIOCBQUEUED
then? It always meant that it queued up the request and will
complete it asynchronously, and that's where the confusion sprouted
from. Not looking deeper but sounds more like -EAGAIN? Assuming it's
returned because we can't block

>>> +    }
>>> +
>>> +    if (ret == -EIOCBQUEUED) {
>>> +        u64 count;
>>> +
>>> +        /*
>>> +         * If we've optimized things by storing the iovecs on the stack,
>>> +         * undo this.
>>> +         */> +        if (!iov) {
>>> +            iov = kmalloc(sizeof(struct iovec) * args.iovcnt,
>>> +                      GFP_NOFS);
>>> +            if (!iov) {
>>> +                unlock_extent(io_tree, start, lockend,
>>> +                          &cached_state);
>>> +                btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
>>> +                ret = -ENOMEM;
>>> +                goto out_acct;
>>> +            }
>>> +
>>> +            memcpy(iov, iovstack,
>>> +                   sizeof(struct iovec) * args.iovcnt);

As an optimisation in the future you can allocate it
together with the btrfs_priv structure.

>>> +        }
>>> +
>>> +        count = min_t(u64, iov_iter_count(&iter), disk_io_size);
>>> +
>>> +        /* Match ioctl by not returning past EOF if uncompressed */
>>> +        if (!args.compression)
>>> +            count = min_t(u64, count, args.len);
>>> +
>>> +        ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
>>> +                          cached_state, disk_bytenr,
>>> +                          disk_io_size, count,
>>> +                          args.compression, iov, cmd);

So that's the only spot where asynchronous code branches off
in this function? Do I read you right?

>>> +
>>> +        goto out_acct;
>>> +    }
>>> +
>>> +out_free:
>>> +    kfree(iov);
>>> +
>>> +out_acct:
>>> +    if (ret > 0)
>>> +        add_rchar(current, ret);
>>> +    inc_syscr(current);
>>> +
>>> +    return ret;
>>> +}
diff mbox series

Patch

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ab1fbde97cee..f551444b498e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -613,7 +613,8 @@  int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
 					     int compress_type);
 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 					  u64 disk_bytenr, u64 disk_io_size,
-					  struct page **pages);
+					  struct page **pages,
+					  void *uring_ctx);
 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
 			   struct btrfs_ioctl_encoded_io_args *encoded,
 			   struct extent_state **cached_state,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5e0a1805e897..fbb753300071 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3710,6 +3710,7 @@  const struct file_operations btrfs_file_operations = {
 	.compat_ioctl	= btrfs_compat_ioctl,
 #endif
 	.remap_file_range = btrfs_remap_file_range,
+	.uring_cmd	= btrfs_uring_cmd,
 	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
 };
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5aedb85696f4..759ae076f90b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9057,6 +9057,7 @@  static ssize_t btrfs_encoded_read_inline(
 
 struct btrfs_encoded_read_private {
 	wait_queue_head_t wait;
+	void *uring_ctx;
 	atomic_t pending;
 	blk_status_t status;
 };
@@ -9076,14 +9077,23 @@  static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
 		 */
 		WRITE_ONCE(priv->status, bbio->bio.bi_status);
 	}
-	if (!atomic_dec_return(&priv->pending))
-		wake_up(&priv->wait);
+	if (atomic_dec_return(&priv->pending) == 0) {
+		int err = blk_status_to_errno(READ_ONCE(priv->status));
+
+		if (priv->uring_ctx) {
+			btrfs_uring_read_extent_endio(priv->uring_ctx, err);
+			kfree(priv);
+		} else {
+			wake_up(&priv->wait);
+		}
+	}
 	bio_put(&bbio->bio);
 }
 
 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 					  u64 disk_bytenr, u64 disk_io_size,
-					  struct page **pages)
+					  struct page **pages,
+					  void *uring_ctx)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_encoded_read_private *priv;
@@ -9098,6 +9108,7 @@  int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 	init_waitqueue_head(&priv->wait);
 	atomic_set(&priv->pending, 1);
 	priv->status = 0;
+	priv->uring_ctx = uring_ctx;
 
 	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
 			       btrfs_encoded_read_endio, priv);
@@ -9126,12 +9137,23 @@  int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 	atomic_inc(&priv->pending);
 	btrfs_submit_bbio(bbio, 0);
 
-	if (atomic_dec_return(&priv->pending))
-		io_wait_event(priv->wait, !atomic_read(&priv->pending));
-	/* See btrfs_encoded_read_endio() for ordering. */
-	ret = blk_status_to_errno(READ_ONCE(priv->status));
-	kfree(priv);
-	return ret;
+	if (uring_ctx) {
+		if (atomic_dec_return(&priv->pending) == 0) {
+			ret = blk_status_to_errno(READ_ONCE(priv->status));
+			btrfs_uring_read_extent_endio(uring_ctx, ret);
+			kfree(priv);
+			return ret;
+		}
+
+		return -EIOCBQUEUED;
+	} else {
+		if (atomic_dec_return(&priv->pending) != 0)
+			io_wait_event(priv->wait, !atomic_read(&priv->pending));
+		/* See btrfs_encoded_read_endio() for ordering. */
+		ret = blk_status_to_errno(READ_ONCE(priv->status));
+		kfree(priv);
+		return ret;
+	}
 }
 
 ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
@@ -9160,7 +9182,8 @@  ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
 		}
 
 	ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
-						    disk_io_size, pages);
+						    disk_io_size, pages,
+						    NULL);
 	if (ret)
 		goto out;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d502b31010bc..7f2731ef3dbb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -29,6 +29,7 @@ 
 #include <linux/fileattr.h>
 #include <linux/fsverity.h>
 #include <linux/sched/xacct.h>
+#include <linux/io_uring/cmd.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "export.h"
@@ -4720,6 +4721,314 @@  static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
 	return ret;
 }
 
+/*
+ * struct btrfs_uring_priv is the context that's attached to an encoded read
+ * io_uring command, in cmd->pdu. It contains the fields in
+ * btrfs_uring_read_extent that are necessary to finish off and cleanup the I/O
+ * in btrfs_uring_read_finished.
+ */
+struct btrfs_uring_priv {
+	struct io_uring_cmd *cmd;
+	struct page **pages;
+	unsigned long nr_pages;
+	struct kiocb iocb;
+	struct iovec *iov;
+	struct iov_iter iter;
+	struct extent_state *cached_state;
+	u64 count;
+	u64 start;
+	u64 lockend;
+	int err;
+	bool compressed;
+};
+
+static void btrfs_uring_read_finished(struct io_uring_cmd *cmd,
+				      unsigned int issue_flags)
+{
+	struct btrfs_uring_priv *priv =
+		*io_uring_cmd_to_pdu(cmd, struct btrfs_uring_priv *);
+	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
+	struct extent_io_tree *io_tree = &inode->io_tree;
+	unsigned long i;
+	u64 cur;
+	size_t page_offset;
+	ssize_t ret;
+
+	if (priv->err) {
+		ret = priv->err;
+		goto out;
+	}
+
+	if (priv->compressed) {
+		i = 0;
+		page_offset = 0;
+	} else {
+		i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
+		page_offset = offset_in_page(priv->iocb.ki_pos - priv->start);
+	}
+	cur = 0;
+	while (cur < priv->count) {
+		size_t bytes = min_t(size_t, priv->count - cur,
+				     PAGE_SIZE - page_offset);
+
+		if (copy_page_to_iter(priv->pages[i], page_offset, bytes,
+				      &priv->iter) != bytes) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		i++;
+		cur += bytes;
+		page_offset = 0;
+	}
+	ret = priv->count;
+
+out:
+	unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
+	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+
+	io_uring_cmd_done(cmd, ret, 0, issue_flags);
+	add_rchar(current, ret);
+
+	for (unsigned long index = 0; index < priv->nr_pages; index++)
+		__free_page(priv->pages[index]);
+
+	kfree(priv->pages);
+	kfree(priv->iov);
+	kfree(priv);
+}
+
+void btrfs_uring_read_extent_endio(void *ctx, int err)
+{
+	struct btrfs_uring_priv *priv = ctx;
+
+	priv->err = err;
+
+	*io_uring_cmd_to_pdu(priv->cmd, struct btrfs_uring_priv *) = priv;
+	io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
+}
+
+static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
+				   u64 start, u64 lockend,
+				   struct extent_state *cached_state,
+				   u64 disk_bytenr, u64 disk_io_size,
+				   size_t count, bool compressed,
+				   struct iovec *iov,
+				   struct io_uring_cmd *cmd)
+{
+	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+	struct extent_io_tree *io_tree = &inode->io_tree;
+	struct page **pages;
+	struct btrfs_uring_priv *priv = NULL;
+	unsigned long nr_pages;
+	int ret;
+
+	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+	if (!pages)
+		return -ENOMEM;
+	ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+	if (ret) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	priv = kmalloc(sizeof(*priv), GFP_NOFS);
+	if (!priv) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	priv->iocb = *iocb;
+	priv->iov = iov;
+	priv->iter = *iter;
+	priv->count = count;
+	priv->cmd = cmd;
+	priv->cached_state = cached_state;
+	priv->compressed = compressed;
+	priv->nr_pages = nr_pages;
+	priv->pages = pages;
+	priv->start = start;
+	priv->lockend = lockend;
+	priv->err = 0;
+
+	ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
+						    disk_io_size, pages,
+						    priv);
+	if (ret && ret != -EIOCBQUEUED)
+		goto fail;
+
+	/*
+	 * If we return -EIOCBQUEUED, we're deferring the cleanup to
+	 * btrfs_uring_read_finished, which will handle unlocking the extent
+	 * and inode and freeing the allocations.
+	 */
+
+	return -EIOCBQUEUED;
+
+fail:
+	unlock_extent(io_tree, start, lockend, &cached_state);
+	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+	kfree(priv);
+	return ret;
+}
+
+static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
+				    unsigned int issue_flags)
+{
+	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
+					     flags);
+	size_t copy_end;
+	struct btrfs_ioctl_encoded_io_args args = { 0 };
+	int ret;
+	u64 disk_bytenr, disk_io_size;
+	struct file *file = cmd->file;
+	struct btrfs_inode *inode = BTRFS_I(file->f_inode);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct extent_io_tree *io_tree = &inode->io_tree;
+	struct iovec iovstack[UIO_FASTIOV];
+	struct iovec *iov = iovstack;
+	struct iov_iter iter;
+	loff_t pos;
+	struct kiocb kiocb;
+	struct extent_state *cached_state = NULL;
+	u64 start, lockend;
+	void __user *sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto out_acct;
+	}
+
+	if (issue_flags & IO_URING_F_COMPAT) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+		struct btrfs_ioctl_encoded_io_args_32 args32;
+
+		copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
+				       flags);
+		if (copy_from_user(&args32, sqe_addr, copy_end)) {
+			ret = -EFAULT;
+			goto out_acct;
+		}
+		args.iov = compat_ptr(args32.iov);
+		args.iovcnt = args32.iovcnt;
+		args.offset = args32.offset;
+		args.flags = args32.flags;
+#else
+		return -ENOTTY;
+#endif
+	} else {
+		copy_end = copy_end_kernel;
+		if (copy_from_user(&args, sqe_addr, copy_end)) {
+			ret = -EFAULT;
+			goto out_acct;
+		}
+	}
+
+	if (args.flags != 0)
+		return -EINVAL;
+
+	ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+			   &iov, &iter);
+	if (ret < 0)
+		goto out_acct;
+
+	if (iov_iter_count(&iter) == 0) {
+		ret = 0;
+		goto out_free;
+	}
+
+	pos = args.offset;
+	ret = rw_verify_area(READ, file, &pos, args.len);
+	if (ret < 0)
+		goto out_free;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = pos;
+
+	if (issue_flags & IO_URING_F_NONBLOCK)
+		kiocb.ki_flags |= IOCB_NOWAIT;
+
+	start = ALIGN_DOWN(pos, fs_info->sectorsize);
+	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+	ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+				 &disk_bytenr, &disk_io_size);
+	if (ret < 0 && ret != -EIOCBQUEUED)
+		goto out_free;
+
+	file_accessed(file);
+
+	if (copy_to_user(sqe_addr + copy_end, (char *)&args + copy_end_kernel,
+			 sizeof(args) - copy_end_kernel)) {
+		if (ret == -EIOCBQUEUED) {
+			unlock_extent(io_tree, start, lockend, &cached_state);
+			btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+		}
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	if (ret == -EIOCBQUEUED) {
+		u64 count;
+
+		/*
+		 * If we've optimized things by storing the iovecs on the stack,
+		 * undo this.
+		 */
+		if (!iov) {
+			iov = kmalloc(sizeof(struct iovec) * args.iovcnt,
+				      GFP_NOFS);
+			if (!iov) {
+				unlock_extent(io_tree, start, lockend,
+					      &cached_state);
+				btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+				ret = -ENOMEM;
+				goto out_acct;
+			}
+
+			memcpy(iov, iovstack,
+			       sizeof(struct iovec) * args.iovcnt);
+		}
+
+		count = min_t(u64, iov_iter_count(&iter), disk_io_size);
+
+		/* Match ioctl by not returning past EOF if uncompressed */
+		if (!args.compression)
+			count = min_t(u64, count, args.len);
+
+		ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
+					      cached_state, disk_bytenr,
+					      disk_io_size, count,
+					      args.compression, iov, cmd);
+
+		goto out_acct;
+	}
+
+out_free:
+	kfree(iov);
+
+out_acct:
+	if (ret > 0)
+		add_rchar(current, ret);
+	inc_syscr(current);
+
+	return ret;
+}
+
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+	switch (cmd->cmd_op) {
+	case BTRFS_IOC_ENCODED_READ:
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+	case BTRFS_IOC_ENCODED_READ_32:
+#endif
+		return btrfs_uring_encoded_read(cmd, issue_flags);
+	}
+
+	return -EINVAL;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 19cd26b0244a..2b760c8778f8 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,5 +22,7 @@  void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
 int __pure btrfs_is_empty_uuid(const u8 *uuid);
 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
 				     struct btrfs_ioctl_balance_args *bargs);
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
+void btrfs_uring_read_extent_endio(void *ctx, int err);
 
 #endif
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 03b31b1c39be..cadb945bb345 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5669,7 +5669,8 @@  static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
 	ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode),
 						    disk_bytenr, disk_num_bytes,
 						    sctx->send_buf_pages +
-						    (data_offset >> PAGE_SHIFT));
+						    (data_offset >> PAGE_SHIFT),
+						    NULL);
 	if (ret)
 		goto out;