
[v12,09/17] btrfs: add BTRFS_IOC_ENCODED_READ

Message ID 111c7aadb3fa218b7926a49acf3fb34654d89a75.1637179348.git.osandov@fb.com (mailing list archive)
State New, archived
Series btrfs: add ioctls and send/receive support for reading/writing compressed data

Commit Message

Omar Sandoval Nov. 17, 2021, 8:19 p.m. UTC
From: Omar Sandoval <osandov@fb.com>

There are 4 main cases:

1. Inline extents: we copy the data straight out of the extent buffer.
2. Hole/preallocated extents: we fill in zeroes.
3. Regular, uncompressed extents: we read the sectors we need directly
   from disk.
4. Regular, compressed extents: we read the entire compressed extent
   from disk and indicate what subset of the decompressed extent is in
   the file.

This initial implementation simplifies a few things that can be improved
in the future:

- We hold the inode lock during the operation.
- Cases 1, 3, and 4 allocate temporary memory to read into before
  copying out to userspace.
- We don't do read repair, because it turns out that read repair is
  currently broken for compressed data.
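
For context, a rough sketch of how a privileged (CAP_SYS_ADMIN) userspace
caller might use the new ioctl, assuming the uapi definitions (struct
btrfs_ioctl_encoded_io_args and BTRFS_IOC_ENCODED_READ) added earlier in this
series; this is illustrative only, not part of the patch:

	#include <sys/ioctl.h>
	#include <sys/uio.h>
	#include <linux/btrfs.h>

	/* Read one encoded extent starting at @offset; error handling trimmed. */
	static int read_one_encoded_extent(int fd, __s64 offset)
	{
		static char buf[128 * 1024];	/* enough for one compressed extent */
		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
		struct btrfs_ioctl_encoded_io_args args = {
			.iov = &iov,
			.iovcnt = 1,
			.offset = offset,
		};
		int ret;

		ret = ioctl(fd, BTRFS_IOC_ENCODED_READ, &args);
		if (ret < 0)
			return -1;
		/*
		 * buf now holds ret bytes of encoded (possibly compressed) data;
		 * args.len, args.unencoded_len, args.unencoded_offset, and
		 * args.compression describe how it maps back to file data.
		 */
		return 0;
	}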

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 fs/btrfs/ctree.h |   4 +
 fs/btrfs/inode.c | 496 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ioctl.c | 106 ++++++++++
 3 files changed, 606 insertions(+)

Comments

David Sterba Nov. 18, 2021, 2:55 p.m. UTC | #1
On Wed, Nov 17, 2021 at 12:19:19PM -0800, Omar Sandoval wrote:
> +static int btrfs_encoded_io_compression_from_extent(int compress_type)
> +{
> +	switch (compress_type) {
> +	case BTRFS_COMPRESS_NONE:
> +		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
> +	case BTRFS_COMPRESS_ZLIB:
> +		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
> +	case BTRFS_COMPRESS_LZO:
> +		/*
> +		 * The LZO format depends on the page size. 64k is the maximum

Should this also say it depends on the sector (not page) size?

> +		 * sectorsize (and thus page size) that we support.
> +		 */
> +		if (PAGE_SIZE < SZ_4K || PAGE_SIZE > SZ_64K)
> +			return -EINVAL;
> +		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + (PAGE_SHIFT - 12);
> +	case BTRFS_COMPRESS_ZSTD:
> +		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
> +	default:
> +		return -EUCLEAN;
> +	}
> +}
> +
> +
> +		inline_size = btrfs_file_extent_inline_item_len(leaf,
> +								path->slots[0]);
> +		if (inline_size > count) {
> +			ret = -ENOBUFS;
> +			goto out;
> +		}
> +		count = inline_size;
> +		encoded->unencoded_len = ram_bytes;
> +		encoded->unencoded_offset = iocb->ki_pos - extent_start;
> +	} else {
> +		encoded->len = encoded->unencoded_len = count =
> +			min_t(u64, count, encoded->len);

Please don't do chained initializations.
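E.g. split it into separate assignments, something like:

		count = min_t(u64, count, encoded->len);
		encoded->len = count;
		encoded->unencoded_len = count;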

> +		ptr += iocb->ki_pos - extent_start;
> +	}
> +
> +	tmp = kmalloc(count, GFP_NOFS);
> +	if (!tmp) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	read_extent_buffer(leaf, tmp, ptr, count);
> +	btrfs_release_path(path);
> +	unlock_extent_cached(io_tree, start, lockend, cached_state);
> +	inode_unlock_shared(inode);
> +	*unlocked = true;
> +
> +	ret = copy_to_iter(tmp, count, iter);
> +	if (ret != count)
> +		ret = -EFAULT;
> +	kfree(tmp);
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +struct btrfs_encoded_read_private {
> +	struct inode *inode;
> +	u64 file_offset;
> +	wait_queue_head_t wait;
> +	atomic_t pending;
> +	blk_status_t status;
> +	bool skip_csum;
> +};
> +
> +static blk_status_t submit_encoded_read_bio(struct inode *inode,
> +					    struct bio *bio, int mirror_num,
> +					    unsigned long bio_flags)
> +{
> +	struct btrfs_encoded_read_private *priv = bio->bi_private;
> +	struct btrfs_bio *bbio = btrfs_bio(bio);
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	blk_status_t ret;
> +
> +	if (!priv->skip_csum) {
> +		ret = btrfs_lookup_bio_sums(inode, bio, NULL);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
> +	if (ret) {
> +		btrfs_bio_free_csum(bbio);
> +		return ret;
> +	}
> +
> +	atomic_inc(&priv->pending);
> +	ret = btrfs_map_bio(fs_info, bio, mirror_num);
> +	if (ret) {
> +		atomic_dec(&priv->pending);
> +		btrfs_bio_free_csum(bbio);
> +	}
> +	return ret;
> +}
> +
> +static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
> +{
> +	const bool uptodate = bbio->bio.bi_status == BLK_STS_OK;
> +	struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
> +	struct inode *inode = priv->inode;
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	u32 sectorsize = fs_info->sectorsize;
> +	struct bio_vec *bvec;
> +	struct bvec_iter_all iter_all;
> +	u64 start = priv->file_offset;
> +	u32 bio_offset = 0;
> +
> +	if (priv->skip_csum || !uptodate)
> +		return bbio->bio.bi_status;
> +
> +	bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
> +		unsigned int i, nr_sectors, pgoff;
> +
> +		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
> +		pgoff = bvec->bv_offset;
> +		for (i = 0; i < nr_sectors; i++) {
> +			ASSERT(pgoff < PAGE_SIZE);
> +			if (check_data_csum(inode, bbio, bio_offset,
> +					    bvec->bv_page, pgoff, start))
> +				return BLK_STS_IOERR;
> +			start += sectorsize;
> +			bio_offset += sectorsize;
> +			pgoff += sectorsize;
> +		}
> +	}
> +	return BLK_STS_OK;
> +}
> +
> +static void btrfs_encoded_read_endio(struct bio *bio)
> +{
> +	struct btrfs_encoded_read_private *priv = bio->bi_private;
> +	struct btrfs_bio *bbio = btrfs_bio(bio);
> +	blk_status_t status;
> +
> +	status = btrfs_encoded_read_verify_csum(bbio);
> +	if (status) {
> +		/*
> +		 * The memory barrier implied by the atomic_dec_return() here
> +		 * pairs with the memory barrier implied by the
> +		 * atomic_dec_return() or io_wait_event() in
> +		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
> +		 * write is observed before the load of status in
> +		 * btrfs_encoded_read_regular_fill_pages().
> +		 */
> +		WRITE_ONCE(priv->status, status);

The WRITE_ONCE is here 3 times, I wonder if this is ok to be opencoded
like that. I'd suggest to use a helper with a comment.
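Something like this (just a sketch, hypothetical name) would keep the comment
about the ordering in one place:

	/* Set the I/O status; see btrfs_encoded_read_endio() for the ordering. */
	static void btrfs_encoded_read_set_status(struct btrfs_encoded_read_private *priv,
						  blk_status_t status)
	{
		WRITE_ONCE(priv->status, status);
	}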

> +	}
> +	if (!atomic_dec_return(&priv->pending))
> +		wake_up(&priv->wait);
> +	btrfs_bio_free_csum(bbio);
> +	bio_put(bio);
> +}
> +
> +static int btrfs_encoded_read_regular_fill_pages(struct inode *inode,
> +						 u64 file_offset,
> +						 u64 disk_bytenr,
> +						 u64 disk_io_size,
> +						 struct page **pages)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	struct btrfs_encoded_read_private priv = {
> +		.inode = inode,
> +		.file_offset = file_offset,
> +		.pending = ATOMIC_INIT(1),
> +		.skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM,
> +	};
> +	unsigned long i = 0;
> +	u64 cur = 0;
> +	int ret;
> +
> +	init_waitqueue_head(&priv.wait);
> +	/*
> +	 * Submit bios for the extent, splitting due to bio or stripe limits as
> +	 * necessary.
> +	 */
> +	while (cur < disk_io_size) {
> +		struct extent_map *em;
> +		struct btrfs_io_geometry geom;
> +		struct bio *bio = NULL;
> +		u64 remaining;
> +
> +		em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
> +					 disk_io_size - cur);
> +		if (IS_ERR(em)) {
> +			ret = PTR_ERR(em);
> +		} else {
> +			ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
> +						    disk_bytenr + cur, &geom);
> +			free_extent_map(em);
> +		}
> +		if (ret) {
> +			WRITE_ONCE(priv.status, errno_to_blk_status(ret));
> +			break;
> +		}
> +		remaining = min(geom.len, disk_io_size - cur);
> +		while (bio || remaining) {
> +			size_t bytes = min_t(u64, remaining, PAGE_SIZE);
> +
> +			if (!bio) {
> +				bio = btrfs_bio_alloc(BIO_MAX_VECS);
> +				bio->bi_iter.bi_sector =
> +					(disk_bytenr + cur) >> SECTOR_SHIFT;
> +				bio->bi_end_io = btrfs_encoded_read_endio;
> +				bio->bi_private = &priv;
> +				bio->bi_opf = REQ_OP_READ;
> +			}
> +
> +			if (!bytes ||
> +			    bio_add_page(bio, pages[i], bytes, 0) < bytes) {
> +				blk_status_t status;
> +
> +				status = submit_encoded_read_bio(inode, bio, 0,
> +								 0);
> +				if (status) {
> +					WRITE_ONCE(priv.status, status);
> +					bio_put(bio);
> +					goto out;
> +				}
> +				bio = NULL;
> +				continue;
> +			}
> +
> +			i++;
> +			cur += bytes;
> +			remaining -= bytes;
> +		}
> +	}
> +
> +out:
> +	if (atomic_dec_return(&priv.pending))
> +		io_wait_event(priv.wait, !atomic_read(&priv.pending));
> +	/* See btrfs_encoded_read_endio() for ordering. */
> +	return blk_status_to_errno(READ_ONCE(priv.status));
> +}
> +
> +static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
> +					  struct iov_iter *iter,
> +					  u64 start, u64 lockend,
> +					  struct extent_state **cached_state,
> +					  u64 disk_bytenr, u64 disk_io_size,
> +					  size_t count, bool compressed,
> +					  bool *unlocked)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> +	struct page **pages;
> +	unsigned long nr_pages, i;
> +	u64 cur;
> +	size_t page_offset;
> +	ssize_t ret;
> +
> +	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);

Power of two compile time constants can use the bitmask operations for
alignment.
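I.e. presumably something along the lines of:

	nr_pages = (disk_io_size + PAGE_SIZE - 1) >> PAGE_SHIFT;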

> +	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> +	if (!pages)
> +		return -ENOMEM;
> +	for (i = 0; i < nr_pages; i++) {
> +		pages[i] = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
> +		if (!pages[i]) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +	}
> +
> +	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
> +						    disk_io_size, pages);
> +	if (ret)
> +		goto out;
> +
> +	unlock_extent_cached(io_tree, start, lockend, cached_state);
> +	inode_unlock_shared(inode);
> +	*unlocked = true;
> +
> +	if (compressed) {
> +		i = 0;
> +		page_offset = 0;
> +	} else {
> +		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
> +		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
> +	}
> +	cur = 0;
> +	while (cur < count) {
> +		size_t bytes = min_t(size_t, count - cur,
> +				     PAGE_SIZE - page_offset);
> +
> +		if (copy_page_to_iter(pages[i], page_offset, bytes,
> +				      iter) != bytes) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +		i++;
> +		cur += bytes;
> +		page_offset = 0;
> +	}
> +	ret = count;
> +out:
> +	for (i = 0; i < nr_pages; i++) {
> +		if (pages[i])
> +			__free_page(pages[i]);
> +	}
> +	kfree(pages);
> +	return ret;
> +}
> +
> +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
> +			   struct btrfs_ioctl_encoded_io_args *encoded)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> +	ssize_t ret;
> +	size_t count = iov_iter_count(iter);
> +	u64 start, lockend, disk_bytenr, disk_io_size;
> +	struct extent_state *cached_state = NULL;
> +	struct extent_map *em;
> +	bool unlocked = false;
> +
> +	file_accessed(iocb->ki_filp);
> +
> +	inode_lock_shared(inode);
> +
> +	if (iocb->ki_pos >= inode->i_size) {
> +		inode_unlock_shared(inode);
> +		return 0;
> +	}
> +	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
> +	/*
> +	 * We don't know how long the extent containing iocb->ki_pos is, but if
> +	 * it's compressed we know that it won't be longer than this.
> +	 */
> +	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
> +
> +	for (;;) {
> +		struct btrfs_ordered_extent *ordered;
> +
> +		ret = btrfs_wait_ordered_range(inode, start,
> +					       lockend - start + 1);
> +		if (ret)
> +			goto out_unlock_inode;
> +		lock_extent_bits(io_tree, start, lockend, &cached_state);
> +		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
> +						     lockend - start + 1);
> +		if (!ordered)
> +			break;
> +		btrfs_put_ordered_extent(ordered);
> +		unlock_extent_cached(io_tree, start, lockend, &cached_state);
> +		cond_resched();
> +	}
> +
> +	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start,
> +			      lockend - start + 1);
> +	if (IS_ERR(em)) {
> +		ret = PTR_ERR(em);
> +		goto out_unlock_extent;
> +	}
> +
> +	if (em->block_start == EXTENT_MAP_INLINE) {
> +		u64 extent_start = em->start;
> +
> +		/*
> +		 * For inline extents we get everything we need out of the
> +		 * extent item.
> +		 */
> +		free_extent_map(em);
> +		em = NULL;
> +		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
> +						&cached_state, extent_start,
> +						count, encoded, &unlocked);
> +		goto out;
> +	}
> +
> +	/*
> +	 * We only want to return up to EOF even if the extent extends beyond
> +	 * that.
> +	 */
> +	encoded->len = (min_t(u64, extent_map_end(em), inode->i_size) -
> +			iocb->ki_pos);
> +	if (em->block_start == EXTENT_MAP_HOLE ||
> +	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +		disk_bytenr = EXTENT_MAP_HOLE;
> +		encoded->len = encoded->unencoded_len = count =
> +			min_t(u64, count, encoded->len);
> +	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
> +		disk_bytenr = em->block_start;
> +		/*
> +		 * Bail if the buffer isn't large enough to return the whole
> +		 * compressed extent.
> +		 */
> +		if (em->block_len > count) {
> +			ret = -ENOBUFS;
> +			goto out_em;
> +		}
> +		disk_io_size = count = em->block_len;
> +		encoded->unencoded_len = em->ram_bytes;
> +		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
> +		ret = btrfs_encoded_io_compression_from_extent(
> +							     em->compress_type);
> +		if (ret < 0)
> +			goto out_em;
> +		encoded->compression = ret;
> +	} else {
> +		disk_bytenr = em->block_start + (start - em->start);
> +		if (encoded->len > count)
> +			encoded->len = count;
> +		/*
> +		 * Don't read beyond what we locked. This also limits the page
> +		 * allocations that we'll do.
> +		 */
> +		disk_io_size = min(lockend + 1,
> +				   iocb->ki_pos + encoded->len) - start;
> +		encoded->len = encoded->unencoded_len = count =
> +			start + disk_io_size - iocb->ki_pos;
> +		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
> +	}
> +	free_extent_map(em);
> +	em = NULL;
> +
> +	if (disk_bytenr == EXTENT_MAP_HOLE) {
> +		unlock_extent_cached(io_tree, start, lockend, &cached_state);
> +		inode_unlock_shared(inode);
> +		unlocked = true;
> +		ret = iov_iter_zero(count, iter);
> +		if (ret != count)
> +			ret = -EFAULT;
> +	} else {
> +		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
> +						 &cached_state, disk_bytenr,
> +						 disk_io_size, count,
> +						 encoded->compression,
> +						 &unlocked);
> +	}
> +
> +out:
> +	if (ret >= 0)
> +		iocb->ki_pos += encoded->len;
> +out_em:
> +	free_extent_map(em);
> +out_unlock_extent:
> +	if (!unlocked)
> +		unlock_extent_cached(io_tree, start, lockend, &cached_state);
> +out_unlock_inode:
> +	if (!unlocked)
> +		inode_unlock_shared(inode);
> +	return ret;
> +}
> +
>  #ifdef CONFIG_SWAP
>  /*
>   * Add an entry indicating a block group or device which is pinned by a
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 05c77a1979a9..f0c575223d88 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -28,6 +28,7 @@
>  #include <linux/iversion.h>
>  #include <linux/fileattr.h>
>  #include <linux/fsverity.h>
> +#include <linux/sched/xacct.h>
>  #include "ctree.h"
>  #include "disk-io.h"
>  #include "export.h"
> @@ -88,6 +89,22 @@ struct btrfs_ioctl_send_args_32 {
>  
>  #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
>  			       struct btrfs_ioctl_send_args_32)
> +
> +struct btrfs_ioctl_encoded_io_args_32 {
> +	compat_uptr_t iov;
> +	compat_ulong_t iovcnt;
> +	__s64 offset;
> +	__u64 flags;
> +	__u64 len;
> +	__u64 unencoded_len;
> +	__u64 unencoded_offset;
> +	__u32 compression;
> +	__u32 encryption;
> +	__u32 reserved[8];
> +};
> +
> +#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
> +				       struct btrfs_ioctl_encoded_io_args_32)
>  #endif
>  
>  /* Mask out flags that are inappropriate for the given type of inode. */
> @@ -4861,6 +4878,89 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
>  	return ret;
>  }
>  
> +static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
> +				    bool compat)
> +{
> +	struct btrfs_ioctl_encoded_io_args args = {};
> +	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
> +					     flags);
> +	size_t copy_end;
> +	struct iovec iovstack[UIO_FASTIOV];
> +	struct iovec *iov = iovstack;
> +	struct iov_iter iter;
> +	loff_t pos;
> +	struct kiocb kiocb;
> +	ssize_t ret;
> +
> +	if (!capable(CAP_SYS_ADMIN)) {
> +		ret = -EPERM;
> +		goto out_acct;
> +	}
> +
> +	if (compat) {
> +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
> +		struct btrfs_ioctl_encoded_io_args_32 args32;
> +
> +		copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
> +				       flags);
> +		if (copy_from_user(&args32, argp, copy_end)) {
> +			ret = -EFAULT;
> +			goto out_acct;
> +		}
> +		args.iov = compat_ptr(args32.iov);
> +		args.iovcnt = args32.iovcnt;
> +		args.offset = args32.offset;
> +		args.flags = args32.flags;
> +#else
> +		return -ENOTTY;
> +#endif
> +	} else {
> +		copy_end = copy_end_kernel;
> +		if (copy_from_user(&args, argp, copy_end)) {
> +			ret = -EFAULT;
> +			goto out_acct;
> +		}
> +	}
> +	if (args.flags != 0) {
> +		ret = -EINVAL;
> +		goto out_acct;
> +	}
> +
> +	ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
> +			   &iov, &iter);
> +	if (ret < 0)
> +		goto out_acct;
> +
> +	if (iov_iter_count(&iter) == 0) {
> +		ret = 0;
> +		goto out_iov;
> +	}
> +	pos = args.offset;
> +	ret = rw_verify_area(READ, file, &pos, args.len);
> +	if (ret < 0)
> +		goto out_iov;
> +
> +	init_sync_kiocb(&kiocb, file);
> +	kiocb.ki_pos = pos;
> +
> +	ret = btrfs_encoded_read(&kiocb, &iter, &args);
> +	if (ret >= 0) {
> +		fsnotify_access(file);
> +		if (copy_to_user(argp + copy_end,
> +				 (char *)&args + copy_end_kernel,
> +				 sizeof(args) - copy_end_kernel))
> +			ret = -EFAULT;
> +	}
> +
> +out_iov:
> +	kfree(iov);
> +out_acct:
> +	if (ret > 0)
> +		add_rchar(current, ret);
> +	inc_syscr(current);
> +	return ret;
> +}
> +
>  long btrfs_ioctl(struct file *file, unsigned int
>  		cmd, unsigned long arg)
>  {
> @@ -5005,6 +5105,12 @@ long btrfs_ioctl(struct file *file, unsigned int
>  		return fsverity_ioctl_enable(file, (const void __user *)argp);
>  	case FS_IOC_MEASURE_VERITY:
>  		return fsverity_ioctl_measure(file, argp);
> +	case BTRFS_IOC_ENCODED_READ:
> +		return btrfs_ioctl_encoded_read(file, argp, false);
> +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
> +	case BTRFS_IOC_ENCODED_READ_32:
> +		return btrfs_ioctl_encoded_read(file, argp, true);
> +#endif
>  	}
>  
>  	return -ENOTTY;
> -- 
> 2.34.0
Omar Sandoval Nov. 18, 2021, 7:11 p.m. UTC | #2
On Thu, Nov 18, 2021 at 03:55:31PM +0100, David Sterba wrote:
> On Wed, Nov 17, 2021 at 12:19:19PM -0800, Omar Sandoval wrote:
> > +static int btrfs_encoded_io_compression_from_extent(int compress_type)
> > +{
> > +	switch (compress_type) {
> > +	case BTRFS_COMPRESS_NONE:
> > +		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
> > +	case BTRFS_COMPRESS_ZLIB:
> > +		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
> > +	case BTRFS_COMPRESS_LZO:
> > +		/*
> > +		 * The LZO format depends on the page size. 64k is the maximum
> 
> Should this also say it depends on the sector (not page) size?

Yeah, the code also needs to be changed to use the filesystem sector
size now that we support subpage LZO.
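Roughly, an untested sketch of what that change might look like (assuming
fs_info gets passed in to the helper):

	case BTRFS_COMPRESS_LZO:
		/*
		 * The LZO format depends on the sector size. 64K is the
		 * maximum sector size that we support.
		 */
		if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
			return -EINVAL;
		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
		       (fs_info->sectorsize_bits - 12);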

I'll clean up the other comments.

> > +		 * sectorsize (and thus page size) that we support.
> > +		 */
> > +		if (PAGE_SIZE < SZ_4K || PAGE_SIZE > SZ_64K)
> > +			return -EINVAL;
> > +		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + (PAGE_SHIFT - 12);
> > +	case BTRFS_COMPRESS_ZSTD:
> > +		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
> > +	default:
> > +		return -EUCLEAN;
> > +	}
> > +}
> > +
> > +
> > +		inline_size = btrfs_file_extent_inline_item_len(leaf,
> > +								path->slots[0]);
> > +		if (inline_size > count) {
> > +			ret = -ENOBUFS;
> > +			goto out;
> > +		}
> > +		count = inline_size;
> > +		encoded->unencoded_len = ram_bytes;
> > +		encoded->unencoded_offset = iocb->ki_pos - extent_start;
> > +	} else {
> > +		encoded->len = encoded->unencoded_len = count =
> > +			min_t(u64, count, encoded->len);
> 
> Please don't do chained initializations.
> 
> > +		ptr += iocb->ki_pos - extent_start;
> > +	}
> > +
> > +	tmp = kmalloc(count, GFP_NOFS);
> > +	if (!tmp) {
> > +		ret = -ENOMEM;
> > +		goto out;
> > +	}
> > +	read_extent_buffer(leaf, tmp, ptr, count);
> > +	btrfs_release_path(path);
> > +	unlock_extent_cached(io_tree, start, lockend, cached_state);
> > +	inode_unlock_shared(inode);
> > +	*unlocked = true;
> > +
> > +	ret = copy_to_iter(tmp, count, iter);
> > +	if (ret != count)
> > +		ret = -EFAULT;
> > +	kfree(tmp);
> > +out:
> > +	btrfs_free_path(path);
> > +	return ret;
> > +}
> > +
> > +struct btrfs_encoded_read_private {
> > +	struct inode *inode;
> > +	u64 file_offset;
> > +	wait_queue_head_t wait;
> > +	atomic_t pending;
> > +	blk_status_t status;
> > +	bool skip_csum;
> > +};
> > +
> > +static blk_status_t submit_encoded_read_bio(struct inode *inode,
> > +					    struct bio *bio, int mirror_num,
> > +					    unsigned long bio_flags)
> > +{
> > +	struct btrfs_encoded_read_private *priv = bio->bi_private;
> > +	struct btrfs_bio *bbio = btrfs_bio(bio);
> > +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> > +	blk_status_t ret;
> > +
> > +	if (!priv->skip_csum) {
> > +		ret = btrfs_lookup_bio_sums(inode, bio, NULL);
> > +		if (ret)
> > +			return ret;
> > +	}
> > +
> > +	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
> > +	if (ret) {
> > +		btrfs_bio_free_csum(bbio);
> > +		return ret;
> > +	}
> > +
> > +	atomic_inc(&priv->pending);
> > +	ret = btrfs_map_bio(fs_info, bio, mirror_num);
> > +	if (ret) {
> > +		atomic_dec(&priv->pending);
> > +		btrfs_bio_free_csum(bbio);
> > +	}
> > +	return ret;
> > +}
> > +
> > +static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
> > +{
> > +	const bool uptodate = bbio->bio.bi_status == BLK_STS_OK;
> > +	struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
> > +	struct inode *inode = priv->inode;
> > +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> > +	u32 sectorsize = fs_info->sectorsize;
> > +	struct bio_vec *bvec;
> > +	struct bvec_iter_all iter_all;
> > +	u64 start = priv->file_offset;
> > +	u32 bio_offset = 0;
> > +
> > +	if (priv->skip_csum || !uptodate)
> > +		return bbio->bio.bi_status;
> > +
> > +	bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
> > +		unsigned int i, nr_sectors, pgoff;
> > +
> > +		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
> > +		pgoff = bvec->bv_offset;
> > +		for (i = 0; i < nr_sectors; i++) {
> > +			ASSERT(pgoff < PAGE_SIZE);
> > +			if (check_data_csum(inode, bbio, bio_offset,
> > +					    bvec->bv_page, pgoff, start))
> > +				return BLK_STS_IOERR;
> > +			start += sectorsize;
> > +			bio_offset += sectorsize;
> > +			pgoff += sectorsize;
> > +		}
> > +	}
> > +	return BLK_STS_OK;
> > +}
> > +
> > +static void btrfs_encoded_read_endio(struct bio *bio)
> > +{
> > +	struct btrfs_encoded_read_private *priv = bio->bi_private;
> > +	struct btrfs_bio *bbio = btrfs_bio(bio);
> > +	blk_status_t status;
> > +
> > +	status = btrfs_encoded_read_verify_csum(bbio);
> > +	if (status) {
> > +		/*
> > +		 * The memory barrier implied by the atomic_dec_return() here
> > +		 * pairs with the memory barrier implied by the
> > +		 * atomic_dec_return() or io_wait_event() in
> > +		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
> > +		 * write is observed before the load of status in
> > +		 * btrfs_encoded_read_regular_fill_pages().
> > +		 */
> > +		WRITE_ONCE(priv->status, status);
> 
> The WRITE_ONCE is here 3 times, I wonder if this is ok to be opencoded
> like that. I'd suggest to use a helper with a comment.
> 
> > +	}
> > +	if (!atomic_dec_return(&priv->pending))
> > +		wake_up(&priv->wait);
> > +	btrfs_bio_free_csum(bbio);
> > +	bio_put(bio);
> > +}
> > +
> > +static int btrfs_encoded_read_regular_fill_pages(struct inode *inode,
> > +						 u64 file_offset,
> > +						 u64 disk_bytenr,
> > +						 u64 disk_io_size,
> > +						 struct page **pages)
> > +{
> > +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> > +	struct btrfs_encoded_read_private priv = {
> > +		.inode = inode,
> > +		.file_offset = file_offset,
> > +		.pending = ATOMIC_INIT(1),
> > +		.skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM,
> > +	};
> > +	unsigned long i = 0;
> > +	u64 cur = 0;
> > +	int ret;
> > +
> > +	init_waitqueue_head(&priv.wait);
> > +	/*
> > +	 * Submit bios for the extent, splitting due to bio or stripe limits as
> > +	 * necessary.
> > +	 */
> > +	while (cur < disk_io_size) {
> > +		struct extent_map *em;
> > +		struct btrfs_io_geometry geom;
> > +		struct bio *bio = NULL;
> > +		u64 remaining;
> > +
> > +		em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
> > +					 disk_io_size - cur);
> > +		if (IS_ERR(em)) {
> > +			ret = PTR_ERR(em);
> > +		} else {
> > +			ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
> > +						    disk_bytenr + cur, &geom);
> > +			free_extent_map(em);
> > +		}
> > +		if (ret) {
> > +			WRITE_ONCE(priv.status, errno_to_blk_status(ret));
> > +			break;
> > +		}
> > +		remaining = min(geom.len, disk_io_size - cur);
> > +		while (bio || remaining) {
> > +			size_t bytes = min_t(u64, remaining, PAGE_SIZE);
> > +
> > +			if (!bio) {
> > +				bio = btrfs_bio_alloc(BIO_MAX_VECS);
> > +				bio->bi_iter.bi_sector =
> > +					(disk_bytenr + cur) >> SECTOR_SHIFT;
> > +				bio->bi_end_io = btrfs_encoded_read_endio;
> > +				bio->bi_private = &priv;
> > +				bio->bi_opf = REQ_OP_READ;
> > +			}
> > +
> > +			if (!bytes ||
> > +			    bio_add_page(bio, pages[i], bytes, 0) < bytes) {
> > +				blk_status_t status;
> > +
> > +				status = submit_encoded_read_bio(inode, bio, 0,
> > +								 0);
> > +				if (status) {
> > +					WRITE_ONCE(priv.status, status);
> > +					bio_put(bio);
> > +					goto out;
> > +				}
> > +				bio = NULL;
> > +				continue;
> > +			}
> > +
> > +			i++;
> > +			cur += bytes;
> > +			remaining -= bytes;
> > +		}
> > +	}
> > +
> > +out:
> > +	if (atomic_dec_return(&priv.pending))
> > +		io_wait_event(priv.wait, !atomic_read(&priv.pending));
> > +	/* See btrfs_encoded_read_endio() for ordering. */
> > +	return blk_status_to_errno(READ_ONCE(priv.status));
> > +}
> > +
> > +static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
> > +					  struct iov_iter *iter,
> > +					  u64 start, u64 lockend,
> > +					  struct extent_state **cached_state,
> > +					  u64 disk_bytenr, u64 disk_io_size,
> > +					  size_t count, bool compressed,
> > +					  bool *unlocked)
> > +{
> > +	struct inode *inode = file_inode(iocb->ki_filp);
> > +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> > +	struct page **pages;
> > +	unsigned long nr_pages, i;
> > +	u64 cur;
> > +	size_t page_offset;
> > +	ssize_t ret;
> > +
> > +	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
> 
> Power of two compile time constants can use the bitmask operations for
> alignment.
> 
> > +	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> > +	if (!pages)
> > +		return -ENOMEM;
> > +	for (i = 0; i < nr_pages; i++) {
> > +		pages[i] = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
> > +		if (!pages[i]) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +	}
> > +
> > +	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
> > +						    disk_io_size, pages);
> > +	if (ret)
> > +		goto out;
> > +
> > +	unlock_extent_cached(io_tree, start, lockend, cached_state);
> > +	inode_unlock_shared(inode);
> > +	*unlocked = true;
> > +
> > +	if (compressed) {
> > +		i = 0;
> > +		page_offset = 0;
> > +	} else {
> > +		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
> > +		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
> > +	}
> > +	cur = 0;
> > +	while (cur < count) {
> > +		size_t bytes = min_t(size_t, count - cur,
> > +				     PAGE_SIZE - page_offset);
> > +
> > +		if (copy_page_to_iter(pages[i], page_offset, bytes,
> > +				      iter) != bytes) {
> > +			ret = -EFAULT;
> > +			goto out;
> > +		}
> > +		i++;
> > +		cur += bytes;
> > +		page_offset = 0;
> > +	}
> > +	ret = count;
> > +out:
> > +	for (i = 0; i < nr_pages; i++) {
> > +		if (pages[i])
> > +			__free_page(pages[i]);
> > +	}
> > +	kfree(pages);
> > +	return ret;
> > +}
> > +
> > +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
> > +			   struct btrfs_ioctl_encoded_io_args *encoded)
> > +{
> > +	struct inode *inode = file_inode(iocb->ki_filp);
> > +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> > +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> > +	ssize_t ret;
> > +	size_t count = iov_iter_count(iter);
> > +	u64 start, lockend, disk_bytenr, disk_io_size;
> > +	struct extent_state *cached_state = NULL;
> > +	struct extent_map *em;
> > +	bool unlocked = false;
> > +
> > +	file_accessed(iocb->ki_filp);
> > +
> > +	inode_lock_shared(inode);
> > +
> > +	if (iocb->ki_pos >= inode->i_size) {
> > +		inode_unlock_shared(inode);
> > +		return 0;
> > +	}
> > +	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
> > +	/*
> > +	 * We don't know how long the extent containing iocb->ki_pos is, but if
> > +	 * it's compressed we know that it won't be longer than this.
> > +	 */
> > +	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
> > +
> > +	for (;;) {
> > +		struct btrfs_ordered_extent *ordered;
> > +
> > +		ret = btrfs_wait_ordered_range(inode, start,
> > +					       lockend - start + 1);
> > +		if (ret)
> > +			goto out_unlock_inode;
> > +		lock_extent_bits(io_tree, start, lockend, &cached_state);
> > +		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
> > +						     lockend - start + 1);
> > +		if (!ordered)
> > +			break;
> > +		btrfs_put_ordered_extent(ordered);
> > +		unlock_extent_cached(io_tree, start, lockend, &cached_state);
> > +		cond_resched();
> > +	}
> > +
> > +	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start,
> > +			      lockend - start + 1);
> > +	if (IS_ERR(em)) {
> > +		ret = PTR_ERR(em);
> > +		goto out_unlock_extent;
> > +	}
> > +
> > +	if (em->block_start == EXTENT_MAP_INLINE) {
> > +		u64 extent_start = em->start;
> > +
> > +		/*
> > +		 * For inline extents we get everything we need out of the
> > +		 * extent item.
> > +		 */
> > +		free_extent_map(em);
> > +		em = NULL;
> > +		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
> > +						&cached_state, extent_start,
> > +						count, encoded, &unlocked);
> > +		goto out;
> > +	}
> > +
> > +	/*
> > +	 * We only want to return up to EOF even if the extent extends beyond
> > +	 * that.
> > +	 */
> > +	encoded->len = (min_t(u64, extent_map_end(em), inode->i_size) -
> > +			iocb->ki_pos);
> > +	if (em->block_start == EXTENT_MAP_HOLE ||
> > +	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> > +		disk_bytenr = EXTENT_MAP_HOLE;
> > +		encoded->len = encoded->unencoded_len = count =
> > +			min_t(u64, count, encoded->len);
> > +	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
> > +		disk_bytenr = em->block_start;
> > +		/*
> > +		 * Bail if the buffer isn't large enough to return the whole
> > +		 * compressed extent.
> > +		 */
> > +		if (em->block_len > count) {
> > +			ret = -ENOBUFS;
> > +			goto out_em;
> > +		}
> > +		disk_io_size = count = em->block_len;
> > +		encoded->unencoded_len = em->ram_bytes;
> > +		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
> > +		ret = btrfs_encoded_io_compression_from_extent(
> > +							     em->compress_type);
> > +		if (ret < 0)
> > +			goto out_em;
> > +		encoded->compression = ret;
> > +	} else {
> > +		disk_bytenr = em->block_start + (start - em->start);
> > +		if (encoded->len > count)
> > +			encoded->len = count;
> > +		/*
> > +		 * Don't read beyond what we locked. This also limits the page
> > +		 * allocations that we'll do.
> > +		 */
> > +		disk_io_size = min(lockend + 1,
> > +				   iocb->ki_pos + encoded->len) - start;
> > +		encoded->len = encoded->unencoded_len = count =
> > +			start + disk_io_size - iocb->ki_pos;
> > +		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
> > +	}
> > +	free_extent_map(em);
> > +	em = NULL;
> > +
> > +	if (disk_bytenr == EXTENT_MAP_HOLE) {
> > +		unlock_extent_cached(io_tree, start, lockend, &cached_state);
> > +		inode_unlock_shared(inode);
> > +		unlocked = true;
> > +		ret = iov_iter_zero(count, iter);
> > +		if (ret != count)
> > +			ret = -EFAULT;
> > +	} else {
> > +		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
> > +						 &cached_state, disk_bytenr,
> > +						 disk_io_size, count,
> > +						 encoded->compression,
> > +						 &unlocked);
> > +	}
> > +
> > +out:
> > +	if (ret >= 0)
> > +		iocb->ki_pos += encoded->len;
> > +out_em:
> > +	free_extent_map(em);
> > +out_unlock_extent:
> > +	if (!unlocked)
> > +		unlock_extent_cached(io_tree, start, lockend, &cached_state);
> > +out_unlock_inode:
> > +	if (!unlocked)
> > +		inode_unlock_shared(inode);
> > +	return ret;
> > +}
> > +
> >  #ifdef CONFIG_SWAP
> >  /*
> >   * Add an entry indicating a block group or device which is pinned by a
> > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> > index 05c77a1979a9..f0c575223d88 100644
> > --- a/fs/btrfs/ioctl.c
> > +++ b/fs/btrfs/ioctl.c
> > @@ -28,6 +28,7 @@
> >  #include <linux/iversion.h>
> >  #include <linux/fileattr.h>
> >  #include <linux/fsverity.h>
> > +#include <linux/sched/xacct.h>
> >  #include "ctree.h"
> >  #include "disk-io.h"
> >  #include "export.h"
> > @@ -88,6 +89,22 @@ struct btrfs_ioctl_send_args_32 {
> >  
> >  #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
> >  			       struct btrfs_ioctl_send_args_32)
> > +
> > +struct btrfs_ioctl_encoded_io_args_32 {
> > +	compat_uptr_t iov;
> > +	compat_ulong_t iovcnt;
> > +	__s64 offset;
> > +	__u64 flags;
> > +	__u64 len;
> > +	__u64 unencoded_len;
> > +	__u64 unencoded_offset;
> > +	__u32 compression;
> > +	__u32 encryption;
> > +	__u32 reserved[8];
> > +};
> > +
> > +#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
> > +				       struct btrfs_ioctl_encoded_io_args_32)
> >  #endif
> >  
> >  /* Mask out flags that are inappropriate for the given type of inode. */
> > @@ -4861,6 +4878,89 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
> >  	return ret;
> >  }
> >  
> > +static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
> > +				    bool compat)
> > +{
> > +	struct btrfs_ioctl_encoded_io_args args = {};
> > +	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
> > +					     flags);
> > +	size_t copy_end;
> > +	struct iovec iovstack[UIO_FASTIOV];
> > +	struct iovec *iov = iovstack;
> > +	struct iov_iter iter;
> > +	loff_t pos;
> > +	struct kiocb kiocb;
> > +	ssize_t ret;
> > +
> > +	if (!capable(CAP_SYS_ADMIN)) {
> > +		ret = -EPERM;
> > +		goto out_acct;
> > +	}
> > +
> > +	if (compat) {
> > +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
> > +		struct btrfs_ioctl_encoded_io_args_32 args32;
> > +
> > +		copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
> > +				       flags);
> > +		if (copy_from_user(&args32, argp, copy_end)) {
> > +			ret = -EFAULT;
> > +			goto out_acct;
> > +		}
> > +		args.iov = compat_ptr(args32.iov);
> > +		args.iovcnt = args32.iovcnt;
> > +		args.offset = args32.offset;
> > +		args.flags = args32.flags;
> > +#else
> > +		return -ENOTTY;
> > +#endif
> > +	} else {
> > +		copy_end = copy_end_kernel;
> > +		if (copy_from_user(&args, argp, copy_end)) {
> > +			ret = -EFAULT;
> > +			goto out_acct;
> > +		}
> > +	}
> > +	if (args.flags != 0) {
> > +		ret = -EINVAL;
> > +		goto out_acct;
> > +	}
> > +
> > +	ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
> > +			   &iov, &iter);
> > +	if (ret < 0)
> > +		goto out_acct;
> > +
> > +	if (iov_iter_count(&iter) == 0) {
> > +		ret = 0;
> > +		goto out_iov;
> > +	}
> > +	pos = args.offset;
> > +	ret = rw_verify_area(READ, file, &pos, args.len);
> > +	if (ret < 0)
> > +		goto out_iov;
> > +
> > +	init_sync_kiocb(&kiocb, file);
> > +	kiocb.ki_pos = pos;
> > +
> > +	ret = btrfs_encoded_read(&kiocb, &iter, &args);
> > +	if (ret >= 0) {
> > +		fsnotify_access(file);
> > +		if (copy_to_user(argp + copy_end,
> > +				 (char *)&args + copy_end_kernel,
> > +				 sizeof(args) - copy_end_kernel))
> > +			ret = -EFAULT;
> > +	}
> > +
> > +out_iov:
> > +	kfree(iov);
> > +out_acct:
> > +	if (ret > 0)
> > +		add_rchar(current, ret);
> > +	inc_syscr(current);
> > +	return ret;
> > +}
> > +
> >  long btrfs_ioctl(struct file *file, unsigned int
> >  		cmd, unsigned long arg)
> >  {
> > @@ -5005,6 +5105,12 @@ long btrfs_ioctl(struct file *file, unsigned int
> >  		return fsverity_ioctl_enable(file, (const void __user *)argp);
> >  	case FS_IOC_MEASURE_VERITY:
> >  		return fsverity_ioctl_measure(file, argp);
> > +	case BTRFS_IOC_ENCODED_READ:
> > +		return btrfs_ioctl_encoded_read(file, argp, false);
> > +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
> > +	case BTRFS_IOC_ENCODED_READ_32:
> > +		return btrfs_ioctl_encoded_read(file, argp, true);
> > +#endif
> >  	}
> >  
> >  	return -ENOTTY;
> > -- 
> > 2.34.0
David Sterba Jan. 24, 2022, 9:54 p.m. UTC | #3
On Wed, Nov 17, 2021 at 12:19:19PM -0800, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>

> - We don't do read repair, because it turns out that read repair is
>   currently broken for compressed data.

Is there a reproducer, and a fix?
David Sterba Jan. 24, 2022, 10:26 p.m. UTC | #4
On Wed, Nov 17, 2021 at 12:19:19PM -0800, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> There are 4 main cases:
> 
> 1. Inline extents: we copy the data straight out of the extent buffer.
> 2. Hole/preallocated extents: we fill in zeroes.
> 3. Regular, uncompressed extents: we read the sectors we need directly
>    from disk.
> 4. Regular, compressed extents: we read the entire compressed extent
>    from disk and indicate what subset of the decompressed extent is in
>    the file.
> 
> This initial implementation simplifies a few things that can be improved
> in the future:
> 
> - We hold the inode lock during the operation.
> - Cases 1, 3, and 4 allocate temporary memory to read into before
>   copying out to userspace.
> - We don't do read repair, because it turns out that read repair is
>   currently broken for compressed data.
> 
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  fs/btrfs/ctree.h |   4 +
>  fs/btrfs/inode.c | 496 +++++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/ioctl.c | 106 ++++++++++
>  3 files changed, 606 insertions(+)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 2e7f74060a14..70034e33abe6 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3275,6 +3275,10 @@ int btrfs_writepage_cow_fixup(struct page *page);
>  void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
>  					  struct page *page, u64 start,
>  					  u64 end, bool uptodate);
> +struct btrfs_ioctl_encoded_io_args;
> +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
> +			   struct btrfs_ioctl_encoded_io_args *encoded);
> +
>  extern const struct dentry_operations btrfs_dentry_operations;
>  extern const struct iomap_ops btrfs_dio_iomap_ops;
>  extern const struct iomap_dio_ops btrfs_dio_ops;
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index c2efea101f61..d29e968fd18b 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -10525,6 +10525,502 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
>  	}
>  }
>  
> +static int btrfs_encoded_io_compression_from_extent(int compress_type)
> +{
> +	switch (compress_type) {
> +	case BTRFS_COMPRESS_NONE:
> +		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
> +	case BTRFS_COMPRESS_ZLIB:
> +		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
> +	case BTRFS_COMPRESS_LZO:
> +		/*
> +		 * The LZO format depends on the page size. 64k is the maximum
> +		 * sectorsize (and thus page size) that we support.
> +		 */
> +		if (PAGE_SIZE < SZ_4K || PAGE_SIZE > SZ_64K)
> +			return -EINVAL;
> +		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + (PAGE_SHIFT - 12);
> +	case BTRFS_COMPRESS_ZSTD:
> +		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
> +	default:
> +		return -EUCLEAN;
> +	}
> +}
> +
> +static ssize_t btrfs_encoded_read_inline(
> +				struct kiocb *iocb,
> +				struct iov_iter *iter, u64 start,
> +				u64 lockend,
> +				struct extent_state **cached_state,
> +				u64 extent_start, size_t count,
> +				struct btrfs_ioctl_encoded_io_args *encoded,
> +				bool *unlocked)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);

Please use btrfs_inode in all internal helpers, either as parameters or
as local variable, to avoid the BTRFS_I and btrfs_sb conversions everywhere.
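E.g. at the top of each helper, something like the following avoids the
repeated BTRFS_I()/btrfs_sb() conversions (sketch):

	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;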

> +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> +	struct btrfs_path *path;
> +	struct extent_buffer *leaf;
> +	struct btrfs_file_extent_item *item;
> +	u64 ram_bytes;
> +	unsigned long ptr;
> +	void *tmp;
> +	ssize_t ret;
> +
> +	path = btrfs_alloc_path();
> +	if (!path) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, path,
> +				       btrfs_ino(BTRFS_I(inode)), extent_start,
> +				       0);
> +	if (ret) {
> +		if (ret > 0) {
> +			/* The extent item disappeared? */
> +			ret = -EIO;
> +		}
> +		goto out;
> +	}
> +	leaf = path->nodes[0];
> +	item = btrfs_item_ptr(leaf, path->slots[0],
> +			      struct btrfs_file_extent_item);
> +
> +	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
> +	ptr = btrfs_file_extent_inline_start(item);
> +
> +	encoded->len = (min_t(u64, extent_start + ram_bytes, inode->i_size) -
> +			iocb->ki_pos);

No need for the outer ( )

> +	ret = btrfs_encoded_io_compression_from_extent(
> +				 btrfs_file_extent_compression(leaf, item));
> +	if (ret < 0)
> +		goto out;
> +	encoded->compression = ret;
> +	if (encoded->compression) {
> +		size_t inline_size;
> +
> +		inline_size = btrfs_file_extent_inline_item_len(leaf,
> +								path->slots[0]);
> +		if (inline_size > count) {
> +			ret = -ENOBUFS;
> +			goto out;
> +		}
> +		count = inline_size;
> +		encoded->unencoded_len = ram_bytes;
> +		encoded->unencoded_offset = iocb->ki_pos - extent_start;
> +	} else {
> +		encoded->len = encoded->unencoded_len = count =
> +			min_t(u64, count, encoded->len);

I'm sure I have commented on that in the past, please don't use chained
initializations. In this case something like:

		count = min_t(u64, count, encoded->len);
		encoded->len = count;
		encoded->unencoded_len = count;

> +		ptr += iocb->ki_pos - extent_start;
> +	}
> +
> +	tmp = kmalloc(count, GFP_NOFS);
> +	if (!tmp) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	read_extent_buffer(leaf, tmp, ptr, count);
> +	btrfs_release_path(path);
> +	unlock_extent_cached(io_tree, start, lockend, cached_state);
> +	inode_unlock_shared(inode);
> +	*unlocked = true;
> +
> +	ret = copy_to_iter(tmp, count, iter);
> +	if (ret != count)
> +		ret = -EFAULT;
> +	kfree(tmp);
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +struct btrfs_encoded_read_private {
> +	struct inode *inode;

This should also be btrfs_inode

> +	u64 file_offset;
> +	wait_queue_head_t wait;
> +	atomic_t pending;
> +	blk_status_t status;
> +	bool skip_csum;
> +};
> +
> +static blk_status_t submit_encoded_read_bio(struct inode *inode,

struct btrfs_inode

> +					    struct bio *bio, int mirror_num,
> +					    unsigned long bio_flags)

bio_flags is unused here (and in the encoded patches afaics)

> +{
> +	struct btrfs_encoded_read_private *priv = bio->bi_private;
> +	struct btrfs_bio *bbio = btrfs_bio(bio);
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	blk_status_t ret;
> +
> +	if (!priv->skip_csum) {
> +		ret = btrfs_lookup_bio_sums(inode, bio, NULL);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
> +	if (ret) {
> +		btrfs_bio_free_csum(bbio);
> +		return ret;
> +	}
> +
> +	atomic_inc(&priv->pending);
> +	ret = btrfs_map_bio(fs_info, bio, mirror_num);
> +	if (ret) {
> +		atomic_dec(&priv->pending);
> +		btrfs_bio_free_csum(bbio);
> +	}
> +	return ret;
> +}
> +
> +static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
> +{
> +	const bool uptodate = bbio->bio.bi_status == BLK_STS_OK;

	const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);

> +	struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
> +	struct inode *inode = priv->inode;
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	u32 sectorsize = fs_info->sectorsize;
> +	struct bio_vec *bvec;
> +	struct bvec_iter_all iter_all;
> +	u64 start = priv->file_offset;
> +	u32 bio_offset = 0;
> +
> +	if (priv->skip_csum || !uptodate)
> +		return bbio->bio.bi_status;
> +
> +	bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
> +		unsigned int i, nr_sectors, pgoff;
> +
> +		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
> +		pgoff = bvec->bv_offset;
> +		for (i = 0; i < nr_sectors; i++) {
> +			ASSERT(pgoff < PAGE_SIZE);
> +			if (check_data_csum(inode, bbio, bio_offset,
> +					    bvec->bv_page, pgoff, start))
> +				return BLK_STS_IOERR;
> +			start += sectorsize;
> +			bio_offset += sectorsize;
> +			pgoff += sectorsize;
> +		}
> +	}
> +	return BLK_STS_OK;
> +}
> +
> +static void btrfs_encoded_read_endio(struct bio *bio)
> +{
> +	struct btrfs_encoded_read_private *priv = bio->bi_private;
> +	struct btrfs_bio *bbio = btrfs_bio(bio);
> +	blk_status_t status;
> +
> +	status = btrfs_encoded_read_verify_csum(bbio);
> +	if (status) {
> +		/*
> +		 * The memory barrier implied by the atomic_dec_return() here
> +		 * pairs with the memory barrier implied by the
> +		 * atomic_dec_return() or io_wait_event() in
> +		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
> +		 * write is observed before the load of status in
> +		 * btrfs_encoded_read_regular_fill_pages().
> +		 */
> +		WRITE_ONCE(priv->status, status);
> +	}
> +	if (!atomic_dec_return(&priv->pending))
> +		wake_up(&priv->wait);
> +	btrfs_bio_free_csum(bbio);
> +	bio_put(bio);
> +}
> +
> +static int btrfs_encoded_read_regular_fill_pages(struct inode *inode,
> +						 u64 file_offset,
> +						 u64 disk_bytenr,
> +						 u64 disk_io_size,
> +						 struct page **pages)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	struct btrfs_encoded_read_private priv = {
> +		.inode = inode,
> +		.file_offset = file_offset,
> +		.pending = ATOMIC_INIT(1),
> +		.skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM,
> +	};
> +	unsigned long i = 0;
> +	u64 cur = 0;
> +	int ret;
> +
> +	init_waitqueue_head(&priv.wait);
> +	/*
> +	 * Submit bios for the extent, splitting due to bio or stripe limits as
> +	 * necessary.
> +	 */
> +	while (cur < disk_io_size) {
> +		struct extent_map *em;
> +		struct btrfs_io_geometry geom;
> +		struct bio *bio = NULL;
> +		u64 remaining;
> +
> +		em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
> +					 disk_io_size - cur);
> +		if (IS_ERR(em)) {
> +			ret = PTR_ERR(em);
> +		} else {
> +			ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
> +						    disk_bytenr + cur, &geom);
> +			free_extent_map(em);
> +		}
> +		if (ret) {
> +			WRITE_ONCE(priv.status, errno_to_blk_status(ret));
> +			break;
> +		}
> +		remaining = min(geom.len, disk_io_size - cur);
> +		while (bio || remaining) {
> +			size_t bytes = min_t(u64, remaining, PAGE_SIZE);
> +
> +			if (!bio) {
> +				bio = btrfs_bio_alloc(BIO_MAX_VECS);
> +				bio->bi_iter.bi_sector =
> +					(disk_bytenr + cur) >> SECTOR_SHIFT;
> +				bio->bi_end_io = btrfs_encoded_read_endio;
> +				bio->bi_private = &priv;
> +				bio->bi_opf = REQ_OP_READ;
> +			}
> +
> +			if (!bytes ||
> +			    bio_add_page(bio, pages[i], bytes, 0) < bytes) {
> +				blk_status_t status;
> +
> +				status = submit_encoded_read_bio(inode, bio, 0,
> +								 0);
> +				if (status) {
> +					WRITE_ONCE(priv.status, status);
> +					bio_put(bio);
> +					goto out;
> +				}
> +				bio = NULL;
> +				continue;
> +			}
> +
> +			i++;
> +			cur += bytes;
> +			remaining -= bytes;
> +		}
> +	}
> +
> +out:
> +	if (atomic_dec_return(&priv.pending))
> +		io_wait_event(priv.wait, !atomic_read(&priv.pending));
> +	/* See btrfs_encoded_read_endio() for ordering. */
> +	return blk_status_to_errno(READ_ONCE(priv.status));
> +}
> +
> +static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
> +					  struct iov_iter *iter,
> +					  u64 start, u64 lockend,
> +					  struct extent_state **cached_state,
> +					  u64 disk_bytenr, u64 disk_io_size,
> +					  size_t count, bool compressed,
> +					  bool *unlocked)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> +	struct page **pages;
> +	unsigned long nr_pages, i;
> +	u64 cur;
> +	size_t page_offset;
> +	ssize_t ret;
> +
> +	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
> +	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> +	if (!pages)
> +		return -ENOMEM;
> +	for (i = 0; i < nr_pages; i++) {
> +		pages[i] = alloc_page(GFP_NOFS | __GFP_HIGHMEM);

No HIGHMEM please.

> +		if (!pages[i]) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +	}
> +
> +	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
> +						    disk_io_size, pages);
> +	if (ret)
> +		goto out;
> +
> +	unlock_extent_cached(io_tree, start, lockend, cached_state);
> +	inode_unlock_shared(inode);
> +	*unlocked = true;
> +
> +	if (compressed) {
> +		i = 0;
> +		page_offset = 0;
> +	} else {
> +		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
> +		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
> +	}
> +	cur = 0;
> +	while (cur < count) {
> +		size_t bytes = min_t(size_t, count - cur,
> +				     PAGE_SIZE - page_offset);
> +
> +		if (copy_page_to_iter(pages[i], page_offset, bytes,
> +				      iter) != bytes) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +		i++;
> +		cur += bytes;
> +		page_offset = 0;
> +	}
> +	ret = count;
> +out:
> +	for (i = 0; i < nr_pages; i++) {
> +		if (pages[i])
> +			__free_page(pages[i]);
> +	}
> +	kfree(pages);
> +	return ret;
> +}
> +
> +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
> +			   struct btrfs_ioctl_encoded_io_args *encoded)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> +	ssize_t ret;
> +	size_t count = iov_iter_count(iter);
> +	u64 start, lockend, disk_bytenr, disk_io_size;
> +	struct extent_state *cached_state = NULL;
> +	struct extent_map *em;
> +	bool unlocked = false;
> +
> +	file_accessed(iocb->ki_filp);
> +
> +	inode_lock_shared(inode);

We have helpers for inode locking now, btrfs_inode_lock, that take an
additional parameter for cases where we want to exclude certain
locking combinations.
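I.e. (sketch):

	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
	...
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);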

Which also brings up the question of whether encoded read/write should be
excluded against some other operations, like e.g. deduplication has to do
against mmap.

> +
> +	if (iocb->ki_pos >= inode->i_size) {
> +		inode_unlock_shared(inode);
> +		return 0;
> +	}
> +	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
> +	/*
> +	 * We don't know how long the extent containing iocb->ki_pos is, but if
> +	 * it's compressed we know that it won't be longer than this.
> +	 */
> +	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
> +
> +	for (;;) {
> +		struct btrfs_ordered_extent *ordered;
> +
> +		ret = btrfs_wait_ordered_range(inode, start,
> +					       lockend - start + 1);
> +		if (ret)
> +			goto out_unlock_inode;
> +		lock_extent_bits(io_tree, start, lockend, &cached_state);
> +		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
> +						     lockend - start + 1);
> +		if (!ordered)
> +			break;
> +		btrfs_put_ordered_extent(ordered);
> +		unlock_extent_cached(io_tree, start, lockend, &cached_state);
> +		cond_resched();
> +	}
> +
> +	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start,
> +			      lockend - start + 1);
> +	if (IS_ERR(em)) {
> +		ret = PTR_ERR(em);
> +		goto out_unlock_extent;
> +	}
> +
> +	if (em->block_start == EXTENT_MAP_INLINE) {
> +		u64 extent_start = em->start;
> +
> +		/*
> +		 * For inline extents we get everything we need out of the
> +		 * extent item.
> +		 */
> +		free_extent_map(em);
> +		em = NULL;
> +		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
> +						&cached_state, extent_start,
> +						count, encoded, &unlocked);
> +		goto out;
> +	}
> +
> +	/*
> +	 * We only want to return up to EOF even if the extent extends beyond
> +	 * that.
> +	 */
> +	encoded->len = (min_t(u64, extent_map_end(em), inode->i_size) -
> +			iocb->ki_pos);

no outer ( )

> +	if (em->block_start == EXTENT_MAP_HOLE ||
> +	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +		disk_bytenr = EXTENT_MAP_HOLE;
> +		encoded->len = encoded->unencoded_len = count =
> +			min_t(u64, count, encoded->len);

No chained initializations

> +	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
> +		disk_bytenr = em->block_start;
> +		/*
> +		 * Bail if the buffer isn't large enough to return the whole
> +		 * compressed extent.
> +		 */
> +		if (em->block_len > count) {
> +			ret = -ENOBUFS;
> +			goto out_em;
> +		}
> +		disk_io_size = count = em->block_len;
> +		encoded->unencoded_len = em->ram_bytes;
> +		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
> +		ret = btrfs_encoded_io_compression_from_extent(
> +							     em->compress_type);
> +		if (ret < 0)
> +			goto out_em;
> +		encoded->compression = ret;
> +	} else {
> +		disk_bytenr = em->block_start + (start - em->start);
> +		if (encoded->len > count)
> +			encoded->len = count;
> +		/*
> +		 * Don't read beyond what we locked. This also limits the page
> +		 * allocations that we'll do.
> +		 */
> +		disk_io_size = min(lockend + 1,
> +				   iocb->ki_pos + encoded->len) - start;
> +		encoded->len = encoded->unencoded_len = count =
> +			start + disk_io_size - iocb->ki_pos;
> +		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
> +	}
> +	free_extent_map(em);
> +	em = NULL;
> +
> +	if (disk_bytenr == EXTENT_MAP_HOLE) {
> +		unlock_extent_cached(io_tree, start, lockend, &cached_state);
> +		inode_unlock_shared(inode);
> +		unlocked = true;
> +		ret = iov_iter_zero(count, iter);
> +		if (ret != count)
> +			ret = -EFAULT;
> +	} else {
> +		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
> +						 &cached_state, disk_bytenr,
> +						 disk_io_size, count,
> +						 encoded->compression,
> +						 &unlocked);
> +	}
> +
> +out:
> +	if (ret >= 0)
> +		iocb->ki_pos += encoded->len;
> +out_em:
> +	free_extent_map(em);
> +out_unlock_extent:
> +	if (!unlocked)
> +		unlock_extent_cached(io_tree, start, lockend, &cached_state);
> +out_unlock_inode:
> +	if (!unlocked)
> +		inode_unlock_shared(inode);
> +	return ret;
> +}
> +
>  #ifdef CONFIG_SWAP
>  /*
>   * Add an entry indicating a block group or device which is pinned by a
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 05c77a1979a9..f0c575223d88 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -28,6 +28,7 @@
>  #include <linux/iversion.h>
>  #include <linux/fileattr.h>
>  #include <linux/fsverity.h>
> +#include <linux/sched/xacct.h>
>  #include "ctree.h"
>  #include "disk-io.h"
>  #include "export.h"
> @@ -88,6 +89,22 @@ struct btrfs_ioctl_send_args_32 {
>  
>  #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
>  			       struct btrfs_ioctl_send_args_32)
> +
> +struct btrfs_ioctl_encoded_io_args_32 {
> +	compat_uptr_t iov;
> +	compat_ulong_t iovcnt;
> +	__s64 offset;
> +	__u64 flags;
> +	__u64 len;
> +	__u64 unencoded_len;
> +	__u64 unencoded_offset;
> +	__u32 compression;
> +	__u32 encryption;
> +	__u32 reserved[8];
> +};
> +
> +#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
> +				       struct btrfs_ioctl_encoded_io_args_32)
>  #endif
>  
>  /* Mask out flags that are inappropriate for the given type of inode. */
> @@ -4861,6 +4878,89 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
>  	return ret;
>  }
>  
> +static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
> +				    bool compat)
> +{
> +	struct btrfs_ioctl_encoded_io_args args = {};
> +	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
> +					     flags);
> +	size_t copy_end;
> +	struct iovec iovstack[UIO_FASTIOV];
> +	struct iovec *iov = iovstack;
> +	struct iov_iter iter;
> +	loff_t pos;
> +	struct kiocb kiocb;
> +	ssize_t ret;
> +
> +	if (!capable(CAP_SYS_ADMIN)) {
> +		ret = -EPERM;
> +		goto out_acct;
> +	}
> +
> +	if (compat) {
> +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
> +		struct btrfs_ioctl_encoded_io_args_32 args32;
> +
> +		copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
> +				       flags);
> +		if (copy_from_user(&args32, argp, copy_end)) {
> +			ret = -EFAULT;
> +			goto out_acct;
> +		}
> +		args.iov = compat_ptr(args32.iov);
> +		args.iovcnt = args32.iovcnt;
> +		args.offset = args32.offset;
> +		args.flags = args32.flags;
> +#else
> +		return -ENOTTY;
> +#endif
> +	} else {
> +		copy_end = copy_end_kernel;
> +		if (copy_from_user(&args, argp, copy_end)) {
> +			ret = -EFAULT;
> +			goto out_acct;
> +		}
> +	}
> +	if (args.flags != 0) {
> +		ret = -EINVAL;
> +		goto out_acct;
> +	}
> +
> +	ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
> +			   &iov, &iter);
> +	if (ret < 0)
> +		goto out_acct;
> +
> +	if (iov_iter_count(&iter) == 0) {
> +		ret = 0;
> +		goto out_iov;
> +	}
> +	pos = args.offset;
> +	ret = rw_verify_area(READ, file, &pos, args.len);
> +	if (ret < 0)
> +		goto out_iov;
> +
> +	init_sync_kiocb(&kiocb, file);
> +	kiocb.ki_pos = pos;
> +
> +	ret = btrfs_encoded_read(&kiocb, &iter, &args);
> +	if (ret >= 0) {
> +		fsnotify_access(file);
> +		if (copy_to_user(argp + copy_end,
> +				 (char *)&args + copy_end_kernel,
> +				 sizeof(args) - copy_end_kernel))
> +			ret = -EFAULT;
> +	}
> +
> +out_iov:
> +	kfree(iov);
> +out_acct:
> +	if (ret > 0)
> +		add_rchar(current, ret);
> +	inc_syscr(current);
> +	return ret;
> +}
> +
>  long btrfs_ioctl(struct file *file, unsigned int
>  		cmd, unsigned long arg)
>  {
> @@ -5005,6 +5105,12 @@ long btrfs_ioctl(struct file *file, unsigned int
>  		return fsverity_ioctl_enable(file, (const void __user *)argp);
>  	case FS_IOC_MEASURE_VERITY:
>  		return fsverity_ioctl_measure(file, argp);
> +	case BTRFS_IOC_ENCODED_READ:
> +		return btrfs_ioctl_encoded_read(file, argp, false);
> +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
> +	case BTRFS_IOC_ENCODED_READ_32:
> +		return btrfs_ioctl_encoded_read(file, argp, true);
> +#endif
>  	}
>  
>  	return -ENOTTY;
> -- 
> 2.34.0
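For anyone following along, here is a rough userspace sketch of how the new
ioctl is meant to be driven (my own sketch, not part of the patch; it assumes
the uapi struct and BTRFS_IOC_ENCODED_READ definition added earlier in this
series are installed in <linux/btrfs.h>, and the encoded_read() helper name is
made up):

/*
 * Read one encoded extent at the given file offset (the file position is not
 * used, only args.offset). Requires CAP_SYS_ADMIN, args.flags must be 0, and
 * the buffer must be big enough for the whole encoded extent (otherwise the
 * ioctl fails, e.g. with -ENOBUFS for inline extents). Error handling is
 * omitted for brevity.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <linux/btrfs.h>

static long encoded_read(int fd, void *buf, size_t len, off_t offset)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct btrfs_ioctl_encoded_io_args args;
	long ret;

	memset(&args, 0, sizeof(args));
	args.iov = &iov;
	args.iovcnt = 1;
	args.offset = offset;

	ret = ioctl(fd, BTRFS_IOC_ENCODED_READ, &args);
	if (ret >= 0)
		printf("%ld encoded bytes, len %llu, unencoded_len %llu, unencoded_offset %llu, compression %u\n",
		       ret, (unsigned long long)args.len,
		       (unsigned long long)args.unencoded_len,
		       (unsigned long long)args.unencoded_offset,
		       args.compression);
	return ret;
}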
Omar Sandoval Jan. 24, 2022, 10:33 p.m. UTC | #5
On Mon, Jan 24, 2022 at 10:54:35PM +0100, David Sterba wrote:
> On Wed, Nov 17, 2021 at 12:19:19PM -0800, Omar Sandoval wrote:
> > From: Omar Sandoval <osandov@fb.com>
> 
> > - We don't do read repair, because it turns out that read repair is
> >   currently broken for compressed data.
> 
> Is there a reproducer, and a fix?

Reproducer:

# Create a filesystem with data duplicated at 2 physical locations on the same
# disk.
$ mkfs.btrfs -f -d dup /dev/vdb
# Mount it with compression enabled.
$ mount -o compress /dev/vdb /mnt
# Write some compressible data (octal dump of random data).
$ dd if=/dev/urandom bs=4k count=1 | od > /mnt/foo
# Force it on disk.
$ sync
# Get the locations it was written to on disk.
$ ~/repos/osandov-linux/scripts/btrfs_map_physical /mnt/foo | column -ts $'\t'
FILE OFFSET  FILE SIZE  EXTENT OFFSET  EXTENT TYPE                   LOGICAL SIZE  LOGICAL OFFSET  PHYSICAL SIZE  DEVID  PHYSICAL OFFSET
0            20480      0              regular,compression=zlib,dup  20480         298844160       8192           1      575668224
                                                                                                                  1      1005125632
# Corrupt one of the copies.
$ dd if=/dev/zero of=/dev/vdb bs=4k count=1 seek=575668224 oflag=seek_bytes
$ sync
# Now, re-read the file until we read it from the corrupted copy. To make sure
# we read from disk, we drop the page cache between each read.
$ while ! btrfs device stats /dev/vdb | grep -q 'corruption_errs\s\+[1-9]'; do echo 1 > /proc/sys/vm/drop_caches; cat /mnt/foo > /dev/null; done
$ dmesg | tail
[ 3240.222922] BTRFS info (device vdb): has skinny extents
[ 3240.235245] BTRFS info (device vdb): checking UUID tree
[ 3298.885372] bash (481): drop_caches: 1
[ 3298.924648] BTRFS warning (device vdb): csum failed root 5 ino 257 off 575676416 csum 0x8941f998 expected csum 0x05a3d0cd mirror 1
[ 3298.924657] BTRFS error (device vdb): bdev /dev/vdb errs: wr 0, rd 0, flush 0, corrupt 1, gen 0
[ 3298.926221] BTRFS info (device vdb): read error corrected: ino 257 off 0 (dev /dev/vdb sector 1124352)
[ 3298.926473] BTRFS info (device vdb): read error corrected: ino 257 off 4096 (dev /dev/vdb sector 1124352)
[ 3298.926516] BTRFS info (device vdb): read error corrected: ino 257 off 16384 (dev /dev/vdb sector 1124352)
[ 3298.926555] BTRFS info (device vdb): read error corrected: ino 257 off 8192 (dev /dev/vdb sector 1124352)
[ 3298.926614] BTRFS info (device vdb): read error corrected: ino 257 off 12288 (dev /dev/vdb sector 1124352)
# Now check that the copies match.
$ dd if=/dev/vdb bs=4k count=1 skip=575668224 iflag=skip_bytes status=none | sha256sum
6ffd32b49d77b9e4ae07fd1c598b8407bc4cbb2fdb7244420589703f35605996  -
$ dd if=/dev/vdb bs=4k count=1 skip=1005125632 iflag=skip_bytes status=none | sha256sum
3646164006a08d908f5cbd6131ce413c8de49566560bf7ac6bc9432ac792605d  -
# Oops, they don't. Check the corrupted copy.
$ dd if=/dev/vdb bs=4k count=1 skip=575668224 iflag=skip_bytes status=none | head
0006000 006760 154362 010427 177511 151544 074422 116513 105472
0006020 045224 041623 063647 150022 006147 155332 053640 077304
0006040 173561 102237 155233 021641 165413 114564 004351 006141
0006060 075766 007723 017005 142265 175347 110221 071117 004421
0006100 045713 040102 005447 127414 173546 075206 042537 176547
0006120 120721 117062 177257 171012 130114 031767 165144 103776
0006140 054556 152447 123212 023000 062570 103502 057476 065541
0006160 064364 151221 117125 133463 076760 133756 133026 171622
0006200 164705 031677 005544 027201 130024 177437 102433 005170
0006220 144353 136645 147416 035017 173440 121605 050052 050325
# It contains decompressed data, and not even for the correct offset: this is
# the 4th block of the file.

I don't have a fix. Rohit Singh from my team is ramping up and working
on a fix.

This bug has been present as far back as I could follow the history. I
think it's just been masked by the fact that 1) repairing the bad copies
with bad data doesn't make things "worse" and 2) the page cache caches
the good data, so once you've repaired it, you don't see it again
until you reboot or the page is evicted.
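A side note in case someone wants to re-run the experiment on the same
filesystem: a rough, untested sketch (not from the session above) for copying
the intact mirror back over the corrupted one, using the physical offsets
printed earlier:

# Copy the 8K physical extent from the intact copy back over the bad one.
$ dd if=/dev/vdb of=/dev/vdb bs=4k count=2 skip=1005125632 seek=575668224 iflag=skip_bytes oflag=seek_bytes conv=notrunc
$ sync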
Omar Sandoval Jan. 25, 2022, 9:26 p.m. UTC | #6
On Mon, Jan 24, 2022 at 11:26:32PM +0100, David Sterba wrote:
> On Wed, Nov 17, 2021 at 12:19:19PM -0800, Omar Sandoval wrote:
> > From: Omar Sandoval <osandov@fb.com>
> > 
> > There are 4 main cases:
> > 
> > 1. Inline extents: we copy the data straight out of the extent buffer.
> > 2. Hole/preallocated extents: we fill in zeroes.
> > 3. Regular, uncompressed extents: we read the sectors we need directly
> >    from disk.
> > 4. Regular, compressed extents: we read the entire compressed extent
> >    from disk and indicate what subset of the decompressed extent is in
> >    the file.
> > 
> > This initial implementation simplifies a few things that can be improved
> > in the future:
> > 
> > - We hold the inode lock during the operation.
> > - Cases 1, 3, and 4 allocate temporary memory to read into before
> >   copying out to userspace.
> > - We don't do read repair, because it turns out that read repair is
> >   currently broken for compressed data.
> > 
> > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > ---
> >  fs/btrfs/ctree.h |   4 +
> >  fs/btrfs/inode.c | 496 +++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/btrfs/ioctl.c | 106 ++++++++++
> >  3 files changed, 606 insertions(+)
> > 
> > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> > index 2e7f74060a14..70034e33abe6 100644
> > --- a/fs/btrfs/ctree.h
> > +++ b/fs/btrfs/ctree.h
> > @@ -3275,6 +3275,10 @@ int btrfs_writepage_cow_fixup(struct page *page);
> >  void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
> >  					  struct page *page, u64 start,
> >  					  u64 end, bool uptodate);
> > +struct btrfs_ioctl_encoded_io_args;
> > +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
> > +			   struct btrfs_ioctl_encoded_io_args *encoded);
> > +
> >  extern const struct dentry_operations btrfs_dentry_operations;
> >  extern const struct iomap_ops btrfs_dio_iomap_ops;
> >  extern const struct iomap_dio_ops btrfs_dio_ops;
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index c2efea101f61..d29e968fd18b 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -10525,6 +10525,502 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
> >  	}
> >  }
> >  
> > +static int btrfs_encoded_io_compression_from_extent(int compress_type)
> > +{
> > +	switch (compress_type) {
> > +	case BTRFS_COMPRESS_NONE:
> > +		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
> > +	case BTRFS_COMPRESS_ZLIB:
> > +		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
> > +	case BTRFS_COMPRESS_LZO:
> > +		/*
> > +		 * The LZO format depends on the page size. 64k is the maximum
> > +		 * sectorsize (and thus page size) that we support.
> > +		 */
> > +		if (PAGE_SIZE < SZ_4K || PAGE_SIZE > SZ_64K)
> > +			return -EINVAL;
> > +		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + (PAGE_SHIFT - 12);
> > +	case BTRFS_COMPRESS_ZSTD:
> > +		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
> > +	default:
> > +		return -EUCLEAN;
> > +	}
> > +}
> > +
> > +static ssize_t btrfs_encoded_read_inline(
> > +				struct kiocb *iocb,
> > +				struct iov_iter *iter, u64 start,
> > +				u64 lockend,
> > +				struct extent_state **cached_state,
> > +				u64 extent_start, size_t count,
> > +				struct btrfs_ioctl_encoded_io_args *encoded,
> > +				bool *unlocked)
> > +{
> > +	struct inode *inode = file_inode(iocb->ki_filp);
> 
> Please use btrfs_inode in all internal helpers, either as parameters or
> as local variables, to avoid the BTRFS_I and btrfs_sb conversions everywhere.
> 
> > +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> > +	struct btrfs_path *path;
> > +	struct extent_buffer *leaf;
> > +	struct btrfs_file_extent_item *item;
> > +	u64 ram_bytes;
> > +	unsigned long ptr;
> > +	void *tmp;
> > +	ssize_t ret;
> > +
> > +	path = btrfs_alloc_path();
> > +	if (!path) {
> > +		ret = -ENOMEM;
> > +		goto out;
> > +	}
> > +	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, path,
> > +				       btrfs_ino(BTRFS_I(inode)), extent_start,
> > +				       0);
> > +	if (ret) {
> > +		if (ret > 0) {
> > +			/* The extent item disappeared? */
> > +			ret = -EIO;
> > +		}
> > +		goto out;
> > +	}
> > +	leaf = path->nodes[0];
> > +	item = btrfs_item_ptr(leaf, path->slots[0],
> > +			      struct btrfs_file_extent_item);
> > +
> > +	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
> > +	ptr = btrfs_file_extent_inline_start(item);
> > +
> > +	encoded->len = (min_t(u64, extent_start + ram_bytes, inode->i_size) -
> > +			iocb->ki_pos);
> 
> No need for the outer ( )
> 
> > +	ret = btrfs_encoded_io_compression_from_extent(
> > +				 btrfs_file_extent_compression(leaf, item));
> > +	if (ret < 0)
> > +		goto out;
> > +	encoded->compression = ret;
> > +	if (encoded->compression) {
> > +		size_t inline_size;
> > +
> > +		inline_size = btrfs_file_extent_inline_item_len(leaf,
> > +								path->slots[0]);
> > +		if (inline_size > count) {
> > +			ret = -ENOBUFS;
> > +			goto out;
> > +		}
> > +		count = inline_size;
> > +		encoded->unencoded_len = ram_bytes;
> > +		encoded->unencoded_offset = iocb->ki_pos - extent_start;
> > +	} else {
> > +		encoded->len = encoded->unencoded_len = count =
> > +			min_t(u64, count, encoded->len);
> 
> I'm sure I have commented on that in the past, please don't use chained
> initializations. In this case something like:
> 
> 		count = min_t(u64, count, encoded->len);
> 		encoded->len = count;
> 		encoded->unencoded_len = count;
> 
> > +		ptr += iocb->ki_pos - extent_start;
> > +	}
> > +
> > +	tmp = kmalloc(count, GFP_NOFS);
> > +	if (!tmp) {
> > +		ret = -ENOMEM;
> > +		goto out;
> > +	}
> > +	read_extent_buffer(leaf, tmp, ptr, count);
> > +	btrfs_release_path(path);
> > +	unlock_extent_cached(io_tree, start, lockend, cached_state);
> > +	inode_unlock_shared(inode);
> > +	*unlocked = true;
> > +
> > +	ret = copy_to_iter(tmp, count, iter);
> > +	if (ret != count)
> > +		ret = -EFAULT;
> > +	kfree(tmp);
> > +out:
> > +	btrfs_free_path(path);
> > +	return ret;
> > +}
> > +
> > +struct btrfs_encoded_read_private {
> > +	struct inode *inode;
> 
> This should also be btrfs_inode
> 
> > +	u64 file_offset;
> > +	wait_queue_head_t wait;
> > +	atomic_t pending;
> > +	blk_status_t status;
> > +	bool skip_csum;
> > +};
> > +
> > +static blk_status_t submit_encoded_read_bio(struct inode *inode,
> 
> struct btrfs_inode
> 
> > +					    struct bio *bio, int mirror_num,
> > +					    unsigned long bio_flags)
> 
> bio_flags is unused here (and in the encoded patches afaics)
> 
> > +{
> > +	struct btrfs_encoded_read_private *priv = bio->bi_private;
> > +	struct btrfs_bio *bbio = btrfs_bio(bio);
> > +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> > +	blk_status_t ret;
> > +
> > +	if (!priv->skip_csum) {
> > +		ret = btrfs_lookup_bio_sums(inode, bio, NULL);
> > +		if (ret)
> > +			return ret;
> > +	}
> > +
> > +	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
> > +	if (ret) {
> > +		btrfs_bio_free_csum(bbio);
> > +		return ret;
> > +	}
> > +
> > +	atomic_inc(&priv->pending);
> > +	ret = btrfs_map_bio(fs_info, bio, mirror_num);
> > +	if (ret) {
> > +		atomic_dec(&priv->pending);
> > +		btrfs_bio_free_csum(bbio);
> > +	}
> > +	return ret;
> > +}
> > +
> > +static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
> > +{
> > +	const bool uptodate = bbio->bio.bi_status == BLK_STS_OK;
> 
> 	const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
> 
> > +	struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
> > +	struct inode *inode = priv->inode;
> > +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> > +	u32 sectorsize = fs_info->sectorsize;
> > +	struct bio_vec *bvec;
> > +	struct bvec_iter_all iter_all;
> > +	u64 start = priv->file_offset;
> > +	u32 bio_offset = 0;
> > +
> > +	if (priv->skip_csum || !uptodate)
> > +		return bbio->bio.bi_status;
> > +
> > +	bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
> > +		unsigned int i, nr_sectors, pgoff;
> > +
> > +		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
> > +		pgoff = bvec->bv_offset;
> > +		for (i = 0; i < nr_sectors; i++) {
> > +			ASSERT(pgoff < PAGE_SIZE);
> > +			if (check_data_csum(inode, bbio, bio_offset,
> > +					    bvec->bv_page, pgoff, start))
> > +				return BLK_STS_IOERR;
> > +			start += sectorsize;
> > +			bio_offset += sectorsize;
> > +			pgoff += sectorsize;
> > +		}
> > +	}
> > +	return BLK_STS_OK;
> > +}
> > +
> > +static void btrfs_encoded_read_endio(struct bio *bio)
> > +{
> > +	struct btrfs_encoded_read_private *priv = bio->bi_private;
> > +	struct btrfs_bio *bbio = btrfs_bio(bio);
> > +	blk_status_t status;
> > +
> > +	status = btrfs_encoded_read_verify_csum(bbio);
> > +	if (status) {
> > +		/*
> > +		 * The memory barrier implied by the atomic_dec_return() here
> > +		 * pairs with the memory barrier implied by the
> > +		 * atomic_dec_return() or io_wait_event() in
> > +		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
> > +		 * write is observed before the load of status in
> > +		 * btrfs_encoded_read_regular_fill_pages().
> > +		 */
> > +		WRITE_ONCE(priv->status, status);
> > +	}
> > +	if (!atomic_dec_return(&priv->pending))
> > +		wake_up(&priv->wait);
> > +	btrfs_bio_free_csum(bbio);
> > +	bio_put(bio);
> > +}
> > +
> > +static int btrfs_encoded_read_regular_fill_pages(struct inode *inode,
> > +						 u64 file_offset,
> > +						 u64 disk_bytenr,
> > +						 u64 disk_io_size,
> > +						 struct page **pages)
> > +{
> > +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> > +	struct btrfs_encoded_read_private priv = {
> > +		.inode = inode,
> > +		.file_offset = file_offset,
> > +		.pending = ATOMIC_INIT(1),
> > +		.skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM,
> > +	};
> > +	unsigned long i = 0;
> > +	u64 cur = 0;
> > +	int ret;
> > +
> > +	init_waitqueue_head(&priv.wait);
> > +	/*
> > +	 * Submit bios for the extent, splitting due to bio or stripe limits as
> > +	 * necessary.
> > +	 */
> > +	while (cur < disk_io_size) {
> > +		struct extent_map *em;
> > +		struct btrfs_io_geometry geom;
> > +		struct bio *bio = NULL;
> > +		u64 remaining;
> > +
> > +		em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
> > +					 disk_io_size - cur);
> > +		if (IS_ERR(em)) {
> > +			ret = PTR_ERR(em);
> > +		} else {
> > +			ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
> > +						    disk_bytenr + cur, &geom);
> > +			free_extent_map(em);
> > +		}
> > +		if (ret) {
> > +			WRITE_ONCE(priv.status, errno_to_blk_status(ret));
> > +			break;
> > +		}
> > +		remaining = min(geom.len, disk_io_size - cur);
> > +		while (bio || remaining) {
> > +			size_t bytes = min_t(u64, remaining, PAGE_SIZE);
> > +
> > +			if (!bio) {
> > +				bio = btrfs_bio_alloc(BIO_MAX_VECS);
> > +				bio->bi_iter.bi_sector =
> > +					(disk_bytenr + cur) >> SECTOR_SHIFT;
> > +				bio->bi_end_io = btrfs_encoded_read_endio;
> > +				bio->bi_private = &priv;
> > +				bio->bi_opf = REQ_OP_READ;
> > +			}
> > +
> > +			if (!bytes ||
> > +			    bio_add_page(bio, pages[i], bytes, 0) < bytes) {
> > +				blk_status_t status;
> > +
> > +				status = submit_encoded_read_bio(inode, bio, 0,
> > +								 0);
> > +				if (status) {
> > +					WRITE_ONCE(priv.status, status);
> > +					bio_put(bio);
> > +					goto out;
> > +				}
> > +				bio = NULL;
> > +				continue;
> > +			}
> > +
> > +			i++;
> > +			cur += bytes;
> > +			remaining -= bytes;
> > +		}
> > +	}
> > +
> > +out:
> > +	if (atomic_dec_return(&priv.pending))
> > +		io_wait_event(priv.wait, !atomic_read(&priv.pending));
> > +	/* See btrfs_encoded_read_endio() for ordering. */
> > +	return blk_status_to_errno(READ_ONCE(priv.status));
> > +}
> > +
> > +static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
> > +					  struct iov_iter *iter,
> > +					  u64 start, u64 lockend,
> > +					  struct extent_state **cached_state,
> > +					  u64 disk_bytenr, u64 disk_io_size,
> > +					  size_t count, bool compressed,
> > +					  bool *unlocked)
> > +{
> > +	struct inode *inode = file_inode(iocb->ki_filp);
> > +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> > +	struct page **pages;
> > +	unsigned long nr_pages, i;
> > +	u64 cur;
> > +	size_t page_offset;
> > +	ssize_t ret;
> > +
> > +	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
> > +	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> > +	if (!pages)
> > +		return -ENOMEM;
> > +	for (i = 0; i < nr_pages; i++) {
> > +		pages[i] = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
> 
> No HIGHMEM please.
> 
> > +		if (!pages[i]) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +	}
> > +
> > +	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
> > +						    disk_io_size, pages);
> > +	if (ret)
> > +		goto out;
> > +
> > +	unlock_extent_cached(io_tree, start, lockend, cached_state);
> > +	inode_unlock_shared(inode);
> > +	*unlocked = true;
> > +
> > +	if (compressed) {
> > +		i = 0;
> > +		page_offset = 0;
> > +	} else {
> > +		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
> > +		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
> > +	}
> > +	cur = 0;
> > +	while (cur < count) {
> > +		size_t bytes = min_t(size_t, count - cur,
> > +				     PAGE_SIZE - page_offset);
> > +
> > +		if (copy_page_to_iter(pages[i], page_offset, bytes,
> > +				      iter) != bytes) {
> > +			ret = -EFAULT;
> > +			goto out;
> > +		}
> > +		i++;
> > +		cur += bytes;
> > +		page_offset = 0;
> > +	}
> > +	ret = count;
> > +out:
> > +	for (i = 0; i < nr_pages; i++) {
> > +		if (pages[i])
> > +			__free_page(pages[i]);
> > +	}
> > +	kfree(pages);
> > +	return ret;
> > +}
> > +
> > +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
> > +			   struct btrfs_ioctl_encoded_io_args *encoded)
> > +{
> > +	struct inode *inode = file_inode(iocb->ki_filp);
> > +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> > +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> > +	ssize_t ret;
> > +	size_t count = iov_iter_count(iter);
> > +	u64 start, lockend, disk_bytenr, disk_io_size;
> > +	struct extent_state *cached_state = NULL;
> > +	struct extent_map *em;
> > +	bool unlocked = false;
> > +
> > +	file_accessed(iocb->ki_filp);
> > +
> > +	inode_lock_shared(inode);
> 
> We have helpers for inode locking now, btrfs_inode_lock, which take an
> additional parameter for cases where we want to exclude certain
> locking combinations.
> 
> Which also brings up the question of whether the encoded read/write
> should be excluded against some other operations, like e.g. what
> deduplication has to do against mmap.

I _think_ we want to exclude mmap, but I need to think some more about
it. Other than this comment, I believe I've addressed your other
outstanding comments and rebased my branches:

https://github.com/osandov/linux/tree/btrfs-send-encoded
https://github.com/osandov/btrfs-progs/tree/send-encoded

I'm guessing you're still looking at the send parts, so I'll figure out
mmap and await further comments before resending.
Omar Sandoval Feb. 8, 2022, 8:08 p.m. UTC | #7
On Tue, Jan 25, 2022 at 01:26:21PM -0800, Omar Sandoval wrote:
> On Mon, Jan 24, 2022 at 11:26:32PM +0100, David Sterba wrote:
> > On Wed, Nov 17, 2021 at 12:19:19PM -0800, Omar Sandoval wrote:
> > > [...]
> > > +	file_accessed(iocb->ki_filp);
> > > +
> > > +	inode_lock_shared(inode);
> > 
> > We have helpers for inode locking now, btrfs_inode_lock, which take an
> > additional parameter for cases where we want to exclude certain
> > locking combinations.
> > 
> > Which also brings up the question of whether the encoded read/write
> > should be excluded against some other operations, like e.g. what
> > deduplication has to do against mmap.
> 
> I _think_ we want to exclude mmap, but I need to think some more about
> it. Other than this comment, I believe I've addressed your other
> outstanding comments and rebased my branches:
> 
> https://github.com/osandov/linux/tree/btrfs-send-encoded
> https://github.com/osandov/btrfs-progs/tree/send-encoded
> 
> I'm guessing you're still looking at the send parts, so I'll figure out
> mmap and await further comments before resending.

After looking at the code some more, I don't think we need to exclude
mmap after all.

The only thing that can happen is that with this interleaving:

btrfs_page_mkwrite         btrfs_encoded_read
---------------------------------------------------
(enter)                    (enter)
                           btrfs_wait_ordered_range
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
(exit)
                           lock_extent_bits
			   read extent
                           unlock_extent_cached
                           (exit)

we read the old data from before the page was dirtied. But there are
other interleavings of a concurrent btrfs_page_mkwrite and
btrfs_encoded_read that would read the old data, like:

btrfs_page_mkwrite         btrfs_encoded_read
---------------------------------------------------
(enter)                    (enter)
                           btrfs_wait_ordered_range
                           lock_extent_bits
lock_extent_bits blocked
			   read extent
                           unlock_extent_cached
                           (exit)
lock_extent_bits returns
btrfs_page_set_dirty
unlock_extent_cached
(exit)

Or even if we were to use BTRFS_ILOCK_MMAP:

btrfs_page_mkwrite               btrfs_encoded_read
-------------------------------------------------------------------
(enter)                          (enter)
			         btrfs_inode_lock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) (blocked)
                                 btrfs_wait_ordered_range
                                 lock_extent_bits
			         read extent
                                 unlock_extent_cached
			         btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) returns
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached

In other words, this is inherently racy, so it's fine that we return the
old data in this tiny window.

I pushed the change to use btrfs_inode_lock()/btrfs_inode_unlock() with
BTRFS_ILOCK_SHARED to my branch. I'll resend the series as well.
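For reference, the change is roughly the following (just a sketch here, the
actual version is in the branch):

	/* was: inode_lock_shared(inode); */
	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
	...
	/* was: inode_unlock_shared(inode); same at every unlock call site */
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);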
David Sterba Feb. 10, 2022, 6:38 p.m. UTC | #8
On Tue, Feb 08, 2022 at 12:08:50PM -0800, Omar Sandoval wrote:
> On Tue, Jan 25, 2022 at 01:26:21PM -0800, Omar Sandoval wrote:
> > On Mon, Jan 24, 2022 at 11:26:32PM +0100, David Sterba wrote:
> > > On Wed, Nov 17, 2021 at 12:19:19PM -0800, Omar Sandoval wrote:
> > > > [...]
> > > > +	file_accessed(iocb->ki_filp);
> > > > +
> > > > +	inode_lock_shared(inode);
> > > 
> > > We have helpers for inode locking now, btrfs_inode_lock, which take an
> > > additional parameter for cases where we want to exclude certain
> > > locking combinations.
> > > 
> > > Which also brings up the question of whether the encoded read/write
> > > should be excluded against some other operations, like e.g. what
> > > deduplication has to do against mmap.
> > 
> > I _think_ we want to exclude mmap, but I need to think some more about
> > it. Other than this comment, I believe I've addressed your other
> > outstanding comments and rebased my branches:
> > 
> > https://github.com/osandov/linux/tree/btrfs-send-encoded
> > https://github.com/osandov/btrfs-progs/tree/send-encoded
> > 
> > I'm guessing you're still looking at the send parts, so I'll figure out
> > mmap and await further comments before resending.
> 
> After looking at the code some more, I don't think we need to exclude
> mmap after all.
> 
> The only thing that can happen is that with this interleaving:
> 
> btrfs_page_mkwrite         btrfs_encoded_read
> ---------------------------------------------------
> (enter)                    (enter)
>                            btrfs_wait_ordered_range
> lock_extent_bits
> btrfs_page_set_dirty
> unlock_extent_cached
> (exit)
>                            lock_extent_bits
> 			   read extent
>                            unlock_extent_cached
>                            (exit)
> 
> we read the old data from before the page was dirtied. But there are
> other interleavings of a concurrent btrfs_page_mkwrite and
> btrfs_encoded_read that would read the old data, like:
> 
> btrfs_page_mkwrite         btrfs_encoded_read
> ---------------------------------------------------
> (enter)                    (enter)
>                            btrfs_wait_ordered_range
>                            lock_extent_bits
> lock_extent_bits blocked
>                            read extent
>                            unlock_extent_cached
>                            (exit)
> lock_extent_bits returns
> btrfs_page_set_dirty
> unlock_extent_cached
> (exit)
> 
> Or even if we were to use BTRFS_ILOCK_MMAP:
> 
> btrfs_page_mkwrite               btrfs_encoded_read
> -------------------------------------------------------------------
> (enter)                          (enter)
>                                  btrfs_inode_lock(BTRFS_ILOCK_MMAP)
> down_read(i_mmap_lock) (blocked)
>                                  btrfs_wait_ordered_range
>                                  lock_extent_bits
>                                  read extent
>                                  unlock_extent_cached
>                                  btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
> down_read(i_mmap_lock) returns
> lock_extent_bits
> btrfs_page_set_dirty
> unlock_extent_cached
> 
> In other words, this is inherently racy, so it's fine that we return the
> old data in this tiny window.

Thanks for the analysis, please put it in the changelog; I think it will
be handy one day.
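
For reference, here is a minimal sketch of what the helper-based locking
discussed above could look like, assuming the btrfs_inode_lock()/
btrfs_inode_unlock() helpers and the BTRFS_ILOCK_SHARED/BTRFS_ILOCK_MMAP
flags as they exist in the tree around this series; this is illustration
only, not part of the posted patch. As the last interleaving above shows,
taking i_mmap_lock this way still would not prevent returning pre-mkwrite
data, which is why the patch keeps the plain inode_lock_shared():

/*
 * Hypothetical lock/unlock pair for btrfs_encoded_read(): shared inode lock
 * plus mmap exclusion via the btrfs helpers. Helper names, flags and the int
 * return value are assumptions based on the tree around this series.
 */
static int btrfs_encoded_read_lock(struct inode *inode)
{
	/* Shared inode lock plus the i_mmap_lock exclusion. */
	return btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED | BTRFS_ILOCK_MMAP);
}

static void btrfs_encoded_read_unlock(struct inode *inode)
{
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED | BTRFS_ILOCK_MMAP);
}
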
David Sterba Feb. 10, 2022, 6:42 p.m. UTC | #9
On Tue, Jan 25, 2022 at 01:26:21PM -0800, Omar Sandoval wrote:
> On Mon, Jan 24, 2022 at 11:26:32PM +0100, David Sterba wrote:
> > On Wed, Nov 17, 2021 at 12:19:19PM -0800, Omar Sandoval wrote:
> > > +
> > > +	inode_lock_shared(inode);
> > 
> > We have helpers for inode locking now, btrfs_inode_lock, that take
> > additional parameter for cases where we want to exclude certain
> > locking combinations.
> > 
> > Which also brings the question if the encoded read/write should be
> > excluded against some other operations like eg. deduplication has to do
> > against mmap.
> 
> I _think_ we want to exclude mmap, but I need to think some more about
> it. Other than this comment, I believe I've addressed your other
> outstanding comments and rebased my branches:
> 
> https://github.com/osandov/linux/tree/btrfs-send-encoded
> https://github.com/osandov/btrfs-progs/tree/send-encoded
> 
> I'm guessing you're still looking at the send parts, so I'll figure out
> mmap and await further comments before resending.

First I want to get the encoded ioctl in, so please send v13.
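
Before the patch itself, a small worked example of the LZO mapping done by
btrfs_encoded_io_compression_from_extent() below. It assumes, as the uapi
patch earlier in this series defines them, that the
BTRFS_ENCODED_IO_COMPRESSION_LZO_* values are consecutive in increasing
sector-size order, so the encoding can be computed from PAGE_SHIFT alone;
the enum values used here are stand-ins for illustration:

#include <stdio.h>

int main(void)
{
	/* Stand-in values mirroring the assumed uapi ordering LZO_4K..LZO_64K. */
	enum { LZO_4K = 2, LZO_8K, LZO_16K, LZO_32K, LZO_64K };
	unsigned int page_shift;

	for (page_shift = 12; page_shift <= 16; page_shift++)
		printf("PAGE_SIZE %3luK -> LZO_%luK encoding value %u\n",
		       (1UL << page_shift) >> 10,
		       (1UL << page_shift) >> 10,
		       (unsigned int)(LZO_4K + (page_shift - 12)));

	return 0;
}
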

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2e7f74060a14..70034e33abe6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3275,6 +3275,10 @@  int btrfs_writepage_cow_fixup(struct page *page);
 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
 					  struct page *page, u64 start,
 					  u64 end, bool uptodate);
+struct btrfs_ioctl_encoded_io_args;
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+			   struct btrfs_ioctl_encoded_io_args *encoded);
+
 extern const struct dentry_operations btrfs_dentry_operations;
 extern const struct iomap_ops btrfs_dio_iomap_ops;
 extern const struct iomap_dio_ops btrfs_dio_ops;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c2efea101f61..d29e968fd18b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -10525,6 +10525,502 @@  void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
 	}
 }
 
+static int btrfs_encoded_io_compression_from_extent(int compress_type)
+{
+	switch (compress_type) {
+	case BTRFS_COMPRESS_NONE:
+		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
+	case BTRFS_COMPRESS_ZLIB:
+		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
+	case BTRFS_COMPRESS_LZO:
+		/*
+		 * The LZO format depends on the page size. 64k is the maximum
+		 * sectorsize (and thus page size) that we support.
+		 */
+		if (PAGE_SIZE < SZ_4K || PAGE_SIZE > SZ_64K)
+			return -EINVAL;
+		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + (PAGE_SHIFT - 12);
+	case BTRFS_COMPRESS_ZSTD:
+		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
+	default:
+		return -EUCLEAN;
+	}
+}
+
+static ssize_t btrfs_encoded_read_inline(
+				struct kiocb *iocb,
+				struct iov_iter *iter, u64 start,
+				u64 lockend,
+				struct extent_state **cached_state,
+				u64 extent_start, size_t count,
+				struct btrfs_ioctl_encoded_io_args *encoded,
+				bool *unlocked)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *item;
+	u64 ram_bytes;
+	unsigned long ptr;
+	void *tmp;
+	ssize_t ret;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, path,
+				       btrfs_ino(BTRFS_I(inode)), extent_start,
+				       0);
+	if (ret) {
+		if (ret > 0) {
+			/* The extent item disappeared? */
+			ret = -EIO;
+		}
+		goto out;
+	}
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0],
+			      struct btrfs_file_extent_item);
+
+	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
+	ptr = btrfs_file_extent_inline_start(item);
+
+	encoded->len = (min_t(u64, extent_start + ram_bytes, inode->i_size) -
+			iocb->ki_pos);
+	ret = btrfs_encoded_io_compression_from_extent(
+				 btrfs_file_extent_compression(leaf, item));
+	if (ret < 0)
+		goto out;
+	encoded->compression = ret;
+	if (encoded->compression) {
+		size_t inline_size;
+
+		inline_size = btrfs_file_extent_inline_item_len(leaf,
+								path->slots[0]);
+		if (inline_size > count) {
+			ret = -ENOBUFS;
+			goto out;
+		}
+		count = inline_size;
+		encoded->unencoded_len = ram_bytes;
+		encoded->unencoded_offset = iocb->ki_pos - extent_start;
+	} else {
+		encoded->len = encoded->unencoded_len = count =
+			min_t(u64, count, encoded->len);
+		ptr += iocb->ki_pos - extent_start;
+	}
+
+	tmp = kmalloc(count, GFP_NOFS);
+	if (!tmp) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	read_extent_buffer(leaf, tmp, ptr, count);
+	btrfs_release_path(path);
+	unlock_extent_cached(io_tree, start, lockend, cached_state);
+	inode_unlock_shared(inode);
+	*unlocked = true;
+
+	ret = copy_to_iter(tmp, count, iter);
+	if (ret != count)
+		ret = -EFAULT;
+	kfree(tmp);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+struct btrfs_encoded_read_private {
+	struct inode *inode;
+	u64 file_offset;
+	wait_queue_head_t wait;
+	atomic_t pending;
+	blk_status_t status;
+	bool skip_csum;
+};
+
+static blk_status_t submit_encoded_read_bio(struct inode *inode,
+					    struct bio *bio, int mirror_num,
+					    unsigned long bio_flags)
+{
+	struct btrfs_encoded_read_private *priv = bio->bi_private;
+	struct btrfs_bio *bbio = btrfs_bio(bio);
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	blk_status_t ret;
+
+	if (!priv->skip_csum) {
+		ret = btrfs_lookup_bio_sums(inode, bio, NULL);
+		if (ret)
+			return ret;
+	}
+
+	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+	if (ret) {
+		btrfs_bio_free_csum(bbio);
+		return ret;
+	}
+
+	atomic_inc(&priv->pending);
+	ret = btrfs_map_bio(fs_info, bio, mirror_num);
+	if (ret) {
+		atomic_dec(&priv->pending);
+		btrfs_bio_free_csum(bbio);
+	}
+	return ret;
+}
+
+static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
+{
+	const bool uptodate = bbio->bio.bi_status == BLK_STS_OK;
+	struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
+	struct inode *inode = priv->inode;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	u32 sectorsize = fs_info->sectorsize;
+	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
+	u64 start = priv->file_offset;
+	u32 bio_offset = 0;
+
+	if (priv->skip_csum || !uptodate)
+		return bbio->bio.bi_status;
+
+	bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+		unsigned int i, nr_sectors, pgoff;
+
+		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+		pgoff = bvec->bv_offset;
+		for (i = 0; i < nr_sectors; i++) {
+			ASSERT(pgoff < PAGE_SIZE);
+			if (check_data_csum(inode, bbio, bio_offset,
+					    bvec->bv_page, pgoff, start))
+				return BLK_STS_IOERR;
+			start += sectorsize;
+			bio_offset += sectorsize;
+			pgoff += sectorsize;
+		}
+	}
+	return BLK_STS_OK;
+}
+
+static void btrfs_encoded_read_endio(struct bio *bio)
+{
+	struct btrfs_encoded_read_private *priv = bio->bi_private;
+	struct btrfs_bio *bbio = btrfs_bio(bio);
+	blk_status_t status;
+
+	status = btrfs_encoded_read_verify_csum(bbio);
+	if (status) {
+		/*
+		 * The memory barrier implied by the atomic_dec_return() here
+		 * pairs with the memory barrier implied by the
+		 * atomic_dec_return() or io_wait_event() in
+		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
+		 * write is observed before the load of status in
+		 * btrfs_encoded_read_regular_fill_pages().
+		 */
+		WRITE_ONCE(priv->status, status);
+	}
+	if (!atomic_dec_return(&priv->pending))
+		wake_up(&priv->wait);
+	btrfs_bio_free_csum(bbio);
+	bio_put(bio);
+}
+
+static int btrfs_encoded_read_regular_fill_pages(struct inode *inode,
+						 u64 file_offset,
+						 u64 disk_bytenr,
+						 u64 disk_io_size,
+						 struct page **pages)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_encoded_read_private priv = {
+		.inode = inode,
+		.file_offset = file_offset,
+		.pending = ATOMIC_INIT(1),
+		.skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM,
+	};
+	unsigned long i = 0;
+	u64 cur = 0;
+	int ret;
+
+	init_waitqueue_head(&priv.wait);
+	/*
+	 * Submit bios for the extent, splitting due to bio or stripe limits as
+	 * necessary.
+	 */
+	while (cur < disk_io_size) {
+		struct extent_map *em;
+		struct btrfs_io_geometry geom;
+		struct bio *bio = NULL;
+		u64 remaining;
+
+		em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
+					 disk_io_size - cur);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+		} else {
+			ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
+						    disk_bytenr + cur, &geom);
+			free_extent_map(em);
+		}
+		if (ret) {
+			WRITE_ONCE(priv.status, errno_to_blk_status(ret));
+			break;
+		}
+		remaining = min(geom.len, disk_io_size - cur);
+		while (bio || remaining) {
+			size_t bytes = min_t(u64, remaining, PAGE_SIZE);
+
+			if (!bio) {
+				bio = btrfs_bio_alloc(BIO_MAX_VECS);
+				bio->bi_iter.bi_sector =
+					(disk_bytenr + cur) >> SECTOR_SHIFT;
+				bio->bi_end_io = btrfs_encoded_read_endio;
+				bio->bi_private = &priv;
+				bio->bi_opf = REQ_OP_READ;
+			}
+
+			if (!bytes ||
+			    bio_add_page(bio, pages[i], bytes, 0) < bytes) {
+				blk_status_t status;
+
+				status = submit_encoded_read_bio(inode, bio, 0,
+								 0);
+				if (status) {
+					WRITE_ONCE(priv.status, status);
+					bio_put(bio);
+					goto out;
+				}
+				bio = NULL;
+				continue;
+			}
+
+			i++;
+			cur += bytes;
+			remaining -= bytes;
+		}
+	}
+
+out:
+	if (atomic_dec_return(&priv.pending))
+		io_wait_event(priv.wait, !atomic_read(&priv.pending));
+	/* See btrfs_encoded_read_endio() for ordering. */
+	return blk_status_to_errno(READ_ONCE(priv.status));
+}
+
+static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
+					  struct iov_iter *iter,
+					  u64 start, u64 lockend,
+					  struct extent_state **cached_state,
+					  u64 disk_bytenr, u64 disk_io_size,
+					  size_t count, bool compressed,
+					  bool *unlocked)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct page **pages;
+	unsigned long nr_pages, i;
+	u64 cur;
+	size_t page_offset;
+	ssize_t ret;
+
+	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+	if (!pages)
+		return -ENOMEM;
+	for (i = 0; i < nr_pages; i++) {
+		pages[i] = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!pages[i]) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
+						    disk_io_size, pages);
+	if (ret)
+		goto out;
+
+	unlock_extent_cached(io_tree, start, lockend, cached_state);
+	inode_unlock_shared(inode);
+	*unlocked = true;
+
+	if (compressed) {
+		i = 0;
+		page_offset = 0;
+	} else {
+		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
+		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
+	}
+	cur = 0;
+	while (cur < count) {
+		size_t bytes = min_t(size_t, count - cur,
+				     PAGE_SIZE - page_offset);
+
+		if (copy_page_to_iter(pages[i], page_offset, bytes,
+				      iter) != bytes) {
+			ret = -EFAULT;
+			goto out;
+		}
+		i++;
+		cur += bytes;
+		page_offset = 0;
+	}
+	ret = count;
+out:
+	for (i = 0; i < nr_pages; i++) {
+		if (pages[i])
+			__free_page(pages[i]);
+	}
+	kfree(pages);
+	return ret;
+}
+
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+			   struct btrfs_ioctl_encoded_io_args *encoded)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	ssize_t ret;
+	size_t count = iov_iter_count(iter);
+	u64 start, lockend, disk_bytenr, disk_io_size;
+	struct extent_state *cached_state = NULL;
+	struct extent_map *em;
+	bool unlocked = false;
+
+	file_accessed(iocb->ki_filp);
+
+	inode_lock_shared(inode);
+
+	if (iocb->ki_pos >= inode->i_size) {
+		inode_unlock_shared(inode);
+		return 0;
+	}
+	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
+	/*
+	 * We don't know how long the extent containing iocb->ki_pos is, but if
+	 * it's compressed we know that it won't be longer than this.
+	 */
+	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+	for (;;) {
+		struct btrfs_ordered_extent *ordered;
+
+		ret = btrfs_wait_ordered_range(inode, start,
+					       lockend - start + 1);
+		if (ret)
+			goto out_unlock_inode;
+		lock_extent_bits(io_tree, start, lockend, &cached_state);
+		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
+						     lockend - start + 1);
+		if (!ordered)
+			break;
+		btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(io_tree, start, lockend, &cached_state);
+		cond_resched();
+	}
+
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start,
+			      lockend - start + 1);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out_unlock_extent;
+	}
+
+	if (em->block_start == EXTENT_MAP_INLINE) {
+		u64 extent_start = em->start;
+
+		/*
+		 * For inline extents we get everything we need out of the
+		 * extent item.
+		 */
+		free_extent_map(em);
+		em = NULL;
+		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
+						&cached_state, extent_start,
+						count, encoded, &unlocked);
+		goto out;
+	}
+
+	/*
+	 * We only want to return up to EOF even if the extent extends beyond
+	 * that.
+	 */
+	encoded->len = (min_t(u64, extent_map_end(em), inode->i_size) -
+			iocb->ki_pos);
+	if (em->block_start == EXTENT_MAP_HOLE ||
+	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		disk_bytenr = EXTENT_MAP_HOLE;
+		encoded->len = encoded->unencoded_len = count =
+			min_t(u64, count, encoded->len);
+	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+		disk_bytenr = em->block_start;
+		/*
+		 * Bail if the buffer isn't large enough to return the whole
+		 * compressed extent.
+		 */
+		if (em->block_len > count) {
+			ret = -ENOBUFS;
+			goto out_em;
+		}
+		disk_io_size = count = em->block_len;
+		encoded->unencoded_len = em->ram_bytes;
+		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+		ret = btrfs_encoded_io_compression_from_extent(
+							     em->compress_type);
+		if (ret < 0)
+			goto out_em;
+		encoded->compression = ret;
+	} else {
+		disk_bytenr = em->block_start + (start - em->start);
+		if (encoded->len > count)
+			encoded->len = count;
+		/*
+		 * Don't read beyond what we locked. This also limits the page
+		 * allocations that we'll do.
+		 */
+		disk_io_size = min(lockend + 1,
+				   iocb->ki_pos + encoded->len) - start;
+		encoded->len = encoded->unencoded_len = count =
+			start + disk_io_size - iocb->ki_pos;
+		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+	}
+	free_extent_map(em);
+	em = NULL;
+
+	if (disk_bytenr == EXTENT_MAP_HOLE) {
+		unlock_extent_cached(io_tree, start, lockend, &cached_state);
+		inode_unlock_shared(inode);
+		unlocked = true;
+		ret = iov_iter_zero(count, iter);
+		if (ret != count)
+			ret = -EFAULT;
+	} else {
+		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
+						 &cached_state, disk_bytenr,
+						 disk_io_size, count,
+						 encoded->compression,
+						 &unlocked);
+	}
+
+out:
+	if (ret >= 0)
+		iocb->ki_pos += encoded->len;
+out_em:
+	free_extent_map(em);
+out_unlock_extent:
+	if (!unlocked)
+		unlock_extent_cached(io_tree, start, lockend, &cached_state);
+out_unlock_inode:
+	if (!unlocked)
+		inode_unlock_shared(inode);
+	return ret;
+}
+
 #ifdef CONFIG_SWAP
 /*
  * Add an entry indicating a block group or device which is pinned by a
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 05c77a1979a9..f0c575223d88 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -28,6 +28,7 @@ 
 #include <linux/iversion.h>
 #include <linux/fileattr.h>
 #include <linux/fsverity.h>
+#include <linux/sched/xacct.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "export.h"
@@ -88,6 +89,22 @@  struct btrfs_ioctl_send_args_32 {
 
 #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
 			       struct btrfs_ioctl_send_args_32)
+
+struct btrfs_ioctl_encoded_io_args_32 {
+	compat_uptr_t iov;
+	compat_ulong_t iovcnt;
+	__s64 offset;
+	__u64 flags;
+	__u64 len;
+	__u64 unencoded_len;
+	__u64 unencoded_offset;
+	__u32 compression;
+	__u32 encryption;
+	__u32 reserved[8];
+};
+
+#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
+				       struct btrfs_ioctl_encoded_io_args_32)
 #endif
 
 /* Mask out flags that are inappropriate for the given type of inode. */
@@ -4861,6 +4878,89 @@  static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
 	return ret;
 }
 
+static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
+				    bool compat)
+{
+	struct btrfs_ioctl_encoded_io_args args = {};
+	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
+					     flags);
+	size_t copy_end;
+	struct iovec iovstack[UIO_FASTIOV];
+	struct iovec *iov = iovstack;
+	struct iov_iter iter;
+	loff_t pos;
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto out_acct;
+	}
+
+	if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+		struct btrfs_ioctl_encoded_io_args_32 args32;
+
+		copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
+				       flags);
+		if (copy_from_user(&args32, argp, copy_end)) {
+			ret = -EFAULT;
+			goto out_acct;
+		}
+		args.iov = compat_ptr(args32.iov);
+		args.iovcnt = args32.iovcnt;
+		args.offset = args32.offset;
+		args.flags = args32.flags;
+#else
+		return -ENOTTY;
+#endif
+	} else {
+		copy_end = copy_end_kernel;
+		if (copy_from_user(&args, argp, copy_end)) {
+			ret = -EFAULT;
+			goto out_acct;
+		}
+	}
+	if (args.flags != 0) {
+		ret = -EINVAL;
+		goto out_acct;
+	}
+
+	ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+			   &iov, &iter);
+	if (ret < 0)
+		goto out_acct;
+
+	if (iov_iter_count(&iter) == 0) {
+		ret = 0;
+		goto out_iov;
+	}
+	pos = args.offset;
+	ret = rw_verify_area(READ, file, &pos, args.len);
+	if (ret < 0)
+		goto out_iov;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = pos;
+
+	ret = btrfs_encoded_read(&kiocb, &iter, &args);
+	if (ret >= 0) {
+		fsnotify_access(file);
+		if (copy_to_user(argp + copy_end,
+				 (char *)&args + copy_end_kernel,
+				 sizeof(args) - copy_end_kernel))
+			ret = -EFAULT;
+	}
+
+out_iov:
+	kfree(iov);
+out_acct:
+	if (ret > 0)
+		add_rchar(current, ret);
+	inc_syscr(current);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -5005,6 +5105,12 @@  long btrfs_ioctl(struct file *file, unsigned int
 		return fsverity_ioctl_enable(file, (const void __user *)argp);
 	case FS_IOC_MEASURE_VERITY:
 		return fsverity_ioctl_measure(file, argp);
+	case BTRFS_IOC_ENCODED_READ:
+		return btrfs_ioctl_encoded_read(file, argp, false);
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+	case BTRFS_IOC_ENCODED_READ_32:
+		return btrfs_ioctl_encoded_read(file, argp, true);
+#endif
 	}
 
 	return -ENOTTY;
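
Finally, a minimal userspace sketch of driving the new ioctl. It assumes a
kernel and <linux/btrfs.h> with this series applied (the
btrfs_ioctl_encoded_io_args structure and the BTRFS_IOC_ENCODED_READ number
come from an earlier patch in the series), and it requires CAP_SYS_ADMIN.
The 128K buffer size is an assumption chosen to be at least as large as any
btrfs compressed extent, so the -ENOBUFS case is avoided:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
	struct btrfs_ioctl_encoded_io_args args;
	static char buf[128 * 1024];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	ssize_t ret;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd == -1) {
		perror("open");
		return 1;
	}

	memset(&args, 0, sizeof(args));
	args.iov = &iov;
	args.iovcnt = 1;
	args.offset = 0;	/* read the encoded extent at file offset 0 */
	args.flags = 0;		/* must be zero in this version */

	/* Returns the number of encoded bytes written to the iovec. */
	ret = ioctl(fd, BTRFS_IOC_ENCODED_READ, &args);
	if (ret == -1) {
		perror("BTRFS_IOC_ENCODED_READ");
		close(fd);
		return 1;
	}

	printf("encoded bytes: %zd\n", ret);
	printf("len: %llu unencoded_len: %llu unencoded_offset: %llu compression: %u\n",
	       (unsigned long long)args.len,
	       (unsigned long long)args.unencoded_len,
	       (unsigned long long)args.unencoded_offset,
	       args.compression);
	close(fd);
	return 0;
}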