@@ -1546,7 +1546,7 @@ const struct file_operations random_fops = {
.compat_ioctl = compat_ptr_ioctl,
.fasync = random_fasync,
.llseek = noop_llseek,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.splice_write = iter_file_splice_write,
};
@@ -1557,7 +1557,7 @@ const struct file_operations urandom_fops = {
.compat_ioctl = compat_ptr_ioctl,
.fasync = random_fasync,
.llseek = noop_llseek,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.splice_write = iter_file_splice_write,
};
@@ -466,7 +466,7 @@ static const struct file_operations tty_fops = {
.llseek = no_llseek,
.read_iter = tty_read,
.write_iter = tty_write,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.splice_write = iter_file_splice_write,
.poll = tty_poll,
.unlocked_ioctl = tty_ioctl,
@@ -481,7 +481,7 @@ static const struct file_operations console_fops = {
.llseek = no_llseek,
.read_iter = tty_read,
.write_iter = redirected_tty_write,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.splice_write = iter_file_splice_write,
.poll = tty_poll,
.unlocked_ioctl = tty_ioctl,
@@ -23,6 +23,7 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/uio.h>
+#include <linux/splice.h>
#include <linux/coda.h>
#include "coda_psdev.h"
@@ -94,6 +95,39 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
}
+static ssize_t
+coda_file_splice_read(struct file *coda_file, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct inode *coda_inode = file_inode(coda_file);
+ struct coda_file_info *cfi = coda_ftoc(coda_file);
+ struct file *in = cfi->cfi_container;
+ loff_t ki_pos = *ppos;
+ ssize_t ret;
+
+ if (!in->f_op->splice_read)
+ return -EINVAL;
+
+ ret = rw_verify_area(READ, in, ppos, len);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+ &cfi->cfi_access_intent,
+ len, ki_pos, CODA_ACCESS_TYPE_READ);
+ if (ret)
+ goto finish_read;
+
+ ret = in->f_op->splice_read(in, ppos, pipe, len, flags);
+
+finish_read:
+ venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+ &cfi->cfi_access_intent,
+ len, ki_pos, CODA_ACCESS_TYPE_READ_FINISH);
+ return ret;
+}
+
static void
coda_vm_open(struct vm_area_struct *vma)
{
@@ -302,5 +336,5 @@ const struct file_operations coda_file_operations = {
.open = coda_open,
.release = coda_release,
.fsync = coda_fsync,
- .splice_read = generic_file_splice_read,
+ .splice_read = coda_file_splice_read,
};
@@ -1011,7 +1011,7 @@ const struct file_operations kernfs_file_fops = {
.release = kernfs_fop_release,
.poll = kernfs_fop_poll,
.fsync = noop_fsync,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.splice_write = iter_file_splice_write,
};
@@ -419,6 +419,40 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
return ret;
}
+static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ const struct cred *old_cred;
+ struct fd real;
+ ssize_t ret;
+
+ ret = ovl_real_fdget(in, &real);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
+ if (in->f_flags & O_DIRECT &&
+ !(real.file->f_mode & FMODE_CAN_ODIRECT))
+ goto out_fdput;
+ if (!real.file->f_op->splice_read)
+ goto out_fdput;
+
+ ret = rw_verify_area(READ, in, ppos, len);
+ if (unlikely(ret < 0))
+ return ret;
+
+ old_cred = ovl_override_creds(file_inode(in)->i_sb);
+ ret = real.file->f_op->splice_read(real.file, ppos, pipe, len, flags);
+
+ revert_creds(old_cred);
+ ovl_file_accessed(in);
+out_fdput:
+ fdput(real);
+
+ return ret;
+}
+
/*
* Calling iter_file_splice_write() directly from overlay's f_op may deadlock
* due to lock order inversion between pipe->mutex in iter_file_splice_write()
@@ -695,7 +729,7 @@ const struct file_operations ovl_file_operations = {
.fallocate = ovl_fallocate,
.fadvise = ovl_fadvise,
.flush = ovl_flush,
- .splice_read = generic_file_splice_read,
+ .splice_read = ovl_splice_read,
.splice_write = ovl_splice_write,
.copy_file_range = ovl_copy_file_range,
@@ -591,7 +591,7 @@ static const struct file_operations proc_iter_file_ops = {
.llseek = proc_reg_llseek,
.read_iter = proc_reg_read_iter,
.write = proc_reg_write,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
.mmap = proc_reg_mmap,
@@ -617,7 +617,7 @@ static const struct file_operations proc_reg_file_ops_compat = {
static const struct file_operations proc_iter_file_ops_compat = {
.llseek = proc_reg_llseek,
.read_iter = proc_reg_read_iter,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.write = proc_reg_write,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
@@ -869,7 +869,7 @@ static const struct file_operations proc_sys_file_operations = {
.poll = proc_sys_poll,
.read_iter = proc_sys_read,
.write_iter = proc_sys_write,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.splice_write = iter_file_splice_write,
.llseek = default_llseek,
};
@@ -324,7 +324,7 @@ static int mountstats_open(struct inode *inode, struct file *file)
const struct file_operations proc_mounts_operations = {
.open = mounts_open,
.read_iter = seq_read_iter,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.llseek = seq_lseek,
.release = mounts_release,
.poll = mounts_poll,
@@ -333,7 +333,7 @@ const struct file_operations proc_mounts_operations = {
const struct file_operations proc_mountinfo_operations = {
.open = mountinfo_open,
.read_iter = seq_read_iter,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.llseek = seq_lseek,
.release = mounts_release,
.poll = mounts_poll,
@@ -342,7 +342,7 @@ const struct file_operations proc_mountinfo_operations = {
const struct file_operations proc_mountstats_operations = {
.open = mountstats_open,
.read_iter = seq_read_iter,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.llseek = seq_lseek,
.release = mounts_release,
};
@@ -287,9 +287,9 @@ void splice_shrink_spd(struct splice_pipe_desc *spd)
* Splice data from an O_DIRECT file into pages and then add them to the output
* pipe.
*/
-static ssize_t direct_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len, unsigned int flags)
+ssize_t direct_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
{
struct iov_iter to;
struct bio_vec *bv;
@@ -3163,6 +3163,12 @@ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
struct iov_iter *iter);
/* fs/splice.c */
+ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags);
+ssize_t direct_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags);
extern ssize_t generic_file_splice_read(struct file *, loff_t *,
struct pipe_inode_info *, size_t, unsigned int);
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
@@ -67,10 +67,6 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
typedef int (splice_direct_actor)(struct pipe_inode_info *,
struct splice_desc *);
-ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len, unsigned int flags);
-
extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *,
loff_t *, size_t, unsigned int,
splice_actor *);
@@ -2846,9 +2846,8 @@ EXPORT_SYMBOL(generic_file_read_iter);
/*
* Splice subpages from a folio into a pipe.
*/
-static size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
- struct folio *folio,
- loff_t fpos, size_t size)
+size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio, loff_t fpos, size_t size)
{
struct page *page;
size_t spliced = 0, offset = offset_in_folio(folio, fpos);
@@ -794,6 +794,12 @@ struct migration_target_control {
gfp_t gfp_mask;
};
+/*
+ * mm/filemap.c
+ */
+size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio, loff_t fpos, size_t size);
+
/*
* mm/vmalloc.c
*/
@@ -2711,6 +2711,128 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return retval ? retval : error;
}
+static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ return false;
+}
+
+static const struct pipe_buf_operations zero_pipe_buf_ops = {
+ .release = generic_pipe_buf_release,
+ .try_steal = zero_pipe_buf_try_steal,
+ .get = generic_pipe_buf_get,
+};
+
+static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
+ loff_t fpos, size_t size)
+{
+ size_t offset = fpos & ~PAGE_MASK;
+
+ size = min(size, PAGE_SIZE - offset);
+
+ if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ struct pipe_buffer *buf = pipe_head_buf(pipe);
+
+ *buf = (struct pipe_buffer) {
+ .ops = &zero_pipe_buf_ops,
+ .page = ZERO_PAGE(0),
+ .offset = offset,
+ .len = size,
+ };
+ get_page(buf->page);
+ pipe->head++;
+ }
+
+ return size;
+}
+
+static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct inode *inode = file_inode(in);
+ struct address_space *mapping = inode->i_mapping;
+ struct folio *folio = NULL;
+ size_t total_spliced = 0, used, npages, n, part;
+ loff_t isize;
+ int error = 0;
+
+ /* Work out how much data we can actually add into the pipe */
+ used = pipe_occupancy(pipe->head, pipe->tail);
+ npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ len = min_t(size_t, len, npages * PAGE_SIZE);
+
+ do {
+ if (*ppos >= i_size_read(inode))
+ break;
+
+ error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ);
+ if (error) {
+ if (error == -EINVAL)
+ error = 0;
+ break;
+ }
+ if (folio) {
+ folio_unlock(folio);
+
+ if (folio_test_hwpoison(folio)) {
+ error = -EIO;
+ break;
+ }
+ }
+
+ /*
+ * i_size must be checked after we know the pages are Uptodate.
+ *
+ * Checking i_size after the check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+ isize = i_size_read(inode);
+ if (unlikely(*ppos >= isize))
+ break;
+ part = min_t(loff_t, isize - *ppos, len);
+
+ if (folio) {
+ /*
+ * If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+ folio_mark_accessed(folio);
+ /*
+ * Ok, we have the page, and it's up-to-date, so we can
+ * now splice it into the pipe.
+ */
+ n = splice_folio_into_pipe(pipe, folio, *ppos, part);
+ folio_put(folio);
+ folio = NULL;
+ } else {
+ n = splice_zeropage_into_pipe(pipe, *ppos, len);
+ }
+
+ if (!n)
+ break;
+ len -= n;
+ total_spliced += n;
+ *ppos += n;
+ in->f_ra.prev_pos = *ppos;
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ break;
+
+ cond_resched();
+ } while (len);
+
+ if (folio)
+ folio_put(folio);
+
+ file_accessed(in);
+ return total_spliced ? total_spliced : error;
+}
+
static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
struct address_space *mapping = file->f_mapping;
@@ -3929,7 +4051,7 @@ static const struct file_operations shmem_file_operations = {
.read_iter = shmem_file_read_iter,
.write_iter = generic_file_write_iter,
.fsync = noop_fsync,
- .splice_read = generic_file_splice_read,
+ .splice_read = shmem_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = shmem_fallocate,
#endif
The new filemap_splice_read() has an implicit expectation via filemap_get_pages() that ->read_folio() exists if ->readahead() doesn't fully populate the pagecache of the file it is reading from[1], potentially leading to a jump to NULL if this doesn't exist. A filesystem or driver shouldn't suffer from this if: - It doesn't set ->splice_read() - It implements ->read_folio() - It implements its own ->splice_read() Note that some filesystems set generic_file_splice_read() and generic_file_read_iter() but don't set ->read_folio(). g_f_read_iter() will fall back to filemap_read_iter() which looks like it should suffer from the same issue. Certain drivers, can just use direct_splice_read() rather than generic_file_splice_read() as that creates an output buffer and then just calls their ->read_iter() function: - random & urandom - tty - kernfs - proc - proc_namespace Stacked filesystems just need to pass the operation down a layer: - coda - overlayfs And finally, there's shmem (used in tmpfs, ramfs, rootfs). This needs its own splice-read implementation, based on filemap_splice_read(), but able to paste in zero_page when there's a page missing. Fixes: d9722a475711 ("splice: Do splice read from a buffered file without using ITER_PIPE") Signed-off-by: David Howells <dhowells@redhat.com> cc: Daniel Golle <daniel@makrotopia.org> cc: Guenter Roeck <groeck7@gmail.com> cc: Christoph Hellwig <hch@lst.de> cc: Jens Axboe <axboe@kernel.dk> cc: Al Viro <viro@zeniv.linux.org.uk> cc: John Hubbard <jhubbard@nvidia.com> cc: David Hildenbrand <david@redhat.com> cc: Matthew Wilcox <willy@infradead.org> cc: Miklos Szeredi <miklos@szeredi.hu> cc: Hugh Dickins <hughd@google.com> cc: Jan Harkes <jaharkes@cs.cmu.edu> cc: Arnd Bergmann <arnd@arndb.de> cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> cc: coda@cs.cmu.edu cc: codalist@coda.cs.cmu.edu cc: linux-unionfs@vger.kernel.org cc: linux-block@vger.kernel.org cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/Y+pdHFFTk1TTEBsO@makrotopia.org/ [1] --- drivers/char/random.c | 4 +- drivers/tty/tty_io.c | 4 +- fs/coda/file.c | 36 +++++++++++- fs/kernfs/file.c | 2 +- fs/overlayfs/file.c | 36 +++++++++++- fs/proc/inode.c | 4 +- fs/proc/proc_sysctl.c | 2 +- fs/proc_namespace.c | 6 +- fs/splice.c | 6 +- include/linux/fs.h | 6 ++ include/linux/splice.h | 4 -- mm/filemap.c | 5 +- mm/internal.h | 6 ++ mm/shmem.c | 124 ++++++++++++++++++++++++++++++++++++++++- 14 files changed, 221 insertions(+), 24 deletions(-)