@@ -524,7 +524,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
}
EXPORT_SYMBOL(generic_file_splice_read);
-static const struct pipe_buf_operations default_pipe_buf_ops = {
+const struct pipe_buf_operations default_pipe_buf_ops = {
.can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
@@ -85,4 +85,5 @@ extern void splice_shrink_spd(struct splice_pipe_desc *);
extern void spd_release_page(struct splice_pipe_desc *, unsigned int);
extern const struct pipe_buf_operations page_cache_pipe_buf_ops;
+extern const struct pipe_buf_operations default_pipe_buf_ops;
#endif
@@ -13,6 +13,7 @@
#include <uapi/linux/uio.h>
struct page;
+struct pipe_inode_info;
struct kvec {
void *iov_base; /* and that should *never* hold a userland pointer */
@@ -23,6 +24,7 @@ enum {
ITER_IOVEC = 0,
ITER_KVEC = 2,
ITER_BVEC = 4,
+ ITER_PIPE = 8,
};
struct iov_iter {
@@ -33,8 +35,12 @@ struct iov_iter {
const struct iovec *iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
+ struct pipe_inode_info *pipe;
+ };
+ union {
+ unsigned long nr_segs;
+ int idx;
};
- unsigned long nr_segs;
};
/*
@@ -64,7 +70,7 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
}
#define iov_for_each(iov, iter, start) \
- if (!((start).type & ITER_BVEC)) \
+ if (!((start).type & (ITER_BVEC | ITER_PIPE))) \
for (iter = (start); \
(iter).count && \
((iov = iov_iter_iovec(&(iter))), 1); \
@@ -94,6 +100,8 @@ void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec,
unsigned long nr_segs, size_t count);
void iov_iter_bvec(struct iov_iter *i, int direction, const struct bio_vec *bvec,
unsigned long nr_segs, size_t count);
+void iov_iter_pipe(struct iov_iter *i, int direction, struct pipe_inode_info *pipe,
+ size_t count);
ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
size_t maxsize, unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
@@ -109,7 +117,7 @@ static inline size_t iov_iter_count(struct iov_iter *i)
static inline bool iter_is_iovec(struct iov_iter *i)
{
- return !(i->type & (ITER_BVEC | ITER_KVEC));
+ return !(i->type & (ITER_BVEC | ITER_KVEC | ITER_PIPE));
}
/*
@@ -3,8 +3,11 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/splice.h>
#include <net/checksum.h>
+#define PIPE_PARANOIA /* for now */
+
#define iterate_iovec(i, n, __v, __p, skip, STEP) { \
size_t left; \
size_t wanted = n; \
@@ -290,6 +293,82 @@ done:
return wanted - bytes;
}
+#ifdef PIPE_PARANOIA
+/*
+ * sanity - check that a pipe-backed iterator still sits at the tail of the
+ * data queued in its pipe.
+ * @i: pipe-backed iterator; ->pipe, ->idx and ->iov_offset are read
+ *
+ * With a non-zero ->iov_offset the iterator must be inside the last queued
+ * buffer, exactly at the end of that buffer's data.  With a zero
+ * ->iov_offset, ->idx must name the slot right after the last queued
+ * buffer.  Any other state triggers WARN_ON().
+ *
+ * Returns true when the position is consistent, false otherwise.
+ *
+ * NOTE(review): the masking assumes pipe->buffers is a power of two -
+ * confirm against the code that sizes the ring.
+ */
+static bool sanity(const struct iov_iter *i)
+{
+	const struct pipe_inode_info *pipe = i->pipe;
+	/* queued slots from ->idx to the tail of the ring, modulo its size */
+	int behind_tail = (pipe->curbuf + pipe->nrbufs - i->idx) &
+			  (pipe->buffers - 1);
+	bool ok;
+
+	if (!i->iov_offset) {
+		/* between buffers: must be right after the last one */
+		ok = !behind_tail;
+	} else if (behind_tail != 1 || !pipe->nrbufs) {
+		/* inside a buffer, but not the last queued one */
+		ok = false;
+	} else {
+		/* inside the last buffer: must be at the end of its data */
+		const struct pipe_buffer *last = &pipe->bufs[i->idx];
+
+		ok = last->offset + last->len == i->iov_offset;
+	}
+	WARN_ON(!ok);
+	return ok;
+}
+#else
+/* paranoia compiled out: every position is accepted */
+static inline bool sanity(const struct iov_iter *i)
+{
+	return true;
+}
+#endif
+
+/*
+ * next_idx - ring slot that follows @idx in @pipe.
+ * Wraps by masking with pipe->buffers - 1.
+ * NOTE(review): only correct if pipe->buffers is a power of two - confirm.
+ */
+static inline int next_idx(int idx, struct pipe_inode_info *pipe)
+{
+	unsigned int wrap_mask = pipe->buffers - 1;
+
+	return (idx + 1) & wrap_mask;
+}
+
+/*
+ * copy_page_to_iter_pipe - queue part of @page in the pipe without copying.
+ * @page:   page whose data is handed over
+ * @offset: offset of the data inside @page
+ * @bytes:  amount wanted; clamped to i->count
+ * @i:      pipe-backed iterator
+ *
+ * If the iterator is inside a buffer that refers to @page and its
+ * ->iov_offset equals @offset, that buffer is grown by @bytes.  Otherwise
+ * the next ring slot is set up as a page_cache_pipe_buf_ops buffer holding
+ * its own reference to @page.
+ *
+ * Returns the number of bytes queued (the iterator is advanced by that
+ * much), or 0 if nothing was wanted, sanity() failed, or the ring has no
+ * free slot.
+ */
+static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
+			 struct iov_iter *i)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	struct pipe_buffer *slot;
+	size_t pos;
+	int where;
+
+	if (unlikely(bytes > i->count))
+		bytes = i->count;
+	if (unlikely(!bytes))
+		return 0;
+	if (!sanity(i))
+		return 0;
+
+	pos = i->iov_offset;
+	where = i->idx;
+	slot = &pipe->bufs[where];
+
+	if (pos) {
+		if (offset == pos && slot->page == page) {
+			/* contiguous with the last buffer: just extend it */
+			slot->len += bytes;
+			i->iov_offset += bytes;
+			i->count -= bytes;
+			return bytes;
+		}
+		/* current slot is taken; the new buffer goes into the next */
+		where = next_idx(where, pipe);
+		slot = &pipe->bufs[where];
+	}
+
+	/* wrapped round to the first queued buffer: the ring is full */
+	if (where == pipe->curbuf && pipe->nrbufs)
+		return 0;
+
+	get_page(page);
+	slot->page = page;
+	slot->ops = &page_cache_pipe_buf_ops;
+	slot->offset = offset;
+	slot->len = bytes;
+	pipe->nrbufs++;
+
+	i->idx = where;
+	i->iov_offset = offset + bytes;
+	i->count -= bytes;
+	return bytes;
+}
+
/*
* Fault in the first iovec of the given iov_iter, to a maximum length
* of bytes. Returns 0 on success, or non-zero if the memory could not be
@@ -376,9 +455,98 @@ static void memzero_page(struct page *page, size_t offset, size_t len)
kunmap_atomic(addr);
}
+/*
+ * allocated - true if @buf carries default_pipe_buf_ops, the ops that
+ * push_pipe() installs on the pages it allocates itself.
+ */
+static inline bool allocated(struct pipe_buffer *buf)
+{
+	const struct pipe_buf_operations *ops = buf->ops;
+
+	return ops == &default_pipe_buf_ops;
+}
+
+/*
+ * data_start - work out where the next copied byte would land.
+ * @i:    pipe-backed iterator (not modified)
+ * @idxp: out: ring slot
+ * @offp: out: offset inside that slot's page
+ *
+ * The iterator's own position is used as is when ->iov_offset is zero, or
+ * when it is inside a page that allocated() recognises and that page is
+ * not yet full.  In every other case new data starts at offset 0 of the
+ * following slot.
+ */
+static inline void data_start(const struct iov_iter *i, int *idxp, size_t *offp)
+{
+	size_t pos = i->iov_offset;
+	int slot = i->idx;
+
+	/* can't append: page is full, or it is not one of our own pages */
+	if (pos && !(pos != PAGE_SIZE && allocated(&i->pipe->bufs[slot]))) {
+		slot = next_idx(slot, i->pipe);
+		pos = 0;
+	}
+	*idxp = slot;
+	*offp = pos;
+}
+
+/*
+ * push_pipe - make room in the pipe for up to @size more bytes.
+ * @i:    pipe-backed iterator; its position and count are not changed here
+ * @size: bytes wanted; clamped to i->count
+ * @idxp: out: ring slot where the new data starts
+ * @offp: out: offset inside that slot's page where the new data starts
+ *
+ * Room is taken first from the rest of the page data_start() points into
+ * (when it returns a non-zero offset), then from pages allocated with
+ * GFP_USER and queued as default_pipe_buf_ops buffers, until the ring
+ * wraps round to ->curbuf or an allocation fails.  Buffer lengths are set
+ * as if the data were already present; nothing is copied here.
+ *
+ * Returns the number of bytes room was made for, which may be less than
+ * @size.  *@idxp and *@offp are left untouched when the clamped request
+ * is empty.
+ */
+static size_t push_pipe(struct iov_iter *i, size_t size,
+			int *idxp, size_t *offp)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	size_t first_off;
+	ssize_t todo;
+	int slot;
+
+	if (unlikely(size > i->count))
+		size = i->count;
+	if (unlikely(!size))
+		return 0;
+
+	data_start(i, &slot, &first_off);
+	*idxp = slot;
+	*offp = first_off;
+	todo = size;
+
+	if (first_off) {
+		struct pipe_buffer *partial = &pipe->bufs[slot];
+
+		/* use up what is left of the partially filled page */
+		todo -= PAGE_SIZE - first_off;
+		if (todo <= 0) {
+			partial->len += size;
+			return size;
+		}
+		partial->len = PAGE_SIZE;
+		slot = next_idx(slot, pipe);
+	}
+
+	/* append whole pages until the ring wraps round to ->curbuf */
+	for ( ; slot != pipe->curbuf || !pipe->nrbufs;
+	     slot = next_idx(slot, pipe)) {
+		struct pipe_buffer *buf = &pipe->bufs[slot];
+		struct page *page = alloc_page(GFP_USER);
+
+		if (!page)
+			break;
+		pipe->nrbufs++;
+		buf->ops = &default_pipe_buf_ops;
+		buf->page = page;
+		buf->offset = 0;
+		if (todo <= PAGE_SIZE) {
+			/* last page needed: it takes the remainder */
+			buf->len = todo;
+			return size;
+		}
+		buf->len = PAGE_SIZE;
+		todo -= PAGE_SIZE;
+	}
+	/* ran out of slots or memory: report what did fit */
+	return size - todo;
+}
+
+/*
+ * copy_pipe_to_iter - copy kernel memory into a pipe-backed iterator.
+ * @addr:  source of the data
+ * @bytes: amount wanted
+ * @i:     pipe-backed iterator
+ *
+ * push_pipe() provides the room; the data is then copied page by page
+ * and the iterator is left right after the last byte copied.
+ *
+ * Returns the number of bytes copied, which is whatever push_pipe() could
+ * make room for; 0 if sanity() failed or no room could be made.
+ */
+static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
+				struct iov_iter *i)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	const char *src = addr;
+	size_t copied, left, off;
+	int idx;
+
+	if (!sanity(i))
+		return 0;
+
+	copied = push_pipe(i, bytes, &idx, &off);
+	if (unlikely(!copied))
+		return 0;
+
+	left = copied;
+	while (left) {
+		/* only the first page may start at a non-zero offset */
+		size_t chunk = min_t(size_t, left, PAGE_SIZE - off);
+
+		memcpy_to_page(pipe->bufs[idx].page, off, src, chunk);
+		i->idx = idx;
+		i->iov_offset = off + chunk;
+		src += chunk;
+		left -= chunk;
+		idx = next_idx(idx, pipe);
+		off = 0;
+	}
+	i->count -= copied;
+	return copied;
+}
+
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
const char *from = addr;
+ if (unlikely(i->type & ITER_PIPE))
+ return copy_pipe_to_iter(addr, bytes, i);
iterate_and_advance(i, bytes, v,
__copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
v.iov_len),
@@ -394,6 +562,10 @@ EXPORT_SYMBOL(copy_to_iter);
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
char *to = addr;
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return 0;
+ }
iterate_and_advance(i, bytes, v,
__copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
v.iov_len),
@@ -409,6 +581,10 @@ EXPORT_SYMBOL(copy_from_iter);
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
char *to = addr;
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return 0;
+ }
iterate_and_advance(i, bytes, v,
__copy_from_user_nocache((to += v.iov_len) - v.iov_len,
v.iov_base, v.iov_len),
@@ -429,14 +605,20 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
kunmap_atomic(kaddr);
return wanted;
- } else
+ } else if (likely(!(i->type & ITER_PIPE)))
return copy_page_to_iter_iovec(page, offset, bytes, i);
+ else
+ return copy_page_to_iter_pipe(page, offset, bytes, i);
}
EXPORT_SYMBOL(copy_page_to_iter);
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return 0;
+ }
if (i->type & (ITER_BVEC|ITER_KVEC)) {
void *kaddr = kmap_atomic(page);
size_t wanted = copy_from_iter(kaddr + offset, bytes, i);
@@ -447,8 +629,34 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
}
EXPORT_SYMBOL(copy_page_from_iter);
+/*
+ * pipe_zero - append zeroes to a pipe-backed iterator.
+ * @bytes: amount wanted
+ * @i:     pipe-backed iterator
+ *
+ * Same scheme as copying into the pipe: push_pipe() provides the room,
+ * which is then cleared page by page with memzero_page(), leaving the
+ * iterator right after the last byte cleared.
+ *
+ * Returns the number of bytes zeroed; 0 if sanity() failed or no room
+ * could be made.
+ */
+static size_t pipe_zero(size_t bytes, struct iov_iter *i)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	size_t zeroed, left, off;
+	int idx;
+
+	if (!sanity(i))
+		return 0;
+
+	zeroed = push_pipe(i, bytes, &idx, &off);
+	if (unlikely(!zeroed))
+		return 0;
+
+	left = zeroed;
+	while (left) {
+		/* only the first page may start at a non-zero offset */
+		size_t chunk = min_t(size_t, left, PAGE_SIZE - off);
+
+		memzero_page(pipe->bufs[idx].page, off, chunk);
+		i->idx = idx;
+		i->iov_offset = off + chunk;
+		left -= chunk;
+		idx = next_idx(idx, pipe);
+		off = 0;
+	}
+	i->count -= zeroed;
+	return zeroed;
+}
+
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
+ if (unlikely(i->type & ITER_PIPE))
+ return pipe_zero(bytes, i);
iterate_and_advance(i, bytes, v,
__clear_user(v.iov_base, v.iov_len),
memzero_page(v.bv_page, v.bv_offset, v.bv_len),
@@ -463,6 +671,11 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
struct iov_iter *i, unsigned long offset, size_t bytes)
{
char *kaddr = kmap_atomic(page), *p = kaddr + offset;
+ if (unlikely(i->type & ITER_PIPE)) {
+ kunmap_atomic(kaddr);
+ WARN_ON(1);
+ return 0;
+ }
iterate_all_kinds(i, bytes, v,
__copy_from_user_inatomic((p += v.iov_len) - v.iov_len,
v.iov_base, v.iov_len),
@@ -475,8 +688,55 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
+/*
+ * pipe_advance - move a pipe-backed iterator forward and drop what lies
+ * beyond the new position.
+ * @i:    pipe-backed iterator
+ * @size: bytes to advance by; clamped to i->count
+ *
+ * The position is walked forward over the queued buffers, i->count is
+ * reduced by the distance covered, the buffer the iterator ends up in is
+ * cut at the new position and every buffer queued after it is released.
+ * Calling this with @size == 0 therefore just discards whatever was queued
+ * past the current position.
+ *
+ * The number of buffers to keep is counted from ->curbuf instead of
+ * walking until an end index is reached: when every slot of the ring has
+ * to be released, start and end index coincide and an index walk would
+ * release nothing.
+ *
+ * NOTE(review): ->idx == ->curbuf with a zero ->iov_offset is taken to mean
+ * "nothing before the iterator", so everything queued is released.  That
+ * assumes the iterator was not set up on an already full pipe - confirm
+ * that callers make sure there is free space first.
+ * NOTE(review): assumes the other iterator flavours also reduce ->count
+ * when advanced - confirm against iterate_and_advance().
+ */
+static void pipe_advance(struct iov_iter *i, size_t size)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	struct pipe_buffer *buf;
+	size_t off = i->iov_offset;
+	int idx = i->idx;
+	int keep;
+
+	if (unlikely(i->count < size))
+		size = i->count;
+
+	if (size) {
+		size_t left = size;
+
+		/*
+		 * Make the distance relative to the start of the current
+		 * buffer.  With a zero offset the iterator is not inside a
+		 * buffer, so there is nothing to add.
+		 */
+		if (off)
+			left += off - pipe->bufs[idx].offset;
+		while (1) {
+			buf = &pipe->bufs[idx];
+			if (left <= buf->len)
+				break;
+			left -= buf->len;
+			idx = next_idx(idx, pipe);
+		}
+		off = buf->offset + left;
+		i->idx = idx;
+		i->iov_offset = off;
+	}
+	/* the distance covered no longer counts as remaining room */
+	i->count -= size;
+
+	if (!pipe->nrbufs)
+		return;
+
+	/* buffers in [curbuf, idx) lie before the iterator and stay */
+	keep = (idx - pipe->curbuf) & (pipe->buffers - 1);
+	if (off) {
+		/* so does the one it is inside, cut at the new position */
+		buf = &pipe->bufs[idx];
+		buf->len = off - buf->offset;
+		idx = next_idx(idx, pipe);
+		keep++;
+	}
+	/* everything queued beyond that is dropped */
+	while (pipe->nrbufs > keep) {
+		buf = &pipe->bufs[idx];
+		buf->ops->release(pipe, buf);
+		buf->ops = NULL;
+		idx = next_idx(idx, pipe);
+		pipe->nrbufs--;
+	}
+}
+
void iov_iter_advance(struct iov_iter *i, size_t size)
{
+ if (unlikely(i->type & ITER_PIPE)) {
+ pipe_advance(i, size);
+ return;
+ }
iterate_and_advance(i, size, v, 0, 0, 0)
}
EXPORT_SYMBOL(iov_iter_advance);
@@ -486,6 +746,8 @@ EXPORT_SYMBOL(iov_iter_advance);
*/
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
+ if (unlikely(i->type & ITER_PIPE))
+ return i->count; // it is a silly place, anyway
if (i->nr_segs == 1)
return i->count;
else if (i->type & ITER_BVEC)
@@ -521,6 +783,19 @@ void iov_iter_bvec(struct iov_iter *i, int direction,
}
EXPORT_SYMBOL(iov_iter_bvec);
+/*
+ * iov_iter_pipe - set up an iterator that appends data to a pipe.
+ * @i:         iterator to initialise
+ * @direction: must be exactly ITER_PIPE (anything else is a BUG)
+ * @pipe:      pipe the data goes into
+ * @count:     maximum number of bytes that may be added through @i
+ *
+ * The iterator starts at the first slot after the buffers currently
+ * queued in @pipe, with a zero offset.
+ */
+void iov_iter_pipe(struct iov_iter *i, int direction,
+			struct pipe_inode_info *pipe,
+			size_t count)
+{
+	BUG_ON(direction != ITER_PIPE);
+	/*
+	 * On a full ring (curbuf + nrbufs) wraps back to curbuf, so the
+	 * start position would look exactly like the one on an empty pipe
+	 * and nothing could be added anyway.  Make such a call visible.
+	 * NOTE(review): assumes callers are expected to make sure there is
+	 * free space before setting up the iterator - confirm.
+	 */
+	WARN_ON(pipe->nrbufs == pipe->buffers);
+	i->type = direction;
+	i->pipe = pipe;
+	i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+	i->iov_offset = 0;
+	i->count = count;
+}
+EXPORT_SYMBOL(iov_iter_pipe);
+
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
unsigned long res = 0;
@@ -529,6 +804,11 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
if (!size)
return 0;
+ if (unlikely(i->type & ITER_PIPE)) {
+ if (i->iov_offset && allocated(&i->pipe->bufs[i->idx]))
+ return size | i->iov_offset;
+ return size;
+ }
iterate_all_kinds(i, size, v,
(res |= (unsigned long)v.iov_base | v.iov_len, 0),
res |= v.bv_offset | v.bv_len,
@@ -545,6 +825,11 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
if (!size)
return 0;
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return ~0U;
+ }
+
iterate_all_kinds(i, size, v,
(res |= (!res ? 0 : (unsigned long)v.iov_base) |
(size != v.iov_len ? size : 0), 0),
@@ -557,6 +842,47 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
}
EXPORT_SYMBOL(iov_iter_gap_alignment);
+/*
+ * __pipe_get_pages - make room in the pipe and report the pages behind it.
+ * @i:       pipe-backed iterator (its position is not moved)
+ * @maxsize: bytes wanted
+ * @pages:   array that receives one entry per page touched
+ * @idx:     overwritten by push_pipe() with the slot the data starts in
+ * @start:   out: offset of the data inside the first page
+ *
+ * Every page that holds part of the returned range gets an entry in
+ * @pages, including a last page that is only partly covered; callers size
+ * the array by rounding *@start + size up to whole pages.
+ *
+ * NOTE(review): a reference is taken on each returned page on the
+ * assumption that callers drop one per page when they are done, as they
+ * would for pages obtained from the other iterator flavours - confirm.
+ *
+ * Returns the number of bytes room was made for, 0 if none.
+ */
+static inline size_t __pipe_get_pages(struct iov_iter *i,
+				size_t maxsize,
+				struct page **pages,
+				int idx,
+				size_t *start)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	size_t n = push_pipe(i, maxsize, &idx, start);
+	size_t npages;
+
+	if (!n)
+		return 0;
+
+	/* count pages, not bytes, so a trailing partial page is included */
+	npages = DIV_ROUND_UP(n + *start, PAGE_SIZE);
+	while (npages--) {
+		get_page(*pages++ = pipe->bufs[idx].page);
+		idx = next_idx(idx, pipe);
+	}
+
+	return n;
+}
+
+/*
+ * pipe_get_pages - iov_iter_get_pages() for pipe-backed iterators.
+ * @i:        pipe-backed iterator
+ * @pages:    caller's array, room for @maxpages entries
+ * @maxsize:  upper limit on the number of bytes
+ * @maxpages: upper limit on the number of pages
+ * @start:    out: offset of the data inside the first page
+ *
+ * The request is limited to what fits both into @maxpages pages and into
+ * the ring slots from the data start round to ->curbuf, then handed to
+ * __pipe_get_pages().
+ *
+ * Returns what __pipe_get_pages() returns, or 0 if sanity() failed.
+ */
+static ssize_t pipe_get_pages(struct iov_iter *i,
+		   struct page **pages, size_t maxsize, unsigned maxpages,
+		   size_t *start)
+{
+	struct pipe_inode_info *pipe;
+	unsigned room;
+	size_t limit;
+	int idx;
+
+	if (!sanity(i))
+		return 0;
+
+	pipe = i->pipe;
+	data_start(i, &idx, start);
+	/* the slot the data starts in plus all slots after it */
+	room = ((pipe->curbuf - idx - 1) & (pipe->buffers - 1)) + 1;
+	if (room > maxpages)
+		room = maxpages;
+	/* bytes those pages can take, starting at *start in the first */
+	limit = room * PAGE_SIZE - *start;
+	if (limit > maxsize)
+		limit = maxsize;
+
+	return __pipe_get_pages(i, limit, pages, idx, start);
+}
+
ssize_t iov_iter_get_pages(struct iov_iter *i,
struct page **pages, size_t maxsize, unsigned maxpages,
size_t *start)
@@ -567,6 +893,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
if (!maxsize)
return 0;
+ if (unlikely(i->type & ITER_PIPE))
+ return pipe_get_pages(i, pages, maxsize, maxpages, start);
iterate_all_kinds(i, maxsize, v, ({
unsigned long addr = (unsigned long)v.iov_base;
size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
@@ -602,6 +930,37 @@ static struct page **get_pages_array(size_t n)
return p;
}
+/*
+ * pipe_get_pages_alloc - iov_iter_get_pages_alloc() for pipe-backed
+ * iterators.
+ * @i:       pipe-backed iterator
+ * @pages:   out: array of pages allocated with get_pages_array(); only
+ *           set when a non-zero size is returned
+ * @maxsize: upper limit on the number of bytes
+ * @start:   out: offset of the data inside the first page
+ *
+ * The request is limited to what fits into the ring slots from the data
+ * start round to ->curbuf; the array is sized for the pages that limited
+ * request can touch.
+ *
+ * Returns the number of bytes room was made for, 0 if sanity() failed or
+ * no room could be made (the array is freed again), or -ENOMEM if the
+ * array could not be allocated.
+ */
+static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
+		   struct page ***pages, size_t maxsize,
+		   size_t *start)
+{
+	struct page **array;
+	size_t room, got;
+	int nslots;
+	int idx;
+
+	if (!sanity(i))
+		return 0;
+
+	data_start(i, &idx, start);
+	/* the slot the data starts in plus all slots after it */
+	nslots = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1;
+	room = nslots * PAGE_SIZE - *start;
+	if (maxsize > room)
+		maxsize = room;
+	else
+		nslots = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
+
+	array = get_pages_array(nslots);
+	if (!array)
+		return -ENOMEM;
+
+	got = __pipe_get_pages(i, maxsize, array, idx, start);
+	if (!got) {
+		/* nothing to report: don't hand out an empty array */
+		kvfree(array);
+		return 0;
+	}
+	*pages = array;
+	return got;
+}
+
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
struct page ***pages, size_t maxsize,
size_t *start)
@@ -614,6 +973,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
if (!maxsize)
return 0;
+ if (unlikely(i->type & ITER_PIPE))
+ return pipe_get_pages_alloc(i, pages, maxsize, start);
iterate_all_kinds(i, maxsize, v, ({
unsigned long addr = (unsigned long)v.iov_base;
size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
@@ -655,6 +1016,10 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
__wsum sum, next;
size_t off = 0;
sum = *csum;
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return 0;
+ }
iterate_and_advance(i, bytes, v, ({
int err = 0;
next = csum_and_copy_from_user(v.iov_base,
@@ -693,6 +1058,10 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum,
__wsum sum, next;
size_t off = 0;
sum = *csum;
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1); /* for now */
+ return 0;
+ }
iterate_and_advance(i, bytes, v, ({
int err = 0;
next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
@@ -732,7 +1101,20 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
if (!size)
return 0;
- iterate_all_kinds(i, size, v, ({
+ if (unlikely(i->type & ITER_PIPE)) {
+ struct pipe_inode_info *pipe = i->pipe;
+ size_t off;
+ int idx;
+
+ if (!sanity(i))
+ return 0;
+
+ data_start(i, &idx, &off);
+ /* some of this one + all after this one */
+ npages = ((pipe->curbuf - idx - 1) & (pipe->buffers - 1)) + 1;
+ if (npages >= maxpages)
+ return maxpages;
+ } else iterate_all_kinds(i, size, v, ({
unsigned long p = (unsigned long)v.iov_base;
npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
- p / PAGE_SIZE;
@@ -757,6 +1139,10 @@ EXPORT_SYMBOL(iov_iter_npages);
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
*new = *old;
+ if (unlikely(new->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return NULL;
+ }
if (new->type & ITER_BVEC)
return new->bvec = kmemdup(new->bvec,
new->nr_segs * sizeof(struct bio_vec),
iov_iter variant for passing data into pipe.  copy_to_iter()
copies data into page(s) it has allocated and stuffs them into
the pipe; copy_page_to_iter() stuffs there a reference to the
page given to it.  Both will try to coalesce if possible.
iov_iter_zero() is similar to copy_to_iter(); iov_iter_get_pages()
and friends will do as copy_to_iter() would have and return the
pages where the data would've been copied.  iov_iter_advance()
will truncate everything past the spot it has advanced to.

New primitive: iov_iter_pipe(), used for initializing those.
pipe should be locked all along.

Running out of space acts as fault would for iovec-backed ones;
in other words, giving it to ->read_iter() may result in short
read if the pipe overflows, or -EFAULT if it happens with nothing
copied there.

In other words, ->read_iter() on those acts pretty much like
->splice_read().  Moreover, all generic_file_splice_read() users,
as well as many other ->splice_read() instances can be switched
to that scheme - that'll happen in the next commit.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
[this certainly needs to be documented in more detail]

 fs/splice.c            | 2 +-
 include/linux/splice.h | 1 +
 include/linux/uio.h    | 14 +-
 lib/iov_iter.c         | 390 ++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 401 insertions(+), 6 deletions(-)