@@ -11,6 +11,7 @@
#include <uapi/linux/uio.h>
struct page;
+struct address_space;
struct pipe_inode_info;
struct kvec {
@@ -25,6 +26,7 @@ enum iter_type {
ITER_BVEC = 16,
ITER_PIPE = 32,
ITER_DISCARD = 64,
+ ITER_MAPPING = 128,
};
struct iov_iter {
@@ -40,6 +42,7 @@ struct iov_iter {
const struct iovec *iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
+ struct address_space *mapping;
struct pipe_inode_info *pipe;
};
union {
@@ -48,6 +51,7 @@ struct iov_iter {
unsigned int head;
unsigned int start_head;
};
+ loff_t mapping_start;
};
};
@@ -81,6 +85,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter *i)
return iov_iter_type(i) == ITER_DISCARD;
}
+static inline bool iov_iter_is_mapping(const struct iov_iter *i)
+{
+ return iov_iter_type(i) == ITER_MAPPING;
+}
+
static inline unsigned char iov_iter_rw(const struct iov_iter *i)
{
return i->type & (READ | WRITE);
@@ -222,6 +231,8 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_
void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe,
size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
+void iov_iter_mapping(struct iov_iter *i, unsigned int direction, struct address_space *mapping,
+ loff_t start, size_t count);
ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
size_t maxsize, unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
@@ -74,7 +74,40 @@
} \
}
-#define iterate_all_kinds(i, n, v, I, B, K) { \
+#define iterate_mapping(i, n, __v, skip, STEP) { \
+ struct page *page; \
+ size_t wanted = n, seg, offset; \
+ loff_t start = i->mapping_start + skip; \
+ pgoff_t index = start >> PAGE_SHIFT; \
+ \
+ XA_STATE(xas, &i->mapping->i_pages, index); \
+ \
+ rcu_read_lock(); \
+ for (page = xas_load(&xas); page; page = xas_next(&xas)) { \
+ if (xas_retry(&xas, page)) \
+ continue; \
+ if (xa_is_value(page)) \
+ break; \
+ if (PageCompound(page)) \
+ break; \
+ if (page_to_pgoff(page) != xas.xa_index) \
+ break; \
+ __v.bv_page = page; \
+ offset = (i->mapping_start + skip) & ~PAGE_MASK; \
+ seg = PAGE_SIZE - offset; \
+ __v.bv_offset = offset; \
+ __v.bv_len = min(n, seg); \
+ (void)(STEP); \
+ n -= __v.bv_len; \
+ skip += __v.bv_len; \
+ if (n == 0) \
+ break; \
+ } \
+ rcu_read_unlock(); \
+ n = wanted - n; \
+}
+
+#define iterate_all_kinds(i, n, v, I, B, K, M) { \
if (likely(n)) { \
size_t skip = i->iov_offset; \
if (unlikely(i->type & ITER_BVEC)) { \
@@ -86,6 +119,9 @@
struct kvec v; \
iterate_kvec(i, n, v, kvec, skip, (K)) \
} else if (unlikely(i->type & ITER_DISCARD)) { \
+ } else if (unlikely(i->type & ITER_MAPPING)) { \
+ struct bio_vec v; \
+ iterate_mapping(i, n, v, skip, (M)); \
} else { \
const struct iovec *iov; \
struct iovec v; \
@@ -94,7 +130,7 @@
} \
}
-#define iterate_and_advance(i, n, v, I, B, K) { \
+#define iterate_and_advance(i, n, v, I, B, K, M) { \
if (unlikely(i->count < n)) \
n = i->count; \
if (i->count) { \
@@ -119,6 +155,9 @@
i->kvec = kvec; \
} else if (unlikely(i->type & ITER_DISCARD)) { \
skip += n; \
+ } else if (unlikely(i->type & ITER_MAPPING)) { \
+ struct bio_vec v; \
+ iterate_mapping(i, n, v, skip, (M)) \
} else { \
const struct iovec *iov; \
struct iovec v; \
@@ -628,7 +667,9 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
memcpy_to_page(v.bv_page, v.bv_offset,
(from += v.bv_len) - v.bv_len, v.bv_len),
- memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
+ memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
+ memcpy_to_page(v.bv_page, v.bv_offset,
+ (from += v.bv_len) - v.bv_len, v.bv_len)
)
return bytes;
@@ -746,6 +787,15 @@ size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i)
bytes = curr_addr - s_addr - rem;
return bytes;
}
+ }),
+ ({
+ rem = memcpy_mcsafe_to_page(v.bv_page, v.bv_offset,
+ (from += v.bv_len) - v.bv_len, v.bv_len);
+ if (rem) {
+ curr_addr = (unsigned long) from;
+ bytes = curr_addr - s_addr - rem;
+ return bytes;
+ }
})
)
@@ -767,7 +817,9 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
copyin((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
v.bv_offset, v.bv_len),
- memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+ memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
+ memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+ v.bv_offset, v.bv_len)
)
return bytes;
@@ -793,7 +845,9 @@ bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
0;}),
memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
v.bv_offset, v.bv_len),
- memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+ memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
+ memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+ v.bv_offset, v.bv_len)
)
iov_iter_advance(i, bytes);
@@ -813,7 +867,9 @@ size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
v.iov_base, v.iov_len),
memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
v.bv_offset, v.bv_len),
- memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+ memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
+ memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+ v.bv_offset, v.bv_len)
)
return bytes;
@@ -848,7 +904,9 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
v.bv_offset, v.bv_len),
memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
- v.iov_len)
+ v.iov_len),
+ memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
+ v.bv_offset, v.bv_len)
)
return bytes;
@@ -872,7 +930,9 @@ bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
0;}),
memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
v.bv_offset, v.bv_len),
- memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+ memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
+ memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+ v.bv_offset, v.bv_len)
)
iov_iter_advance(i, bytes);
@@ -909,7 +969,7 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
{
if (unlikely(!page_copy_sane(page, offset, bytes)))
return 0;
- if (i->type & (ITER_BVEC|ITER_KVEC)) {
+ if (i->type & (ITER_BVEC | ITER_KVEC | ITER_MAPPING)) {
void *kaddr = kmap_atomic(page);
size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
kunmap_atomic(kaddr);
@@ -932,7 +992,7 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
WARN_ON(1);
return 0;
}
- if (i->type & (ITER_BVEC|ITER_KVEC)) {
+ if (i->type & (ITER_BVEC | ITER_KVEC | ITER_MAPPING)) {
void *kaddr = kmap_atomic(page);
size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
kunmap_atomic(kaddr);
@@ -976,7 +1036,8 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
iterate_and_advance(i, bytes, v,
clear_user(v.iov_base, v.iov_len),
memzero_page(v.bv_page, v.bv_offset, v.bv_len),
- memset(v.iov_base, 0, v.iov_len)
+ memset(v.iov_base, 0, v.iov_len),
+ memzero_page(v.bv_page, v.bv_offset, v.bv_len)
)
return bytes;
@@ -1000,7 +1061,9 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
v.bv_offset, v.bv_len),
- memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+ memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
+ memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
+ v.bv_offset, v.bv_len)
)
kunmap_atomic(kaddr);
return bytes;
@@ -1071,7 +1134,13 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
i->count -= size;
return;
}
- iterate_and_advance(i, size, v, 0, 0, 0)
+ if (unlikely(iov_iter_is_mapping(i))) {
+ /* We really don't want to fetch pages if we can avoid it */
+ i->iov_offset += size;
+ i->count -= size;
+ return;
+ }
+ iterate_and_advance(i, size, v, 0, 0, 0, 0)
}
EXPORT_SYMBOL(iov_iter_advance);
@@ -1115,7 +1184,12 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
return;
}
unroll -= i->iov_offset;
- if (iov_iter_is_bvec(i)) {
+ if (iov_iter_is_mapping(i)) {
+ BUG(); /* We should never go beyond the start of the specified
+ * range since we might then be straying into pages that
+ * aren't pinned.
+ */
+ } else if (iov_iter_is_bvec(i)) {
const struct bio_vec *bvec = i->bvec;
while (1) {
size_t n = (--bvec)->bv_len;
@@ -1152,9 +1226,9 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
return i->count; // it is a silly place, anyway
if (i->nr_segs == 1)
return i->count;
- if (unlikely(iov_iter_is_discard(i)))
+ if (unlikely(iov_iter_is_discard(i) || iov_iter_is_mapping(i)))
return i->count;
- else if (iov_iter_is_bvec(i))
+ if (iov_iter_is_bvec(i))
return min(i->count, i->bvec->bv_len - i->iov_offset);
else
return min(i->count, i->iov->iov_len - i->iov_offset);
@@ -1202,6 +1276,32 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
}
EXPORT_SYMBOL(iov_iter_pipe);
+/**
+ * iov_iter_mapping - Initialise an I/O iterator to use the pages in a mapping
+ * @i: The iterator to initialise.
+ * @direction: The direction of the transfer.
+ * @mapping: The mapping to access.
+ * @start: The start file position.
+ * @count: The size of the I/O buffer in bytes.
+ *
+ * Set up an I/O iterator to either draw data out of the pages attached to an
+ * inode or to inject data into those pages. The pages *must* be prevented
+ * from evaporation, either by taking a ref on them or locking them by the
+ * caller.
+ */
+void iov_iter_mapping(struct iov_iter *i, unsigned int direction,
+ struct address_space *mapping,
+ loff_t start, size_t count)
+{
+ BUG_ON(direction & ~1);
+ i->type = ITER_MAPPING | (direction & (READ | WRITE));
+ i->mapping = mapping;
+ i->mapping_start = start;
+ i->count = count;
+ i->iov_offset = 0;
+}
+EXPORT_SYMBOL(iov_iter_mapping);
+
/**
* iov_iter_discard - Initialise an I/O iterator that discards data
* @i: The iterator to initialise.
@@ -1235,7 +1335,8 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
iterate_all_kinds(i, size, v,
(res |= (unsigned long)v.iov_base | v.iov_len, 0),
res |= v.bv_offset | v.bv_len,
- res |= (unsigned long)v.iov_base | v.iov_len
+ res |= (unsigned long)v.iov_base | v.iov_len,
+ res |= v.bv_offset | v.bv_len
)
return res;
}
@@ -1257,7 +1358,9 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
(res |= (!res ? 0 : (unsigned long)v.bv_offset) |
(size != v.bv_len ? size : 0)),
(res |= (!res ? 0 : (unsigned long)v.iov_base) |
- (size != v.iov_len ? size : 0))
+ (size != v.iov_len ? size : 0)),
+ (res |= (!res ? 0 : (unsigned long)v.bv_offset) |
+ (size != v.bv_len ? size : 0))
);
return res;
}
@@ -1307,6 +1410,46 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
}
+static ssize_t iter_mapping_get_pages(struct iov_iter *i,
+ struct page **pages, size_t maxsize,
+ unsigned maxpages, size_t *_start_offset)
+{
+ unsigned nr, offset;
+ pgoff_t index, count;
+ size_t size = maxsize, actual;
+ loff_t pos;
+
+ if (!size || !maxpages)
+ return 0;
+
+ pos = i->mapping_start + i->iov_offset;
+ index = pos >> PAGE_SHIFT;
+ offset = pos & ~PAGE_MASK;
+ *_start_offset = offset;
+
+ count = 1;
+ if (size > PAGE_SIZE - offset) {
+ size -= PAGE_SIZE - offset;
+ count += size >> PAGE_SHIFT;
+ size &= ~PAGE_MASK;
+ if (size)
+ count++;
+ }
+
+ if (count > maxpages)
+ count = maxpages;
+
+ nr = find_get_pages_contig(i->mapping, index, count, pages);
+ if (nr == 0)
+ return 0;
+
+ actual = PAGE_SIZE * nr;
+ actual -= offset;
+ if (nr == count && size > 0)
+ actual -= PAGE_SIZE - size;
+ return actual;
+}
+
ssize_t iov_iter_get_pages(struct iov_iter *i,
struct page **pages, size_t maxsize, unsigned maxpages,
size_t *start)
@@ -1316,6 +1459,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
if (unlikely(iov_iter_is_pipe(i)))
return pipe_get_pages(i, pages, maxsize, maxpages, start);
+ if (unlikely(iov_iter_is_mapping(i)))
+ return iter_mapping_get_pages(i, pages, maxsize, maxpages, start);
if (unlikely(iov_iter_is_discard(i)))
return -EFAULT;
@@ -1342,7 +1487,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
return v.bv_len;
}),({
return -EFAULT;
- })
+ }),
+ 0
)
return 0;
}
@@ -1386,6 +1532,49 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
return n;
}
+static ssize_t iter_mapping_get_pages_alloc(struct iov_iter *i,
+ struct page ***pages, size_t maxsize,
+ size_t *_start_offset)
+{
+ struct page **p;
+ unsigned nr, offset;
+ pgoff_t index, count;
+ size_t size = maxsize, actual;
+ loff_t pos;
+
+ if (!size)
+ return 0;
+
+ pos = i->mapping_start + i->iov_offset;
+ index = pos >> PAGE_SHIFT;
+ offset = pos & ~PAGE_MASK;
+ *_start_offset = offset;
+
+ count = 1;
+ if (size > PAGE_SIZE - offset) {
+ size -= PAGE_SIZE - offset;
+ count += size >> PAGE_SHIFT;
+ size &= ~PAGE_MASK;
+ if (size)
+ count++;
+ }
+
+ p = get_pages_array(count);
+ if (!p)
+ return -ENOMEM;
+ *pages = p;
+
+ nr = find_get_pages_contig(i->mapping, index, count, p);
+ if (nr == 0)
+ return 0;
+
+ actual = PAGE_SIZE * nr;
+ actual -= offset;
+ if (nr == count && size > 0)
+ actual -= PAGE_SIZE - size;
+ return actual;
+}
+
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
struct page ***pages, size_t maxsize,
size_t *start)
@@ -1397,6 +1586,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
if (unlikely(iov_iter_is_pipe(i)))
return pipe_get_pages_alloc(i, pages, maxsize, start);
+ if (unlikely(iov_iter_is_mapping(i)))
+ return iter_mapping_get_pages_alloc(i, pages, maxsize, start);
if (unlikely(iov_iter_is_discard(i)))
return -EFAULT;
@@ -1429,7 +1620,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
return v.bv_len;
}),({
return -EFAULT;
- })
+ }), 0
)
return 0;
}
@@ -1468,6 +1659,14 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
v.iov_base, v.iov_len,
sum, off);
off += v.iov_len;
+ }), ({
+ char *p = kmap_atomic(v.bv_page);
+ next = csum_partial_copy_nocheck(p + v.bv_offset,
+ (to += v.bv_len) - v.bv_len,
+ v.bv_len, 0);
+ kunmap_atomic(p);
+ sum = csum_block_add(sum, next, off);
+ off += v.bv_len;
})
)
*csum = sum;
@@ -1510,6 +1709,14 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
v.iov_base, v.iov_len,
sum, off);
off += v.iov_len;
+ }), ({
+ char *p = kmap_atomic(v.bv_page);
+ next = csum_partial_copy_nocheck(p + v.bv_offset,
+ (to += v.bv_len) - v.bv_len,
+ v.bv_len, 0);
+ kunmap_atomic(p);
+ sum = csum_block_add(sum, next, off);
+ off += v.bv_len;
})
)
*csum = sum;
@@ -1556,6 +1763,14 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump,
(from += v.iov_len) - v.iov_len,
v.iov_len, sum, off);
off += v.iov_len;
+ }), ({
+ char *p = kmap_atomic(v.bv_page);
+ next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len,
+ p + v.bv_offset,
+ v.bv_len, 0);
+ kunmap_atomic(p);
+ sum = csum_block_add(sum, next, off);
+ off += v.bv_len;
})
)
*csum = sum;
@@ -1605,6 +1820,21 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
npages = pipe_space_for_user(iter_head, pipe->tail, pipe);
if (npages >= maxpages)
return maxpages;
+ } else if (unlikely(iov_iter_is_mapping(i))) {
+ unsigned offset;
+
+ offset = (i->mapping_start + i->iov_offset) & ~PAGE_MASK;
+
+ npages = 1;
+ if (size > PAGE_SIZE - offset) {
+ size -= PAGE_SIZE - offset;
+ npages += size >> PAGE_SHIFT;
+ size &= ~PAGE_MASK;
+ if (size)
+ npages++;
+ }
+ if (npages >= maxpages)
+ return maxpages;
} else iterate_all_kinds(i, size, v, ({
unsigned long p = (unsigned long)v.iov_base;
npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
@@ -1621,7 +1851,8 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
- p / PAGE_SIZE;
if (npages >= maxpages)
return maxpages;
- })
+ }),
+ 0
)
return npages;
}
@@ -1634,7 +1865,7 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
WARN_ON(1);
return NULL;
}
- if (unlikely(iov_iter_is_discard(new)))
+ if (unlikely(iov_iter_is_discard(new) || iov_iter_is_mapping(new)))
return NULL;
if (iov_iter_is_bvec(new))
return new->bvec = kmemdup(new->bvec,
@@ -1746,7 +1977,12 @@ int iov_iter_for_each_range(struct iov_iter *i, size_t bytes,
kunmap(v.bv_page);
err;}), ({
w = v;
- err = f(&w, context);})
+ err = f(&w, context);}), ({
+ w.iov_base = kmap(v.bv_page) + v.bv_offset;
+ w.iov_len = v.bv_len;
+ err = f(&w, context);
+ kunmap(v.bv_page);
+ err;})
)
return err;
}
Add an iterator, ITER_MAPPING, that walks through a set of pages attached to an address_space, starting at a given file position and length. The caller must guarantee that the pages are all present and they must be locked is some way[*] (eg. ref, PG_locked, PG_writeback or PG_fscache) to prevent them from going away or being migrated whilst they're being accessed. [*] Note that I'm assuming that a page cannot get detached from or replaced in a mapping if it is so 'locked' without permission from the appropriate address_space operation. This is useful for copying data from socket buffers to inodes in network filesystems and for transferring data between those inodes and the cache using direct I/O. Whilst it is true that ITER_BVEC could be used instead, the following issues arise: (1) A bio_vec array would need to be allocated to list to all the pages - which is redundant if inode->i_pages *also* points to all these pages. (2) A bio_vec array might exceed the size of a page (a page-sized bio_vec array would correspond to a 1M payload on a 64-bit machine with 4K pages). (3) The offset and length fields in the bio_vec elements are superfluous in this situation. An alternative could be to have an "ITER_ARRAY" that just has the page pointers and not the offset/length info. This decreases the redundancy and increases the max payload-per-array-page to 2M. I've been using this in my rewrite of fscache/cachefiles, which can be found here: https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git/log/?h=fscache-iter Signed-off-by: David Howells <dhowells@redhat.com> --- include/linux/uio.h | 11 ++ lib/iov_iter.c | 282 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 270 insertions(+), 23 deletions(-)