@@ -513,6 +513,7 @@ enum positive_aop_returns {
struct page;
struct address_space;
struct writeback_control;
+struct iomap;
struct iov_iter {
const struct iovec *iov;
@@ -604,6 +605,9 @@ struct address_space_operations {
int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
unsigned long);
int (*error_remove_page)(struct address_space *, struct page *);
+
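+ /*
+ * Called with the IOMAP_* commands defined in linux/iomap.h to map,
+ * reserve, allocate or unreserve the block range backing a multipage
+ * write.
+ */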
+ int (*iomap)(struct address_space *mapping, loff_t pos,
+ ssize_t length, struct iomap *iomap, int cmd);
};
/*
new file mode 100644
@@ -0,0 +1,45 @@
+#ifndef _IOMAP_H
+#define _IOMAP_H
+
+/* ->iomap a_op command types */
+#define IOMAP_READ 0x01 /* read the current mapping starting at the
+ given position, trimmed to a maximum length.
+ Filesystems should use this to obtain and
+ lock resources within this range */
+#define IOMAP_RESERVE 0x02 /* reserve space for an allocation that spans
+ the given iomap */
+#define IOMAP_ALLOCATE 0x03 /* allocate space in a given iomap - must have
+ first been reserved */
+#define IOMAP_UNRESERVE 0x04 /* return unused reserved space, given the
+ iomap and the amount of space actually used.
+ This will always be called after an
+ IOMAP_READ so as to allow the FS to release
+ held resources. */
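+
+/*
+ * Illustrative call sequence (a sketch, not normative - the generic code
+ * in mm/filemap.c is the real caller). A write over a hole drives ->iomap
+ * through:
+ *
+ *	a_ops->iomap(mapping, pos, len, &iomap, IOMAP_READ);
+ *	a_ops->iomap(mapping, pos, len, &iomap, IOMAP_RESERVE);
+ *	... copy the data into the page cache ...
+ *	a_ops->iomap(mapping, pos, copied, &iomap, IOMAP_ALLOCATE);
+ *
+ * while a failed copy instead ends with, passing 0 bytes used:
+ *
+ *	a_ops->iomap(mapping, pos, 0, &iomap, IOMAP_UNRESERVE);
+ *
+ * A write over an already-mapped extent needs only the IOMAP_READ step;
+ * see iomap_needs_allocation() below.
+ */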
+
+/* types of block ranges for multipage write mappings. */
+#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
+#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
+#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
+
+struct iomap {
+ sector_t blkno; /* first sector of mapping */
+ loff_t offset; /* file offset of mapping, bytes */
+ ssize_t length; /* length of mapping, bytes */
+ int type; /* type of mapping */
+ void *priv; /* fs private data associated with map */
+};
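+
+/*
+ * Example with illustrative values only: a filesystem reporting an
+ * allocated, written extent that covers bytes 4096 to 1052671 of the file
+ * and starts at disk sector 8 would fill in:
+ *
+ *	iomap->blkno = 8;
+ *	iomap->offset = 4096;
+ *	iomap->length = 1048576;
+ *	iomap->type = IOMAP_MAPPED;
+ */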
+
+static inline bool
+iomap_needs_allocation(struct iomap *iomap)
+{
+ return iomap->type == IOMAP_HOLE;
+}
+
+/* multipage write interfaces use iomaps */
+typedef ssize_t (*mpw_actor_t)(struct address_space *mapping, void *src,
+ loff_t pos, ssize_t len, struct iomap *iomap);
+
+ssize_t multipage_write_segment(struct address_space *mapping, void *src,
+ loff_t pos, ssize_t length, mpw_actor_t actor);
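+
+/*
+ * Sketch of a custom actor; copy_into_cache() is hypothetical and shown
+ * only for illustration. An actor copies up to @len bytes from @src into
+ * the page cache at @pos and returns the number of bytes copied or a
+ * negative errno:
+ *
+ *	static ssize_t my_actor(struct address_space *mapping, void *src,
+ *			loff_t pos, ssize_t len, struct iomap *iomap)
+ *	{
+ *		return copy_into_cache(mapping, src, pos, len);
+ *	}
+ */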
+
+#endif /* _IOMAP_H */
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
+#include <linux/iomap.h>
#include <linux/uaccess.h>
#include <linux/aio.h>
#include <linux/capability.h>
@@ -2221,10 +2222,14 @@ repeat:
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
-static ssize_t generic_perform_write(struct file *file,
- struct iov_iter *i, loff_t pos)
+static ssize_t
+__generic_perform_write(
+ struct file *file,
+ struct address_space *mapping,
+ struct iov_iter *i,
+ loff_t pos,
+ void *priv)
{
- struct address_space *mapping = file->f_mapping;
const struct address_space_operations *a_ops = mapping->a_ops;
long status = 0;
ssize_t written = 0;
@@ -2241,7 +2246,6 @@ static ssize_t generic_perform_write(struct file *file,
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
- void *fsdata;
offset = (pos & (PAGE_CACHE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
@@ -2265,7 +2269,7 @@ again:
}
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
- &page, &fsdata);
+ &page, priv);
if (unlikely(status))
break;
@@ -2279,7 +2283,7 @@ again:
mark_page_accessed(page);
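+ /*
+ * XXX: on the legacy (non-iomap) path ->write_end now receives the
+ * address of the caller's fsdata slot rather than the value that
+ * ->write_begin stored into it.
+ */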
status = a_ops->write_end(file, mapping, pos, bytes, copied,
- page, fsdata);
+ page, priv);
if (unlikely(status < 0))
break;
copied = status;
@@ -2310,6 +2314,159 @@ again:
return written ? written : status;
}
+static ssize_t
+multipage_write_actor(
+ struct address_space *mapping,
+ void *src,
+ loff_t pos,
+ ssize_t length,
+ struct iomap *iomap)
+{
+ struct iov_iter *i = src;
+
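+ /*
+ * No struct file is available in this path, so ->write_begin and
+ * ->write_end implementations used with ->iomap must not rely on it.
+ */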
+ return __generic_perform_write(NULL, mapping, i, pos, iomap);
+}
+
+/*
+ * Execute a multipage write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and, most importantly, avoids the need for filesystem
+ * specific locking per page. Instead, all the operations are amortised over
+ * the entire range of pages. It is assumed that the filesystems will lock
+ * whatever resources they require in the IOMAP_READ call, and release them
+ * in the IOMAP_ALLOCATE or IOMAP_UNRESERVE call that ends the operation.
+ */
+ssize_t
+multipage_write_segment(
+ struct address_space *mapping,
+ void *src,
+ loff_t pos,
+ ssize_t length,
+ mpw_actor_t actor)
+{
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ long status;
+ bool need_alloc;
+ struct iomap iomap = { 0 };
+
+ /*
+ * We need to map a range from the start position for count bytes. This
+ * can span multiple pages - it is only guaranteed to return a range
+ * with a single type of block mapping (e.g. all over a hole, all
+ * mapped or all unwritten). Failure at this point has nothing to undo.
+ *
+ * We cap the maximum length we map here to 1024 pages (the current
+ * value of MAX_WRITEBACK_PAGES) to keep the chunks of work done
+ * somewhat symmetric with the work writeback does. This is a
+ * completely arbitrary number pulled out of thin air as a best guess
+ * for initial testing.
+ */
+ length = min_t(ssize_t, length, 1024 * PAGE_SIZE);
+ status = a_ops->iomap(mapping, pos, length, &iomap, IOMAP_READ);
+ if (status)
+ goto out;
+
+ /*
+ * If allocation is required for this range, reserve the space now so
+ * that the allocation is guaranteed to succeed later on. Once we have
+ * copied the data into the page cache pages, we cannot fail; otherwise
+ * we would expose transient stale data. If the reserve fails, we can
+ * safely back out at this point as there is nothing to undo.
+ */
+ need_alloc = iomap_needs_allocation(&iomap);
+ if (need_alloc) {
+ status = a_ops->iomap(mapping, pos, length, &iomap, IOMAP_RESERVE);
+ if (status)
+ goto out;
+ }
+
+ /*
+ * Because we have now guaranteed that the space allocation will
+ * succeed, we can do the copy-in page by page without having to worry
+ * about failures exposing transient data. Hence we can mostly reuse
+ * the existing method of:
+ * - grab and lock page
+ * - set up page mapping structures (e.g. bufferheads)
+ * - copy data in
+ * - update page state and unlock page
+ *
+ * This avoids the need to hold MAX_WRITEBACK_PAGES locked at once
+ * while we execute the copy-in. It does mean, however, that the
+ * filesystem needs to avoid any attempts at writeback of pages in this
+ * iomap until the allocation is completed after the copy-in.
+ *
+ * XXX: needs to be done per-filesystem in ->writepage
+ */
+
+ status = actor(mapping, src, pos, length, &iomap);
+ printk(KERN_DEBUG "mpws: pos %lld, len %zd, status %ld\n",
+ pos, length, status);
+ if (status == -ERANGE)
+ status = 0;
+ if (status <= 0)
+ goto out_failed_write;
+
+ /* now the data has been copied, allocate the range we've copied */
+ if (need_alloc) {
+ int error;
+ /*
+ * allocate should not fail unless the filesystem has had a
+ * fatal error. Issue a warning here to track this situation.
+ */
+ error = a_ops->iomap(mapping, pos, status, &iomap, IOMAP_ALLOCATE);
+ if (error) {
+ WARN_ON(1);
+ status = error;
+ /* XXX: mark all pages not-up-to-date? */
+ goto out_failed_write;
+ }
+ }
+
+out:
+ return status;
+
+ /*
+ * If we copied less than we reserved, release any unused
+ * reservation we hold so that we don't leak the space. Unreserving
+ * space never fails.
+ */
+out_failed_write:
+ if (need_alloc)
+ a_ops->iomap(mapping, pos, 0, &iomap, IOMAP_UNRESERVE);
+ return status;
+}
+EXPORT_SYMBOL(multipage_write_segment);
+
+/*
+ * Loop writing multiple pages in segments until we have run out
+ * of data to write in the iovec iterator.
+ */
+static ssize_t
+generic_perform_multipage_write(struct file *file,
+ struct iov_iter *i, loff_t pos)
+{
+ long status = 0;
+ ssize_t written = 0;
+
+ do {
+ status = multipage_write_segment(file->f_mapping, i, pos,
+ iov_iter_count(i), multipage_write_actor);
+ if (status <= 0)
+ break;
+ pos += status;
+ written += status;
+ } while (iov_iter_count(i));
+
+ return written ? written : status;
+}
+
+static ssize_t generic_perform_write(struct file *file,
+ struct iov_iter *i, loff_t pos)
+{
+ void *fs_data;
+
+ return __generic_perform_write(file, file->f_mapping, i, pos, &fs_data);
+}
+
ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos, loff_t *ppos,
@@ -2320,13 +2477,17 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
struct iov_iter i;
iov_iter_init(&i, iov, nr_segs, count, written);
- status = generic_perform_write(file, &i, pos);
+
+ if (file->f_mapping->a_ops->iomap)
+ status = generic_perform_multipage_write(file, &i, pos);
+ else
+ status = generic_perform_write(file, &i, pos);
if (likely(status >= 0)) {
written += status;
*ppos = pos + status;
}

return written ? written : status;
}
EXPORT_SYMBOL(generic_file_buffered_write);