Message ID | 1473438884-674-6-git-send-email-hch@lst.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Sep 09, 2016 at 06:34:39PM +0200, Christoph Hellwig wrote: > This is a much simpler implementation of the DAX read/write path that makes > use of the iomap infrastructure. It does not try to mirror the direct I/O > calling conventions and thus doesn't have to deal with i_dio_count or the > end_io handler, but instead leaves locking and filesystem-specific I/O > completion to the caller. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > fs/dax.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++ > include/linux/iomap.h | 2 + > 2 files changed, 105 insertions(+) > > diff --git a/fs/dax.c b/fs/dax.c > index 84343ce..57ad456 100644 > --- a/fs/dax.c > +++ b/fs/dax.c > @@ -31,6 +31,8 @@ > #include <linux/vmstat.h> > #include <linux/pfn_t.h> > #include <linux/sizes.h> > +#include <linux/iomap.h> > +#include "internal.h" > > /* > * We use lowest available bit in exceptional entry for locking, other two > @@ -1241,3 +1243,104 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) > return dax_zero_page_range(inode, from, length, get_block); > } > EXPORT_SYMBOL_GPL(dax_truncate_page); > + > +#ifdef CONFIG_FS_IOMAP > +static loff_t > +iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > + struct iomap *iomap) > +{ > + struct iov_iter *iter = data; > + loff_t end = pos + length, done = 0; > + ssize_t ret = 0; > + > + if (iov_iter_rw(iter) == READ) { > + end = min(end, i_size_read(inode)); > + if (pos >= end) > + return 0; > + > + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) > + return iov_iter_zero(min(length, end - pos), iter); > + } > + > + if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) > + return -EIO; > + > + while (pos < end) { > + unsigned offset = pos & (PAGE_SIZE - 1); > + struct blk_dax_ctl dax = { 0 }; > + ssize_t map_len; > + > + dax.sector = iomap->blkno + > + (((pos & PAGE_MASK) - iomap->offset) >> 9); > + dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; > + 
map_len = dax_map_atomic(iomap->bdev, &dax); > + if (map_len < 0) { > + ret = map_len; > + break; > + } > + > + dax.addr += offset; > + map_len -= offset; > + if (map_len > end - pos) > + map_len = end - pos; > + > + if (iov_iter_rw(iter) == WRITE) > + map_len = copy_from_iter_pmem(dax.addr, map_len, iter); > + else > + map_len = copy_to_iter(dax.addr, map_len, iter); > + dax_unmap_atomic(iomap->bdev, &dax); > + if (map_len <= 0) { > + ret = map_len ? map_len : -EFAULT; > + break; > + } > + > + pos += map_len; > + length -= map_len; > + done += map_len; > + } > + > + return done ? done : ret; > +} > + > +/** > + * iomap_dax_rw - Perform I/O to a DAX file > + * @iocb: The control block for this I/O > + * @iter: The addresses to do I/O from or to > + * @ops: iomap ops passed from the file system > + * > + * This funtions performs read and write operations to directly mapped Typo: "funtions" should be "function". > + * persistent memory. The callers needs to take care of read/write exclusion > + * and evicting any page cache pages in the region under I/O. > + */ > +ssize_t > +iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, > + struct iomap_ops *ops) > +{ > + struct inode *inode = iocb->ki_filp->f_mapping->host; > + loff_t pos = iocb->ki_pos, ret = 0, done = 0; Just a note that 'ret' is loff_t about half the time in the iomap code and ssize_t the other half. I guess it doesn't really matter since they should both be 64-bit signed types on x86_64, but it's a bit inconsistent. 
> + size_t count = iov_iter_count(iter); > + unsigned flags = 0; > + > + if (!count) > + return 0; > + > + if (iov_iter_rw(iter) == WRITE) > + flags |= IOMAP_WRITE; > + > + do { > + ret = iomap_apply(inode, pos, count, flags, ops, iter, > + iomap_dax_actor); > + if (ret <= 0) > + break; > + pos += ret; > + done += ret; > + } while ((count = iov_iter_count(iter))); > + > + if (!done) > + return ret; > + > + iocb->ki_pos += done; > + return done; > +} I think you can remove the special casing around 'done' and 'count' and make this a bit simpler: ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops) { struct inode *inode = iocb->ki_filp->f_mapping->host; loff_t pos = iocb->ki_pos, ret = 0, done = 0; unsigned flags = 0; size_t count; if (iov_iter_rw(iter) == WRITE) flags |= IOMAP_WRITE; while ((count = iov_iter_count(iter))) { ret = iomap_apply(inode, pos, count, flags, ops, iter, iomap_dax_actor); if (ret <= 0) break; pos += ret; done += ret; } iocb->ki_pos += done; return done ? done : ret; } This is now very similar to iomap_file_buffered_write(). 
> +EXPORT_SYMBOL_GPL(iomap_dax_rw); > +#endif /* CONFIG_FS_IOMAP */ > diff --git a/include/linux/iomap.h b/include/linux/iomap.h > index 14d7067..3d5f785 100644 > --- a/include/linux/iomap.h > +++ b/include/linux/iomap.h > @@ -65,6 +65,8 @@ struct iomap_ops { > > ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, > struct iomap_ops *ops); > +ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, > + struct iomap_ops *ops); > int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, > bool *did_zero, struct iomap_ops *ops); > int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, > -- > 2.1.4 > > _______________________________________________ > Linux-nvdimm mailing list > Linux-nvdimm@lists.01.org > https://lists.01.org/mailman/listinfo/linux-nvdimm -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/dax.c b/fs/dax.c index 84343ce..57ad456 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -31,6 +31,8 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> +#include <linux/iomap.h> +#include "internal.h" /* * We use lowest available bit in exceptional entry for locking, other two @@ -1241,3 +1243,104 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) return dax_zero_page_range(inode, from, length, get_block); } EXPORT_SYMBOL_GPL(dax_truncate_page); + +#ifdef CONFIG_FS_IOMAP +static loff_t +iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct iov_iter *iter = data; + loff_t end = pos + length, done = 0; + ssize_t ret = 0; + + if (iov_iter_rw(iter) == READ) { + end = min(end, i_size_read(inode)); + if (pos >= end) + return 0; + + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) + return iov_iter_zero(min(length, end - pos), iter); + } + + if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) + return -EIO; + + while (pos < end) { + unsigned offset = pos & (PAGE_SIZE - 1); + struct blk_dax_ctl dax = { 0 }; + ssize_t map_len; + + dax.sector = iomap->blkno + + (((pos & PAGE_MASK) - iomap->offset) >> 9); + dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; + map_len = dax_map_atomic(iomap->bdev, &dax); + if (map_len < 0) { + ret = map_len; + break; + } + + dax.addr += offset; + map_len -= offset; + if (map_len > end - pos) + map_len = end - pos; + + if (iov_iter_rw(iter) == WRITE) + map_len = copy_from_iter_pmem(dax.addr, map_len, iter); + else + map_len = copy_to_iter(dax.addr, map_len, iter); + dax_unmap_atomic(iomap->bdev, &dax); + if (map_len <= 0) { + ret = map_len ? map_len : -EFAULT; + break; + } + + pos += map_len; + length -= map_len; + done += map_len; + } + + return done ? 
done : ret; +} + +/** + * iomap_dax_rw - Perform I/O to a DAX file + * @iocb: The control block for this I/O + * @iter: The addresses to do I/O from or to + * @ops: iomap ops passed from the file system + * + * This funtions performs read and write operations to directly mapped + * persistent memory. The callers needs to take care of read/write exclusion + * and evicting any page cache pages in the region under I/O. + */ +ssize_t +iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + loff_t pos = iocb->ki_pos, ret = 0, done = 0; + size_t count = iov_iter_count(iter); + unsigned flags = 0; + + if (!count) + return 0; + + if (iov_iter_rw(iter) == WRITE) + flags |= IOMAP_WRITE; + + do { + ret = iomap_apply(inode, pos, count, flags, ops, iter, + iomap_dax_actor); + if (ret <= 0) + break; + pos += ret; + done += ret; + } while ((count = iov_iter_count(iter))); + + if (!done) + return ret; + + iocb->ki_pos += done; + return done; +} +EXPORT_SYMBOL_GPL(iomap_dax_rw); +#endif /* CONFIG_FS_IOMAP */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 14d7067..3d5f785 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -65,6 +65,8 @@ struct iomap_ops { ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, struct iomap_ops *ops); +ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
This is a much simpler implementation of the DAX read/write path that makes use of the iomap infrastructure. It does not try to mirror the direct I/O calling conventions and thus doesn't have to deal with i_dio_count or the end_io handler, but instead leaves locking and filesystem-specific I/O completion to the caller. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/dax.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/iomap.h | 2 + 2 files changed, 105 insertions(+)