Message ID | 1425425427-16283-4-git-send-email-david@fromorbit.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 03/04/2015 01:30 AM, Dave Chinner wrote: > From: Dave Chinner <dchinner@redhat.com> > > Add the initial support for DAX file operations to XFS. This > includes the necessary block allocation and mmap page fault hooks > for DAX to function. > > Note that the current block allocation code abuses the mapping > buffer head to provide a completion callback for unwritten extent > allocation when DAX is clearing blocks. The DAX interface needs to > be changed to provide a callback similar to get_blocks for this > callback. It looks like this comment is stale for this set A question below > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > --- > fs/xfs/xfs_aops.c | 72 ++++++++++++++++++++++++++++++++++-- > fs/xfs/xfs_aops.h | 7 +++- > fs/xfs/xfs_file.c | 108 ++++++++++++++++++++++++++++++++++++++++++++---------- > fs/xfs/xfs_iops.c | 4 ++ > fs/xfs/xfs_iops.h | 6 +++ > 5 files changed, 173 insertions(+), 24 deletions(-) > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index 3a9b7a1..22cb03a 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c > @@ -1233,13 +1233,63 @@ xfs_vm_releasepage( > return try_to_free_buffers(page); > } > > +/* > + * For DAX we need a mapping buffer callback for unwritten extent conversion > + * when page faults allocation blocks and then zero them. 
> + */ > +#ifdef CONFIG_FS_DAX > +static struct xfs_ioend * > +xfs_dax_alloc_ioend( > + struct inode *inode, > + xfs_off_t offset, > + ssize_t size) > +{ > + struct xfs_ioend *ioend; > + > + ASSERT(IS_DAX(inode)); > + ioend = xfs_alloc_ioend(inode, XFS_IO_UNWRITTEN); > + ioend->io_offset = offset; > + ioend->io_size = size; > + return ioend; > +} > + > +void > +xfs_get_blocks_dax_complete( > + struct buffer_head *bh, > + int uptodate) > +{ > + struct xfs_ioend *ioend = bh->b_private; > + struct xfs_inode *ip = XFS_I(ioend->io_inode); > + int error; > + > + ASSERT(IS_DAX(ioend->io_inode)); > + > + /* if there was an error zeroing, then don't convert it */ > + if (!uptodate) > + goto out_free; > + > + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); > + if (error) > + xfs_warn(ip->i_mount, > +"%s: conversion failed, ino 0x%llx, offset 0x%llx, len 0x%lx, error %d\n", > + __func__, ip->i_ino, ioend->io_offset, > + ioend->io_size, error); > +out_free: > + mempool_free(ioend, xfs_ioend_pool); > + > +} > +#else > +#define xfs_dax_alloc_ioend(i,o,s) NULL > +#endif > + > STATIC int > __xfs_get_blocks( > struct inode *inode, > sector_t iblock, > struct buffer_head *bh_result, > int create, > - int direct) > + bool direct, > + bool clear) > { > struct xfs_inode *ip = XFS_I(inode); > struct xfs_mount *mp = ip->i_mount; > @@ -1304,6 +1354,7 @@ __xfs_get_blocks( > if (error) > return error; > new = 1; > + > } else { > /* > * Delalloc reservations do not require a transaction, > @@ -1340,7 +1391,10 @@ __xfs_get_blocks( > if (create || !ISUNWRITTEN(&imap)) > xfs_map_buffer(inode, bh_result, &imap, offset); > if (create && ISUNWRITTEN(&imap)) { > - if (direct) { > + if (clear) { > + bh_result->b_private = xfs_dax_alloc_ioend( > + inode, offset, size); > + } else if (direct) { > bh_result->b_private = inode; > set_buffer_defer_completion(bh_result); > } > @@ -1425,7 +1479,7 @@ xfs_get_blocks( > struct buffer_head *bh_result, > int create) > { > - return 
__xfs_get_blocks(inode, iblock, bh_result, create, 0); > + return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); > } > > STATIC int > @@ -1435,7 +1489,17 @@ xfs_get_blocks_direct( > struct buffer_head *bh_result, > int create) > { > - return __xfs_get_blocks(inode, iblock, bh_result, create, 1); > + return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); > +} > + > +int > +xfs_get_blocks_dax( > + struct inode *inode, > + sector_t iblock, > + struct buffer_head *bh_result, > + int create) > +{ > + return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); > } > > /* > diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h > index ac644e0..7c6fb3f 100644 > --- a/fs/xfs/xfs_aops.h > +++ b/fs/xfs/xfs_aops.h > @@ -53,7 +53,12 @@ typedef struct xfs_ioend { > } xfs_ioend_t; > > extern const struct address_space_operations xfs_address_space_operations; > -extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); > + > +int xfs_get_blocks(struct inode *inode, sector_t offset, > + struct buffer_head *map_bh, int create); > +int xfs_get_blocks_dax(struct inode *inode, sector_t offset, > + struct buffer_head *map_bh, int create); > +void xfs_get_blocks_dax_complete(struct buffer_head *bh, int uptodate); > > extern void xfs_count_page_state(struct page *, int *, int *); > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index bc0008f..4bfcba0 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -654,7 +654,7 @@ xfs_file_dio_aio_write( > mp->m_rtdev_targp : mp->m_ddev_targp; > > /* DIO must be aligned to device logical sector size */ > - if ((pos | count) & target->bt_logical_sectormask) > + if (!IS_DAX(inode) && (pos | count) & target->bt_logical_sectormask) > return -EINVAL; > > /* "unaligned" here means not aligned to a filesystem block */ > @@ -724,8 +724,11 @@ xfs_file_dio_aio_write( > out: > xfs_rw_iunlock(ip, iolock); > > - /* No fallback to buffered IO on errors for XFS. 
*/ > - ASSERT(ret < 0 || ret == count); > + /* > + * No fallback to buffered IO on errors for XFS. DAX can result in > + * partial writes, but direct IO will either complete fully or fail. > + */ > + ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); > return ret; > } > > @@ -810,7 +813,7 @@ xfs_file_write_iter( > if (XFS_FORCED_SHUTDOWN(ip->i_mount)) > return -EIO; > > - if (unlikely(file->f_flags & O_DIRECT)) > + if ((file->f_flags & O_DIRECT) || IS_DAX(inode)) > ret = xfs_file_dio_aio_write(iocb, from); > else > ret = xfs_file_buffered_aio_write(iocb, from); > @@ -1031,17 +1034,6 @@ xfs_file_readdir( > return xfs_readdir(ip, ctx, bufsize); > } > > -STATIC int > -xfs_file_mmap( > - struct file *filp, > - struct vm_area_struct *vma) > -{ > - vma->vm_ops = &xfs_file_vm_ops; > - > - file_accessed(filp); > - return 0; > -} > - > /* > * This type is designed to indicate the type of offset we would like > * to search from page cache for xfs_seek_hole_data(). > @@ -1466,6 +1458,71 @@ xfs_filemap_page_mkwrite( > return error; > } > > +static const struct vm_operations_struct xfs_file_vm_ops = { > + .fault = xfs_filemap_fault, > + .map_pages = filemap_map_pages, > + .page_mkwrite = xfs_filemap_page_mkwrite, > +}; > + > +#ifdef CONFIG_FS_DAX > +static int > +xfs_filemap_dax_fault( > + struct vm_area_struct *vma, > + struct vm_fault *vmf) > +{ > + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); > + int error; > + > + trace_xfs_filemap_fault(ip); > + > + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); > + error = dax_fault(vma, vmf, xfs_get_blocks_dax, > + xfs_get_blocks_dax_complete); > + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); > + > + return error; > +} > + > +static int > +xfs_filemap_dax_page_mkwrite( > + struct vm_area_struct *vma, > + struct vm_fault *vmf) > +{ > + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); > + int error; > + > + trace_xfs_filemap_page_mkwrite(ip); > + > + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); > + error = dax_mkwrite(vma, vmf, 
xfs_get_blocks_dax, > + xfs_get_blocks_dax_complete); > + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); > + > + return error; > +} > + > +static const struct vm_operations_struct xfs_file_dax_vm_ops = { > + .fault = xfs_filemap_dax_fault, > + .page_mkwrite = xfs_filemap_dax_page_mkwrite, > +}; > +#else > +#define xfs_file_dax_vm_ops xfs_file_vm_ops > +#endif /* CONFIG_FS_DAX */ > + > +STATIC int > +xfs_file_mmap( > + struct file *filp, > + struct vm_area_struct *vma) > +{ > + file_accessed(filp); > + if (IS_DAX(file_inode(filp))) { > + vma->vm_ops = &xfs_file_dax_vm_ops; > + vma->vm_flags |= VM_MIXEDMAP; > + } else > + vma->vm_ops = &xfs_file_vm_ops; > + return 0; > +} > + > const struct file_operations xfs_file_operations = { > .llseek = xfs_file_llseek, > .read = new_sync_read, > @@ -1497,8 +1554,21 @@ const struct file_operations xfs_dir_file_operations = { > .fsync = xfs_dir_fsync, > }; > > -static const struct vm_operations_struct xfs_file_vm_ops = { > - .fault = xfs_filemap_fault, > - .map_pages = filemap_map_pages, > - .page_mkwrite = xfs_filemap_page_mkwrite, > +#ifdef CONFIG_FS_DAX > +const struct file_operations xfs_file_dax_operations = { > + .llseek = xfs_file_llseek, > + .read = new_sync_read, > + .write = new_sync_write, > + .read_iter = xfs_file_read_iter, > + .write_iter = xfs_file_write_iter, > + .unlocked_ioctl = xfs_file_ioctl, > +#ifdef CONFIG_COMPAT > + .compat_ioctl = xfs_file_compat_ioctl, > +#endif > + .mmap = xfs_file_mmap, > + .open = xfs_file_open, > + .release = xfs_file_release, > + .fsync = xfs_file_fsync, > + .fallocate = xfs_file_fallocate, > }; sigh, The same problem was in ext4, the reason you need a second xfs_file_operations vector is because of the minus .splice_read = xfs_file_splice_read, .splice_write = iter_file_splice_write, Which do not exist for DAX Inspecting do_splice_from && do_splice_to, if I'm reading the code right it looks like the difference of these vectors present is the call to default_file_splice_write/read. 
[default_file_splice_read for example would go head and use kernel_readv() ] Would it be cleaner to call default_file_splice_write/read directly from xfs_file_splice_read/write in the DAX case and only keep one vector? I have looked through the code, nothing stands out that I can see. Do you have a public tree I can pull for easy testing? Thanks Boaz > +#endif /* CONFIG_FS_DAX */ > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c > index 8b9e688..9f38142 100644 > --- a/fs/xfs/xfs_iops.c > +++ b/fs/xfs/xfs_iops.c > @@ -1264,6 +1264,10 @@ xfs_setup_inode( > case S_IFREG: > inode->i_op = &xfs_inode_operations; > inode->i_fop = &xfs_file_operations; > + if (IS_DAX(inode)) > + inode->i_fop = &xfs_file_dax_operations; > + else > + inode->i_fop = &xfs_file_operations; > inode->i_mapping->a_ops = &xfs_address_space_operations; > break; > case S_IFDIR: > diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h > index a0f84ab..c08983e 100644 > --- a/fs/xfs/xfs_iops.h > +++ b/fs/xfs/xfs_iops.h > @@ -23,6 +23,12 @@ struct xfs_inode; > extern const struct file_operations xfs_file_operations; > extern const struct file_operations xfs_dir_file_operations; > > +#ifdef CONFIG_FS_DAX > +extern const struct file_operations xfs_file_dax_operations; > +#else > +#define xfs_file_dax_operations xfs_file_operations > +#endif > + > extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); > > /* > -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Mar 04, 2015 at 12:09:40PM +0200, Boaz Harrosh wrote: > On 03/04/2015 01:30 AM, Dave Chinner wrote: > > From: Dave Chinner <dchinner@redhat.com> > > > > Add the initial support for DAX file operations to XFS. This > > includes the necessary block allocation and mmap page fault hooks > > for DAX to function. > > > > Note that the current block allocation code abuses the mapping > > buffer head to provide a completion callback for unwritten extent > > allocation when DAX is clearing blocks. The DAX interface needs to > > be changed to provide a callback similar to get_blocks for this > > callback. > > It looks like this comment is stale for this set > > A question below ..... > > .fsync = xfs_dir_fsync, > > }; > > > > -static const struct vm_operations_struct xfs_file_vm_ops = { > > - .fault = xfs_filemap_fault, > > - .map_pages = filemap_map_pages, > > - .page_mkwrite = xfs_filemap_page_mkwrite, > > +#ifdef CONFIG_FS_DAX > > +const struct file_operations xfs_file_dax_operations = { > > + .llseek = xfs_file_llseek, > > + .read = new_sync_read, > > + .write = new_sync_write, > > + .read_iter = xfs_file_read_iter, > > + .write_iter = xfs_file_write_iter, > > + .unlocked_ioctl = xfs_file_ioctl, > > +#ifdef CONFIG_COMPAT > > + .compat_ioctl = xfs_file_compat_ioctl, > > +#endif > > + .mmap = xfs_file_mmap, > > + .open = xfs_file_open, > > + .release = xfs_file_release, > > + .fsync = xfs_file_fsync, > > + .fallocate = xfs_file_fallocate, > > }; > > sigh, The same problem was in ext4, the reason you need > a second xfs_file_operations vector is because of the minus > .splice_read = xfs_file_splice_read, > .splice_write = iter_file_splice_write, > Which do not exist for DAX Right, because they use buffered IO, and DAX doesn't do buffered IO. > Would it be cleaner to call default_file_splice_write/read > directly from xfs_file_splice_read/write in the DAX case > and only keep one vector? Umm, looking at the code, that is different ot what I remember. 
it used to be if the filesystem did not support splice, then it did not set a vector. That has changed - it now calls a generic splice path instead. So, we definitely need splice to/from DAX enabled inodes to be rejected. I'll have a look at that... > I have looked through the code, nothing stands out that I > can see. Do you have a public tree I can pull for easy > testing? No, not for a small RFC patchset. git am is your friend ;) Cheers, Dave.
On 03/04/2015 03:01 PM, Dave Chinner wrote: > On Wed, Mar 04, 2015 at 12:09:40PM +0200, Boaz Harrosh wrote: <> > > So, we definitely need splice to/from DAX enabled inodes to be > rejected. I'll have a look at that... > default_file_splice_read uses kernel_readv which I think might actually work. Do you know what xfstest(s) exercise splice? Thanks Boaz -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed 04-03-15 10:30:24, Dave Chinner wrote: > From: Dave Chinner <dchinner@redhat.com> > > Add the initial support for DAX file operations to XFS. This > includes the necessary block allocation and mmap page fault hooks > for DAX to function. > > Note that the current block allocation code abuses the mapping > buffer head to provide a completion callback for unwritten extent > allocation when DAX is clearing blocks. The DAX interface needs to > be changed to provide a callback similar to get_blocks for this > callback. > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > --- > fs/xfs/xfs_aops.c | 72 ++++++++++++++++++++++++++++++++++-- > fs/xfs/xfs_aops.h | 7 +++- > fs/xfs/xfs_file.c | 108 ++++++++++++++++++++++++++++++++++++++++++++---------- > fs/xfs/xfs_iops.c | 4 ++ > fs/xfs/xfs_iops.h | 6 +++ > 5 files changed, 173 insertions(+), 24 deletions(-) > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index 3a9b7a1..22cb03a 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c > @@ -1233,13 +1233,63 @@ xfs_vm_releasepage( > return try_to_free_buffers(page); > } > > +/* > + * For DAX we need a mapping buffer callback for unwritten extent conversion > + * when page faults allocation blocks and then zero them. 
> + */ > +#ifdef CONFIG_FS_DAX > +static struct xfs_ioend * > +xfs_dax_alloc_ioend( > + struct inode *inode, > + xfs_off_t offset, > + ssize_t size) > +{ > + struct xfs_ioend *ioend; > + > + ASSERT(IS_DAX(inode)); > + ioend = xfs_alloc_ioend(inode, XFS_IO_UNWRITTEN); > + ioend->io_offset = offset; > + ioend->io_size = size; > + return ioend; > +} > + > +void > +xfs_get_blocks_dax_complete( > + struct buffer_head *bh, > + int uptodate) > +{ > + struct xfs_ioend *ioend = bh->b_private; > + struct xfs_inode *ip = XFS_I(ioend->io_inode); > + int error; > + > + ASSERT(IS_DAX(ioend->io_inode)); > + > + /* if there was an error zeroing, then don't convert it */ > + if (!uptodate) > + goto out_free; > + > + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); > + if (error) > + xfs_warn(ip->i_mount, > +"%s: conversion failed, ino 0x%llx, offset 0x%llx, len 0x%lx, error %d\n", > + __func__, ip->i_ino, ioend->io_offset, > + ioend->io_size, error); > +out_free: > + mempool_free(ioend, xfs_ioend_pool); > + > +} > +#else > +#define xfs_dax_alloc_ioend(i,o,s) NULL > +#endif > + > STATIC int > __xfs_get_blocks( > struct inode *inode, > sector_t iblock, > struct buffer_head *bh_result, > int create, > - int direct) > + bool direct, > + bool clear) > { > struct xfs_inode *ip = XFS_I(inode); > struct xfs_mount *mp = ip->i_mount; > @@ -1304,6 +1354,7 @@ __xfs_get_blocks( > if (error) > return error; > new = 1; > + > } else { > /* > * Delalloc reservations do not require a transaction, > @@ -1340,7 +1391,10 @@ __xfs_get_blocks( > if (create || !ISUNWRITTEN(&imap)) > xfs_map_buffer(inode, bh_result, &imap, offset); > if (create && ISUNWRITTEN(&imap)) { > - if (direct) { > + if (clear) { > + bh_result->b_private = xfs_dax_alloc_ioend( > + inode, offset, size); > + } else if (direct) { > bh_result->b_private = inode; > set_buffer_defer_completion(bh_result); > } > @@ -1425,7 +1479,7 @@ xfs_get_blocks( > struct buffer_head *bh_result, > int create) > { > - return 
__xfs_get_blocks(inode, iblock, bh_result, create, 0); > + return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); > } > > STATIC int > @@ -1435,7 +1489,17 @@ xfs_get_blocks_direct( > struct buffer_head *bh_result, > int create) > { > - return __xfs_get_blocks(inode, iblock, bh_result, create, 1); > + return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); > +} > + > +int > +xfs_get_blocks_dax( > + struct inode *inode, > + sector_t iblock, > + struct buffer_head *bh_result, > + int create) > +{ > + return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); > } > > /* > diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h > index ac644e0..7c6fb3f 100644 > --- a/fs/xfs/xfs_aops.h > +++ b/fs/xfs/xfs_aops.h > @@ -53,7 +53,12 @@ typedef struct xfs_ioend { > } xfs_ioend_t; > > extern const struct address_space_operations xfs_address_space_operations; > -extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); > + > +int xfs_get_blocks(struct inode *inode, sector_t offset, > + struct buffer_head *map_bh, int create); > +int xfs_get_blocks_dax(struct inode *inode, sector_t offset, > + struct buffer_head *map_bh, int create); > +void xfs_get_blocks_dax_complete(struct buffer_head *bh, int uptodate); > > extern void xfs_count_page_state(struct page *, int *, int *); > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index bc0008f..4bfcba0 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -654,7 +654,7 @@ xfs_file_dio_aio_write( > mp->m_rtdev_targp : mp->m_ddev_targp; > > /* DIO must be aligned to device logical sector size */ > - if ((pos | count) & target->bt_logical_sectormask) > + if (!IS_DAX(inode) && (pos | count) & target->bt_logical_sectormask) > return -EINVAL; > > /* "unaligned" here means not aligned to a filesystem block */ > @@ -724,8 +724,11 @@ xfs_file_dio_aio_write( > out: > xfs_rw_iunlock(ip, iolock); > > - /* No fallback to buffered IO on errors for XFS. 
*/ > - ASSERT(ret < 0 || ret == count); > + /* > + * No fallback to buffered IO on errors for XFS. DAX can result in > + * partial writes, but direct IO will either complete fully or fail. > + */ > + ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); > return ret; > } > > @@ -810,7 +813,7 @@ xfs_file_write_iter( > if (XFS_FORCED_SHUTDOWN(ip->i_mount)) > return -EIO; > > - if (unlikely(file->f_flags & O_DIRECT)) > + if ((file->f_flags & O_DIRECT) || IS_DAX(inode)) > ret = xfs_file_dio_aio_write(iocb, from); > else > ret = xfs_file_buffered_aio_write(iocb, from); > @@ -1031,17 +1034,6 @@ xfs_file_readdir( > return xfs_readdir(ip, ctx, bufsize); > } > > -STATIC int > -xfs_file_mmap( > - struct file *filp, > - struct vm_area_struct *vma) > -{ > - vma->vm_ops = &xfs_file_vm_ops; > - > - file_accessed(filp); > - return 0; > -} > - > /* > * This type is designed to indicate the type of offset we would like > * to search from page cache for xfs_seek_hole_data(). > @@ -1466,6 +1458,71 @@ xfs_filemap_page_mkwrite( > return error; > } > > +static const struct vm_operations_struct xfs_file_vm_ops = { > + .fault = xfs_filemap_fault, > + .map_pages = filemap_map_pages, > + .page_mkwrite = xfs_filemap_page_mkwrite, > +}; > + > +#ifdef CONFIG_FS_DAX > +static int > +xfs_filemap_dax_fault( > + struct vm_area_struct *vma, > + struct vm_fault *vmf) > +{ > + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); > + int error; > + > + trace_xfs_filemap_fault(ip); > + > + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); > + error = dax_fault(vma, vmf, xfs_get_blocks_dax, > + xfs_get_blocks_dax_complete); > + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); > + > + return error; > +} > + > +static int > +xfs_filemap_dax_page_mkwrite( > + struct vm_area_struct *vma, > + struct vm_fault *vmf) > +{ > + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); > + int error; > + > + trace_xfs_filemap_page_mkwrite(ip); > + > + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); So I think the lock ordering of 
XFS_MMAPLOCK and freezing protection is suspicious (and actually so is for normal write faults as I'm looking - didn't realize that when I was first reading your MMAPLOCK patches). Because you take XFS_MMAPLOCK outside of freeze protection however usually we want freeze protection to be the outermost lock - in particular in xfs_file_fallocate() you take XFS_MMAPLOCK inside freeze protection I think. So you'll need to do what ext4 needs to do - take freeze protection, take fs specific locks, and then call do_dax_fault(). Matthew has a patch to actually export do_dax_fault (as __dax_fault()) for filesystems. Similarly for xfs_filemap_page_mkwrite() you need to use __block_page_mkwrite() as a callback (similarly as ext4 does) and do freeze protection early by hand. Honza > + error = dax_mkwrite(vma, vmf, xfs_get_blocks_dax, > + xfs_get_blocks_dax_complete); > + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); > + > + return error; > +} > + > +static const struct vm_operations_struct xfs_file_dax_vm_ops = { > + .fault = xfs_filemap_dax_fault, > + .page_mkwrite = xfs_filemap_dax_page_mkwrite, > +}; > +#else > +#define xfs_file_dax_vm_ops xfs_file_vm_ops > +#endif /* CONFIG_FS_DAX */ > + > +STATIC int > +xfs_file_mmap( > + struct file *filp, > + struct vm_area_struct *vma) > +{ > + file_accessed(filp); > + if (IS_DAX(file_inode(filp))) { > + vma->vm_ops = &xfs_file_dax_vm_ops; > + vma->vm_flags |= VM_MIXEDMAP; > + } else > + vma->vm_ops = &xfs_file_vm_ops; > + return 0; > +} > + > const struct file_operations xfs_file_operations = { > .llseek = xfs_file_llseek, > .read = new_sync_read, > @@ -1497,8 +1554,21 @@ const struct file_operations xfs_dir_file_operations = { > .fsync = xfs_dir_fsync, > }; > > -static const struct vm_operations_struct xfs_file_vm_ops = { > - .fault = xfs_filemap_fault, > - .map_pages = filemap_map_pages, > - .page_mkwrite = xfs_filemap_page_mkwrite, > +#ifdef CONFIG_FS_DAX > +const struct file_operations xfs_file_dax_operations = { > + .llseek = 
xfs_file_llseek, > + .read = new_sync_read, > + .write = new_sync_write, > + .read_iter = xfs_file_read_iter, > + .write_iter = xfs_file_write_iter, > + .unlocked_ioctl = xfs_file_ioctl, > +#ifdef CONFIG_COMPAT > + .compat_ioctl = xfs_file_compat_ioctl, > +#endif > + .mmap = xfs_file_mmap, > + .open = xfs_file_open, > + .release = xfs_file_release, > + .fsync = xfs_file_fsync, > + .fallocate = xfs_file_fallocate, > }; > +#endif /* CONFIG_FS_DAX */ > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c > index 8b9e688..9f38142 100644 > --- a/fs/xfs/xfs_iops.c > +++ b/fs/xfs/xfs_iops.c > @@ -1264,6 +1264,10 @@ xfs_setup_inode( > case S_IFREG: > inode->i_op = &xfs_inode_operations; > inode->i_fop = &xfs_file_operations; > + if (IS_DAX(inode)) > + inode->i_fop = &xfs_file_dax_operations; > + else > + inode->i_fop = &xfs_file_operations; > inode->i_mapping->a_ops = &xfs_address_space_operations; > break; > case S_IFDIR: > diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h > index a0f84ab..c08983e 100644 > --- a/fs/xfs/xfs_iops.h > +++ b/fs/xfs/xfs_iops.h > @@ -23,6 +23,12 @@ struct xfs_inode; > extern const struct file_operations xfs_file_operations; > extern const struct file_operations xfs_dir_file_operations; > > +#ifdef CONFIG_FS_DAX > +extern const struct file_operations xfs_file_dax_operations; > +#else > +#define xfs_file_dax_operations xfs_file_operations > +#endif > + > extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); > > /* > -- > 2.0.0 >
On Wed, Mar 04, 2015 at 05:18:48PM +0100, Jan Kara wrote: > On Wed 04-03-15 10:30:24, Dave Chinner wrote: > > From: Dave Chinner <dchinner@redhat.com> > > > > Add the initial support for DAX file operations to XFS. This > > includes the necessary block allocation and mmap page fault hooks > > for DAX to function. > > > > Note that the current block allocation code abuses the mapping > > buffer head to provide a completion callback for unwritten extent > > allocation when DAX is clearing blocks. The DAX interface needs to > > be changed to provide a callback similar to get_blocks for this > > callback. > > > > Signed-off-by: Dave Chinner <dchinner@redhat.com> ..... > > +static int > > +xfs_filemap_dax_page_mkwrite( > > + struct vm_area_struct *vma, > > + struct vm_fault *vmf) > > +{ > > + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); > > + int error; > > + > > + trace_xfs_filemap_page_mkwrite(ip); > > + > > + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); > So I think the lock ordering of XFS_MMAPLOCK and freezing protection is > suspicious (and actually so is for normal write faults as I'm looking - > didn't realize that when I was first reading your MMAPLOCK patches). > Because you take XFS_MMAPLOCK outside of freeze protection however usually > we want freeze protection to be the outermost lock - in particular in > xfs_file_fallocate() you take XFS_MMAPLOCK inside freeze protection I > think. OK, so why isn't lockdep triggering on that? lockdep is aware of inode locks and the freeze states, supposedly to pick up these exact issues... Oh, probably because the sb freeze order is write, pagefault, transaction. i.e. In the fallocate case, we do sb_start_write, MMAP_LOCK. If we are in a freeze case, we aren't going to freeze page faults until we've frozen all the writes have drained, so there isn't a lock order dependency there. Same for any other mnt_want_write/sb-start_write based modification. 
Hence the fallocate path and anything that runs through setattr will complete and release the mmap lock and then be prevented from taking it again by the time sb_start_pagefault() can block with the mmap lock held. So there isn't actually a deadlock there because of the way freeze works, and that's why lockdep is staying silent. Still, I probably need to fix it so I'm not leaving a potential landmine around. > So you'll need to do what ext4 needs to do - take freeze protection, take > fs specific locks, and then call do_dax_fault(). Matthew has a patch to > actually export do_dax_fault (as __dax_fault()) for filesystems. pointer to it? if none, I'll just write my own.... Cheers, Dave.
On Wed, Mar 04, 2015 at 04:54:50PM +0200, Boaz Harrosh wrote: > On 03/04/2015 03:01 PM, Dave Chinner wrote: > > On Wed, Mar 04, 2015 at 12:09:40PM +0200, Boaz Harrosh wrote: > <> > > > > So, we definitely need splice to/from DAX enabled inodes to be > > rejected. I'll have a look at that... > > > > default_file_splice_read uses kernel_readv which I think might actually > > work. Do you know what xfstest(s) exercise splice? We have a rudimentary one only because I discovered a while back none existed at all. i.e. splice is effectively untested by xfstests. If you want to write some tests to exercise it, that'd be great.... Cheers, Dave.
On Thu 05-03-15 09:00:05, Dave Chinner wrote: > On Wed, Mar 04, 2015 at 05:18:48PM +0100, Jan Kara wrote: > > On Wed 04-03-15 10:30:24, Dave Chinner wrote: > > > From: Dave Chinner <dchinner@redhat.com> > > > > > > Add the initial support for DAX file operations to XFS. This > > > includes the necessary block allocation and mmap page fault hooks > > > for DAX to function. > > > > > > Note that the current block allocation code abuses the mapping > > > buffer head to provide a completion callback for unwritten extent > > > allocation when DAX is clearing blocks. The DAX interface needs to > > > be changed to provide a callback similar to get_blocks for this > > > callback. > > > > > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > ..... > > > +static int > > > +xfs_filemap_dax_page_mkwrite( > > > + struct vm_area_struct *vma, > > > + struct vm_fault *vmf) > > > +{ > > > + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); > > > + int error; > > > + > > > + trace_xfs_filemap_page_mkwrite(ip); > > > + > > > + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); > > So I think the lock ordering of XFS_MMAPLOCK and freezing protection is > > suspicious (and actually so is for normal write faults as I'm looking - > > didn't realize that when I was first reading your MMAPLOCK patches). > > Because you take XFS_MMAPLOCK outside of freeze protection however usually > > we want freeze protection to be the outermost lock - in particular in > > xfs_file_fallocate() you take XFS_MMAPLOCK inside freeze protection I > > think. > > OK, so why isn't lockdep triggering on that? lockdep is aware of > inode locks and the freeze states, supposedly to pick up these exact > issues... > > Oh, probably because the sb freeze order is write, pagefault, > transaction. > > i.e. In the fallocate case, we do sb_start_write, MMAP_LOCK. If we are in > a freeze case, we aren't going to freeze page faults until we've > frozen all the writes have drained, so there isn't a lock order > dependency there. 
Same for any other mnt_want_write/sb-start_write > based modification. > > Hence the fallocate path and anything that runs through setattr will > complete and release the mmap lock and then be prevented from taking > it again by the time sb_start_pagefault() can block with the mmap > lock held. So there isn't actually a deadlock there because of the > way freeze works, and that's why lockdep is staying silent. Yeah, you're right there isn't a deadlock possibility. After all the lock ranking of your MMAP_LOCK is currently the same as that of mmap_sem (and the difficult lock ordering of that semaphore has been the reason why we have a special type of freeze protection for page faults). > Still, I probably need to fix it so I'm not leaving a potential > landmine around. I would find it easier to grasp. Yes. > > So you'll need to do what ext4 needs to do - take freeze protection, take > > fs specific locks, and then call do_dax_fault(). Matthew has a patch to > > actually export do_dax_fault (as __dax_fault()) for filesystems. > > pointer to it? if none, I'll just write my own.... http://permalink.gmane.org/gmane.comp.file-systems.ext4/47866 Honza
On Thu, Mar 05, 2015 at 12:05:04PM +0100, Jan Kara wrote: > On Thu 05-03-15 09:00:05, Dave Chinner wrote: > > On Wed, Mar 04, 2015 at 05:18:48PM +0100, Jan Kara wrote: > > > On Wed 04-03-15 10:30:24, Dave Chinner wrote: > > > > From: Dave Chinner <dchinner@redhat.com> > > > > > > > > Add the initial support for DAX file operations to XFS. This > > > > includes the necessary block allocation and mmap page fault hooks > > > > for DAX to function. > > > > > > > > Note that the current block allocation code abuses the mapping > > > > buffer head to provide a completion callback for unwritten extent > > > > allocation when DAX is clearing blocks. The DAX interface needs to > > > > be changed to provide a callback similar to get_blocks for this > > > > callback. > > > > > > > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > > ..... > > > > +static int > > > > +xfs_filemap_dax_page_mkwrite( > > > > + struct vm_area_struct *vma, > > > > + struct vm_fault *vmf) > > > > +{ > > > > + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); > > > > + int error; > > > > + > > > > + trace_xfs_filemap_page_mkwrite(ip); > > > > + > > > > + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); > > > So I think the lock ordering of XFS_MMAPLOCK and freezing protection is > > > suspicious (and actually so is for normal write faults as I'm looking - > > > didn't realize that when I was first reading your MMAPLOCK patches). > > > Because you take XFS_MMAPLOCK outside of freeze protection however usually > > > we want freeze protection to be the outermost lock - in particular in > > > xfs_file_fallocate() you take XFS_MMAPLOCK inside freeze protection I > > > think. > > > > OK, so why isn't lockdep triggering on that? lockdep is aware of > > inode locks and the freeze states, supposedly to pick up these exact > > issues... > > > > Oh, probably because the sb freeze order is write, pagefault, > > transaction. > > > > i.e. In the fallocate case, we do sb_start_write, MMAP_LOCK. 
If we are in > > a freeze case, we aren't going to freeze page faults until all > > the writes have drained, so there isn't a lock order > > dependency there. Same for any other mnt_want_write/sb_start_write > > based modification. > > > > Hence the fallocate path and anything that runs through setattr will > > complete and release the mmap lock and then be prevented from taking > > it again by the time sb_start_pagefault() can block with the mmap > > lock held. So there isn't actually a deadlock there because of the > > way freeze works, and that's why lockdep is staying silent. > Yeah, you're right there isn't a deadlock possibility. After all, the lock > ranking of your MMAP_LOCK is currently the same as that of mmap_sem (and the > difficult lock ordering of that semaphore has been the reason why we have > a special type of freeze protection for page faults). > > > Still, I probably need to fix it so I'm not leaving a potential > > landmine around. > I would find it easier to grasp. Yes. Finally getting back to this. Fixed this, but... > > > > So you'll need to do what ext4 needs to do - take freeze protection, take > > > fs specific locks, and then call do_dax_fault(). Matthew has a patch to > > > actually export do_dax_fault (as __dax_fault()) for filesystems. > > > > pointer to it? if none, I'll just write my own.... > http://permalink.gmane.org/gmane.comp.file-systems.ext4/47866 I can't find any followup to this patch. Is it in any tree anywhere? Right now, I've just pulled the dax infrastructure part of the patch into my series and modified it to suit because I've fixed the unwritten extent conversion problem differently (i.e. the extra callback) and ext4 needs a lot more help to fix the problems than in that patch. I'll post the patches when I've at least smoke tested them.... Cheers, Dave.
On Thu, Mar 05, 2015 at 09:03:48AM +1100, Dave Chinner wrote: > On Wed, Mar 04, 2015 at 04:54:50PM +0200, Boaz Harrosh wrote: > > On 03/04/2015 03:01 PM, Dave Chinner wrote: > > > On Wed, Mar 04, 2015 at 12:09:40PM +0200, Boaz Harrosh wrote: > > <> > > > > > > So, we definitely need splice to/from DAX enabled inodes to be > > > rejected. I'll have a look at that... > > > > > > > default_file_splice_read uses kernel_readv which I think might actually > > work. Do you know what xfstest(s) exercise splice? > > We have a rudimentary one only because I discovered a while back > none existed at all. i.e. splice is effectively untested by > xfstests. If you want to write some tests to exercise it, that'd be > great.... Turns out there's no great need to write splice tests for xfstests - the current loopback device uses splice, and so all of the tests that run on loopback are exercising the splice path through the filesystem. I found this out by disabling splice on dax altogether, and then finding out that lots of tests failed badly, then narrowing it down to: $ sudo mount -o dax /dev/ram0 /mnt/test $ sudo mkfs.xfs -dfile,name=/mnt/test/foo1,size=1g meta-data=/mnt/test/foo1 isize=512 agcount=4, agsize=65536 blks = sectsz=512 attr=2, projid32bit=1 = crc=1 finobt=1 data = bsize=4096 blocks=262144, imaxpct=25 = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0 ftype=1 log =internal log bsize=4096 blocks=2560, version=2 = sectsz=512 sunit=0 blks, lazy-count=1 realtime =none extsz=4096 blocks=0, rtextents=0 $ sudo mount -o loop /mnt/test/foo1 /mnt/test/foo mount: /dev/loop0: can't read superblock $ because the splice read returned EINVAL rather than data. So, yes, splice can be made to work with dax if we pass it through the paths that aren't interacting directly with the page cache. Cheers, Dave.
On Tue, Mar 24, 2015 at 03:27:27PM +1100, Dave Chinner wrote: > Turns out there's no great need to write splice tests for xfstests - > the current loopback device uses splice, and so all of the tests > that run on loopback are exercising the splice path through the > filesystem. FYI, we're getting rid of the splice read (ab-)use in loop, so this won't last forever. -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 03/24/2015 06:27 AM, Dave Chinner wrote: > On Thu, Mar 05, 2015 at 09:03:48AM +1100, Dave Chinner wrote: >> On Wed, Mar 04, 2015 at 04:54:50PM +0200, Boaz Harrosh wrote: >>> On 03/04/2015 03:01 PM, Dave Chinner wrote: >>>> On Wed, Mar 04, 2015 at 12:09:40PM +0200, Boaz Harrosh wrote: >>> <> >>>> >>>> So, we definitely need splice to/from DAX enabled inodes to be >>>> rejected. I'll have a look at that... >>>> >>> >>> default_file_splice_read uses kernel_readv which I think might actually >>> work. Do you know what xfstest(s) exercise splice? >> >> We have a rudimentary one only because I discovered a while back >> none existed at all. i.e. splice is effectively untested by >> xfstests. If you want to write some tests to execise it, that'd be >> great.... > > Turns out there's no great need to write splice tests for xfstests - > the current loopback device uses splice, and so all of the tests > that run on loopback are exercising the splice path through the > filesystem. > > I found this out by disabling splice on dax altogether, and then finding out > that lots of tests failed badly, then narrowing it down to: > > $ sudo mount -o dax /dev/ram0 /mnt/test > $ sudo mkfs.xfs -dfile,name=/mnt/test/foo1,size=1g > meta-data=/mnt/test/foo1 isize=512 agcount=4, agsize=65536 blks > = sectsz=512 attr=2, projid32bit=1 > = crc=1 finobt=1 > data = bsize=4096 blocks=262144, imaxpct=25 > = sunit=0 swidth=0 blks > naming =version 2 bsize=4096 ascii-ci=0 ftype=1 > log =internal log bsize=4096 blocks=2560, version=2 > = sectsz=512 sunit=0 blks, lazy-count=1 > realtime =none extsz=4096 blocks=0, rtextents=0 > $ sudo mount -o loop /mnt/test/foo1 /mnt/test/foo > mount: /dev/loop0: can't read superblock > $ > > because the splice read returned EINVAL rather than data. So, yes, > splice canbe made to work with dax if we pass it through the paths > that aren't interacting directly with the page cache. 
> Cool, so the current dax code actually does support splice by using default_file_splice_read/write indirectly. Therefore I think there is merit in keeping just the one file_operations vector pointing to an internal function and doing an if (IS_DAX()) default_file_splice_read/write() at run time. Because with the current code, if CONFIG_FS_DAX is enabled at compile time, then the regular HD non-DAX mounts will also use the slow default_file_splice_read instead of whatever better thing the FS is doing. Do you think we should do the IS_DAX() switch at generic_file_splice_read and iter_file_splice_write to fix all the FSes in one go, or point to an internal FS function and do the switch there? Please advise. I will send up a patch that fixes up ext2 and ext4, to see what it looks like. > Cheers, > Dave. > Thanks Dave Boaz -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3a9b7a1..22cb03a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1233,13 +1233,63 @@ xfs_vm_releasepage( return try_to_free_buffers(page); } +/* + * For DAX we need a mapping buffer callback for unwritten extent conversion + * when page faults allocation blocks and then zero them. + */ +#ifdef CONFIG_FS_DAX +static struct xfs_ioend * +xfs_dax_alloc_ioend( + struct inode *inode, + xfs_off_t offset, + ssize_t size) +{ + struct xfs_ioend *ioend; + + ASSERT(IS_DAX(inode)); + ioend = xfs_alloc_ioend(inode, XFS_IO_UNWRITTEN); + ioend->io_offset = offset; + ioend->io_size = size; + return ioend; +} + +void +xfs_get_blocks_dax_complete( + struct buffer_head *bh, + int uptodate) +{ + struct xfs_ioend *ioend = bh->b_private; + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error; + + ASSERT(IS_DAX(ioend->io_inode)); + + /* if there was an error zeroing, then don't convert it */ + if (!uptodate) + goto out_free; + + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); + if (error) + xfs_warn(ip->i_mount, +"%s: conversion failed, ino 0x%llx, offset 0x%llx, len 0x%lx, error %d\n", + __func__, ip->i_ino, ioend->io_offset, + ioend->io_size, error); +out_free: + mempool_free(ioend, xfs_ioend_pool); + +} +#else +#define xfs_dax_alloc_ioend(i,o,s) NULL +#endif + STATIC int __xfs_get_blocks( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create, - int direct) + bool direct, + bool clear) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1304,6 +1354,7 @@ __xfs_get_blocks( if (error) return error; new = 1; + } else { /* * Delalloc reservations do not require a transaction, @@ -1340,7 +1391,10 @@ __xfs_get_blocks( if (create || !ISUNWRITTEN(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); if (create && ISUNWRITTEN(&imap)) { - if (direct) { + if (clear) { + bh_result->b_private = xfs_dax_alloc_ioend( + inode, offset, size); + } else 
if (direct) { bh_result->b_private = inode; set_buffer_defer_completion(bh_result); } @@ -1425,7 +1479,7 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 0); + return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); } STATIC int @@ -1435,7 +1489,17 @@ xfs_get_blocks_direct( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 1); + return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); +} + +int +xfs_get_blocks_dax( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); } /* diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index ac644e0..7c6fb3f 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -53,7 +53,12 @@ typedef struct xfs_ioend { } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; -extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); + +int xfs_get_blocks(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +int xfs_get_blocks_dax(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +void xfs_get_blocks_dax_complete(struct buffer_head *bh, int uptodate); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index bc0008f..4bfcba0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -654,7 +654,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if ((pos | count) & target->bt_logical_sectormask) + if (!IS_DAX(inode) && (pos | count) & target->bt_logical_sectormask) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -724,8 +724,11 @@ xfs_file_dio_aio_write( out: xfs_rw_iunlock(ip, iolock); - /* No 
fallback to buffered IO on errors for XFS. */ - ASSERT(ret < 0 || ret == count); + /* + * No fallback to buffered IO on errors for XFS. DAX can result in + * partial writes, but direct IO will either complete fully or fail. + */ + ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); return ret; } @@ -810,7 +813,7 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (unlikely(file->f_flags & O_DIRECT)) + if ((file->f_flags & O_DIRECT) || IS_DAX(inode)) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); @@ -1031,17 +1034,6 @@ xfs_file_readdir( return xfs_readdir(ip, ctx, bufsize); } -STATIC int -xfs_file_mmap( - struct file *filp, - struct vm_area_struct *vma) -{ - vma->vm_ops = &xfs_file_vm_ops; - - file_accessed(filp); - return 0; -} - /* * This type is designed to indicate the type of offset we would like * to search from page cache for xfs_seek_hole_data(). @@ -1466,6 +1458,71 @@ xfs_filemap_page_mkwrite( return error; } +static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = xfs_filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = xfs_filemap_page_mkwrite, +}; + +#ifdef CONFIG_FS_DAX +static int +xfs_filemap_dax_fault( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_fault(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = dax_fault(vma, vmf, xfs_get_blocks_dax, + xfs_get_blocks_dax_complete); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; +} + +static int +xfs_filemap_dax_page_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_page_mkwrite(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = dax_mkwrite(vma, vmf, xfs_get_blocks_dax, + xfs_get_blocks_dax_complete); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; +} + 
+static const struct vm_operations_struct xfs_file_dax_vm_ops = { + .fault = xfs_filemap_dax_fault, + .page_mkwrite = xfs_filemap_dax_page_mkwrite, +}; +#else +#define xfs_file_dax_vm_ops xfs_file_vm_ops +#endif /* CONFIG_FS_DAX */ + +STATIC int +xfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + file_accessed(filp); + if (IS_DAX(file_inode(filp))) { + vma->vm_ops = &xfs_file_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + } else + vma->vm_ops = &xfs_file_vm_ops; + return 0; +} + const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read = new_sync_read, @@ -1497,8 +1554,21 @@ const struct file_operations xfs_dir_file_operations = { .fsync = xfs_dir_fsync, }; -static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = xfs_filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = xfs_filemap_page_mkwrite, +#ifdef CONFIG_FS_DAX +const struct file_operations xfs_file_dax_operations = { + .llseek = xfs_file_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = xfs_file_read_iter, + .write_iter = xfs_file_write_iter, + .unlocked_ioctl = xfs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_file_compat_ioctl, +#endif + .mmap = xfs_file_mmap, + .open = xfs_file_open, + .release = xfs_file_release, + .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, }; +#endif /* CONFIG_FS_DAX */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 8b9e688..9f38142 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1264,6 +1264,10 @@ xfs_setup_inode( case S_IFREG: inode->i_op = &xfs_inode_operations; inode->i_fop = &xfs_file_operations; + if (IS_DAX(inode)) + inode->i_fop = &xfs_file_dax_operations; + else + inode->i_fop = &xfs_file_operations; inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index a0f84ab..c08983e 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -23,6 +23,12 @@ struct 
xfs_inode; extern const struct file_operations xfs_file_operations; extern const struct file_operations xfs_dir_file_operations; +#ifdef CONFIG_FS_DAX +extern const struct file_operations xfs_file_dax_operations; +#else +#define xfs_file_dax_operations xfs_file_operations +#endif + extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); /*