Message ID | 1466609236-23801-8-git-send-email-hch@lst.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, Jun 22, 2016 at 05:27:15PM +0200, Christoph Hellwig wrote: > So far the DAX code overloaded the direct I/O code path. There is very little > in common between the two, and untangling them allows to clean up both variants. > > As a ѕide effect we also get separate trace points for both I/O types. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > fs/xfs/xfs_file.c | 139 ++++++++++++++++++++++++++++++++++++++++++----------- > fs/xfs/xfs_trace.h | 2 + > 2 files changed, 112 insertions(+), 29 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 24b267d..0e74325 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -262,13 +262,11 @@ xfs_file_dio_aio_read( > else > target = ip->i_mount->m_ddev_targp; > > - if (!IS_DAX(inode)) { > - /* DIO must be aligned to device logical sector size */ > - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { > - if (iocb->ki_pos == isize) > - return 0; > - return -EINVAL; > - } > + /* DIO must be aligned to device logical sector size */ > + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { > + if (iocb->ki_pos == isize) > + return 0; > + return -EINVAL; > } > > /* > @@ -317,13 +315,37 @@ xfs_file_dio_aio_read( > } > > data = *to; > - if (IS_DAX(inode)) { > - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, > - NULL, 0); > - } else { > - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, > - xfs_get_blocks_direct, NULL, NULL, 0); > + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, > + xfs_get_blocks_direct, NULL, NULL, 0); > + if (ret > 0) { > + iocb->ki_pos += ret; > + iov_iter_advance(to, ret); > } > + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); > + > + file_accessed(iocb->ki_filp); So I noticed that generic/323 starts crashing in file_accessed -> touch_atime because iocb->ki_filp->f_path.dentry == NULL. For a while I thought it was some weird reflink bug, but I finally had time to go build a vanilla 4.8-rc8 kernel and that blew up here too. I'm not sure why this line got inserted here, since it wasn't there prior to this patch, AFAICT. <shrug> --D > + return ret; > +} > + > +STATIC ssize_t > +xfs_file_dax_read( > + struct kiocb *iocb, > + struct iov_iter *to) > +{ > + struct address_space *mapping = iocb->ki_filp->f_mapping; > + struct inode *inode = mapping->host; > + struct xfs_inode *ip = XFS_I(inode); > + struct iov_iter data = *to; > + size_t count = iov_iter_count(to); > + ssize_t ret = 0; > + > + trace_xfs_file_dax_read(ip, count, iocb->ki_pos); > + > + if (!count) > + return 0; /* skip atime */ > + > + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); > + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); > if (ret > 0) { > iocb->ki_pos += ret; > iov_iter_advance(to, ret); > @@ -356,7 +378,8 @@ xfs_file_read_iter( > struct kiocb *iocb, > struct iov_iter *to) > { > - struct xfs_mount *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount; > + struct inode *inode = file_inode(iocb->ki_filp); > + struct xfs_mount *mp = XFS_I(inode)->i_mount; > ssize_t ret = 0; > > XFS_STATS_INC(mp, xs_read_calls); > @@ -364,7 +387,9 @@ xfs_file_read_iter( > if (XFS_FORCED_SHUTDOWN(mp)) > return -EIO; > > - if (iocb->ki_flags & IOCB_DIRECT) > + if (IS_DAX(inode)) > + ret = xfs_file_dax_read(iocb, to); > + else if (iocb->ki_flags & IOCB_DIRECT) > ret = xfs_file_dio_aio_read(iocb, to); > else > ret = xfs_file_buffered_aio_read(iocb, to); > @@ -586,8 +611,7 @@ xfs_file_dio_aio_write( > mp->m_rtdev_targp : mp->m_ddev_targp; > > /* DIO must be aligned to device logical sector size */ > - if (!IS_DAX(inode) && > - ((iocb->ki_pos | count) & target->bt_logical_sectormask)) > + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) > return -EINVAL; > > /* "unaligned" here means not aligned to a filesystem block */ > @@ -656,14 +680,9 @@ xfs_file_dio_aio_write( > trace_xfs_file_direct_write(ip, count, iocb->ki_pos); > > data = *from; > - if (IS_DAX(inode)) { > - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, > - xfs_end_io_direct_write, 0); > - } else { > - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, > - xfs_get_blocks_direct, xfs_end_io_direct_write, > - NULL, DIO_ASYNC_EXTEND); > - } > + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, > + xfs_get_blocks_direct, xfs_end_io_direct_write, > + NULL, DIO_ASYNC_EXTEND); > > /* see generic_file_direct_write() for why this is necessary */ > if (mapping->nrpages) { > @@ -680,10 +699,70 @@ out: > xfs_rw_iunlock(ip, iolock); > > /* > - * No fallback to buffered IO on errors for XFS. DAX can result in > - * partial writes, but direct IO will either complete fully or fail. > + * No fallback to buffered IO on errors for XFS, direct IO will either > + * complete fully or fail. > + */ > + ASSERT(ret < 0 || ret == count); > + return ret; > +} > + > +STATIC ssize_t > +xfs_file_dax_write( > + struct kiocb *iocb, > + struct iov_iter *from) > +{ > + struct address_space *mapping = iocb->ki_filp->f_mapping; > + struct inode *inode = mapping->host; > + struct xfs_inode *ip = XFS_I(inode); > + struct xfs_mount *mp = ip->i_mount; > + ssize_t ret = 0; > + int unaligned_io = 0; > + int iolock; > + struct iov_iter data; > + > + /* "unaligned" here means not aligned to a filesystem block */ > + if ((iocb->ki_pos & mp->m_blockmask) || > + ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { > + unaligned_io = 1; > + iolock = XFS_IOLOCK_EXCL; > + } else if (mapping->nrpages) { > + iolock = XFS_IOLOCK_EXCL; > + } else { > + iolock = XFS_IOLOCK_SHARED; > + } > + xfs_rw_ilock(ip, iolock); > + > + ret = xfs_file_aio_write_checks(iocb, from, &iolock); > + if (ret) > + goto out; > + > + /* > + * Yes, even DAX files can have page cache attached to them: A zeroed > + * page is inserted into the pagecache when we have to serve a write > + * fault on a hole. It should never be dirtied and can simply be > + * dropped from the pagecache once we get real data for the page. > */ > - ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); > + if (mapping->nrpages) { > + ret = invalidate_inode_pages2(mapping); > + WARN_ON_ONCE(ret); > + } > + > + if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { > + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); > + iolock = XFS_IOLOCK_SHARED; > + } > + > + trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); > + > + data = *from; > + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, > + xfs_end_io_direct_write, 0); > + if (ret > 0) { > + iocb->ki_pos += ret; > + iov_iter_advance(from, ret); > + } > +out: > + xfs_rw_iunlock(ip, iolock); > return ret; > } > > @@ -765,7 +844,9 @@ xfs_file_write_iter( > if (XFS_FORCED_SHUTDOWN(ip->i_mount)) > return -EIO; > > - if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) > + if (IS_DAX(inode)) > + ret = xfs_file_dax_write(iocb, from); > + else if (iocb->ki_flags & IOCB_DIRECT) > ret = xfs_file_dio_aio_write(iocb, from); > else > ret = xfs_file_buffered_aio_write(iocb, from); > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > index 2504f94..1451690 100644 > --- a/fs/xfs/xfs_trace.h > +++ b/fs/xfs/xfs_trace.h > @@ -1165,8 +1165,10 @@ DEFINE_EVENT(xfs_file_class, name, \ > TP_ARGS(ip, count, offset)) > DEFINE_RW_EVENT(xfs_file_buffered_read); > DEFINE_RW_EVENT(xfs_file_direct_read); > +DEFINE_RW_EVENT(xfs_file_dax_read); > DEFINE_RW_EVENT(xfs_file_buffered_write); > DEFINE_RW_EVENT(xfs_file_direct_write); > +DEFINE_RW_EVENT(xfs_file_dax_write); > DEFINE_RW_EVENT(xfs_file_splice_read); > > DECLARE_EVENT_CLASS(xfs_page_class, > -- > 2.1.4 > > _______________________________________________ > xfs mailing list > xfs@oss.sgi.com > http://oss.sgi.com/mailman/listinfo/xfs -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Sep 28, 2016 at 07:53:52PM -0700, Darrick J. Wong wrote: > So I noticed that generic/323 starts crashing in file_accessed -> touch_atime > because iocb->ki_filp->f_path.dentry == NULL. For a while I thought it was > some weird reflink bug, but I finally had time to go build a vanilla 4.8-rc8 > kernel and that blew up here too. I'm not sure why this line got inserted > here, since it wasn't there prior to this patch, AFAICT. This line was there before near the end of xfs_file_dio_aio_read already, e.g. line 376 just before the above commit, but it only got introduced a bit earlier in "xfs: stop using generic_file_read_iter for direct I/O", which copied it over from generic_file_read_iter. І think any new issues in these commits could just be a minor timing change, as we're not changing struct file refcounting in any way here. generic/323 reproduces the last struct file reference being dropped by aio completions, so it seems like we have an issue here, which I suspect is something in the common code. I can't reproduce it locally, but looking at the aio_complete -> kiocb_free callchain and the lack of other struct file refcounting in aio.c it seems inherently unsafe to reference struct file once the completion may have run, that is after (__)blkdev_direct_IO returned. I'll see if I can come up with a solution for that, most likely that would involve moving the file_accessed call into __blkdev_direct_IO before we drop the final reference on the dio structure. -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Can you try the patch below? That just moves the file_accessed call before the I/O, similar to how we handle timestamp updates on the write side. generic_file_read_iter will also need a similar update. -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 24b267d..0e74325 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -262,13 +262,11 @@ xfs_file_dio_aio_read( else target = ip->i_mount->m_ddev_targp; - if (!IS_DAX(inode)) { - /* DIO must be aligned to device logical sector size */ - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == isize) - return 0; - return -EINVAL; - } + /* DIO must be aligned to device logical sector size */ + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { + if (iocb->ki_pos == isize) + return 0; + return -EINVAL; } /* @@ -317,13 +315,37 @@ xfs_file_dio_aio_read( } data = *to; - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, - NULL, 0); - } else { - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, NULL, NULL, 0); + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, NULL, NULL, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); } + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + file_accessed(iocb->ki_filp); + return ret; +} + +STATIC ssize_t +xfs_file_dax_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct iov_iter data = *to; + size_t count = iov_iter_count(to); + ssize_t ret = 0; + + trace_xfs_file_dax_read(ip, count, iocb->ki_pos); + + if (!count) + return 0; /* skip atime */ + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); if (ret > 0) { iocb->ki_pos += ret; iov_iter_advance(to, ret); @@ -356,7 +378,8 @@ xfs_file_read_iter( struct kiocb *iocb, struct iov_iter *to) { - struct xfs_mount *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount; + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_mount *mp = XFS_I(inode)->i_mount; ssize_t ret = 0; XFS_STATS_INC(mp, xs_read_calls); @@ -364,7 +387,9 @@ xfs_file_read_iter( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (iocb->ki_flags & IOCB_DIRECT) + if (IS_DAX(inode)) + ret = xfs_file_dax_read(iocb, to); + else if (iocb->ki_flags & IOCB_DIRECT) ret = xfs_file_dio_aio_read(iocb, to); else ret = xfs_file_buffered_aio_read(iocb, to); @@ -586,8 +611,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if (!IS_DAX(inode) && - ((iocb->ki_pos | count) & target->bt_logical_sectormask)) + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -656,14 +680,9 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, - xfs_end_io_direct_write, 0); - } else { - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, xfs_end_io_direct_write, - NULL, DIO_ASYNC_EXTEND); - } + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, xfs_end_io_direct_write, + NULL, DIO_ASYNC_EXTEND); /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { @@ -680,10 +699,70 @@ out: xfs_rw_iunlock(ip, iolock); /* - * No fallback to buffered IO on errors for XFS. DAX can result in - * partial writes, but direct IO will either complete fully or fail. + * No fallback to buffered IO on errors for XFS, direct IO will either + * complete fully or fail. + */ + ASSERT(ret < 0 || ret == count); + return ret; +} + +STATIC ssize_t +xfs_file_dax_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + int unaligned_io = 0; + int iolock; + struct iov_iter data; + + /* "unaligned" here means not aligned to a filesystem block */ + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { + unaligned_io = 1; + iolock = XFS_IOLOCK_EXCL; + } else if (mapping->nrpages) { + iolock = XFS_IOLOCK_EXCL; + } else { + iolock = XFS_IOLOCK_SHARED; + } + xfs_rw_ilock(ip, iolock); + + ret = xfs_file_aio_write_checks(iocb, from, &iolock); + if (ret) + goto out; + + /* + * Yes, even DAX files can have page cache attached to them: A zeroed + * page is inserted into the pagecache when we have to serve a write + * fault on a hole. It should never be dirtied and can simply be + * dropped from the pagecache once we get real data for the page. */ - ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); + if (mapping->nrpages) { + ret = invalidate_inode_pages2(mapping); + WARN_ON_ONCE(ret); + } + + if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); + + data = *from; + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + xfs_end_io_direct_write, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(from, ret); + } +out: + xfs_rw_iunlock(ip, iolock); return ret; } @@ -765,7 +844,9 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) + if (IS_DAX(inode)) + ret = xfs_file_dax_write(iocb, from); + else if (iocb->ki_flags & IOCB_DIRECT) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2504f94..1451690 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1165,8 +1165,10 @@ DEFINE_EVENT(xfs_file_class, name, \ TP_ARGS(ip, count, offset)) DEFINE_RW_EVENT(xfs_file_buffered_read); DEFINE_RW_EVENT(xfs_file_direct_read); +DEFINE_RW_EVENT(xfs_file_dax_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_file_splice_read); DECLARE_EVENT_CLASS(xfs_page_class,
So far the DAX code overloaded the direct I/O code path. There is very little in common between the two, and untangling them allows to clean up both variants. As a ѕide effect we also get separate trace points for both I/O types. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_file.c | 139 ++++++++++++++++++++++++++++++++++++++++++----------- fs/xfs/xfs_trace.h | 2 + 2 files changed, 112 insertions(+), 29 deletions(-)