diff mbox

[v7] fs: Fix page cache inconsistency when mixing buffered and AIO DIO

Message ID 1502803734-27706-1-git-send-email-lczerner@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Lukas Czerner Aug. 15, 2017, 1:28 p.m. UTC
Currently when mixing buffered reads and asynchronous direct writes it
is possible to end up with the situation where we have stale data in the
page cache while the new data is already written to disk. This is
permanent until the affected pages are flushed away. Despite the fact
that mixing buffered and direct IO is ill-advised, it does pose a threat
to data integrity, is unexpected and should be fixed.

Fix this by deferring completion of asynchronous direct writes to a
process context in the case that there are mapped pages to be found in
the inode. Later before the completion in dio_complete() invalidate
the pages in question. This ensures that after the completion the pages
in the written area are either unmapped, or populated with up-to-date
data. Also do the same for the iomap case which uses
iomap_dio_complete() instead.

This has a side effect of deferring the completion to a process context
for every AIO DIO that happens on an inode that has pages mapped. However,
since the consensus is that this is an ill-advised practice, the performance
implication should not be a problem.

This was based on a proposal from Jeff Moyer, thanks!

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
---
v2: Remove leftover ret variable from invalidate call in iomap_dio_complete
v3: Do not invalidate in case of error. Add some comments
v4: Remove unnecessary variable, remove unnecessary inner braces
v5: Style changes
v6: Remove redundant invalidatepage, add warning and comment
v7: Run invalidation conditionally from generic_file_direct_write()

 fs/direct-io.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
 fs/iomap.c     | 29 ++++++++++++++++-------------
 mm/filemap.c   | 10 ++++++++--
 3 files changed, 67 insertions(+), 21 deletions(-)

Comments

Jan Kara Aug. 16, 2017, 1:15 p.m. UTC | #1
On Tue 15-08-17 15:28:54, Lukas Czerner wrote:
> Currently when mixing buffered reads and asynchronous direct writes it
> is possible to end up with the situation where we have stale data in the
> page cache while the new data is already written to disk. This is
> permanent until the affected pages are flushed away. Despite the fact
> that mixing buffered and direct IO is ill-advised it does pose a thread
> for a data integrity, is unexpected and should be fixed.
> 
> Fix this by deferring completion of asynchronous direct writes to a
> process context in the case that there are mapped pages to be found in
> the inode. Later before the completion in dio_complete() invalidate
> the pages in question. This ensures that after the completion the pages
> in the written area are either unmapped, or populated with up-to-date
> data. Also do the same for the iomap case which uses
> iomap_dio_complete() instead.
> 
> This has a side effect of deferring the completion to a process context
> for every AIO DIO that happens on inode that has pages mapped. However
> since the consensus is that this is ill-advised practice the performance
> implication should not be a problem.
> 
> This was based on proposal from Jeff Moyer, thanks!
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Cc: Jeff Moyer <jmoyer@redhat.com>

Looks good to me. You can add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
> v2: Remove leftover ret variable from invalidate call in iomap_dio_complete
> v3: Do not invalidate in case of error. Add some coments
> v4: Remove unnecessary variable, remove unnecessary inner braces
> v5: Style changes
> v6: Remove redundant invalidatepage, add warning and comment
> v7: Run invalidateion conditionally from generic_file_direct_write()
> 
>  fs/direct-io.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
>  fs/iomap.c     | 29 ++++++++++++++++-------------
>  mm/filemap.c   | 10 ++++++++--
>  3 files changed, 67 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 08cf278..ffb9e19 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  {
>  	loff_t offset = dio->iocb->ki_pos;
>  	ssize_t transferred = 0;
> +	int err;
>  
>  	/*
>  	 * AIO submission can race with bio completion to get here while
> @@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  	if (ret == 0)
>  		ret = transferred;
>  
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> +	    dio->inode->i_mapping->nrpages) {
> +		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
> +					offset >> PAGE_SHIFT,
> +					(offset + ret - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(err);
> +	}
> +
>  	if (dio->end_io) {
> -		int err;
>  
>  		// XXX: ki_pos??
>  		err = dio->end_io(dio->iocb, offset, ret, dio->private);
> @@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio)
>  	struct dio *dio = bio->bi_private;
>  	unsigned long remaining;
>  	unsigned long flags;
> +	bool defer_completion = false;
>  
>  	/* cleanup the bio */
>  	dio_bio_complete(dio, bio);
> @@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio)
>  	spin_unlock_irqrestore(&dio->bio_lock, flags);
>  
>  	if (remaining == 0) {
> -		if (dio->result && dio->defer_completion) {
> +		/*
> +		 * Defer completion when defer_completion is set or
> +		 * when the inode has pages mapped and this is AIO write.
> +		 * We need to invalidate those pages because there is a
> +		 * chance they contain stale data in the case buffered IO
> +		 * went in between AIO submission and completion into the
> +		 * same region.
> +		 */
> +		if (dio->result)
> +			defer_completion = dio->defer_completion ||
> +					   (dio->op == REQ_OP_WRITE &&
> +					    dio->inode->i_mapping->nrpages);
> +		if (defer_completion) {
>  			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
>  			queue_work(dio->inode->i_sb->s_dio_done_wq,
>  				   &dio->complete_work);
> @@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
>  	 * so that we can call ->fsync.
>  	 */
> -	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
> -	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
> -	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
> -		retval = dio_set_defer_completion(dio);
> +	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
> +		retval = 0;
> +		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
> +		    IS_SYNC(iocb->ki_filp->f_mapping->host))
> +			retval = dio_set_defer_completion(dio);
> +		else if (!dio->inode->i_sb->s_dio_done_wq) {
> +			/*
> +			 * In case of AIO write racing with buffered read we
> +			 * need to defer completion. We can't decide this now,
> +			 * however the workqueue needs to be initialized here.
> +			 */
> +			retval = sb_init_dio_done_wq(dio->inode->i_sb);
> +		}
>  		if (retval) {
>  			/*
>  			 * We grab i_mutex only for reads so we don't have
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 0392661..c3e299a 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -713,8 +713,24 @@ struct iomap_dio {
>  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  {
>  	struct kiocb *iocb = dio->iocb;
> +	struct inode *inode = file_inode(iocb->ki_filp);
>  	ssize_t ret;
>  
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (!dio->error &&
> +	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
> +		ret = invalidate_inode_pages2_range(inode->i_mapping,
> +				iocb->ki_pos >> PAGE_SHIFT,
> +				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(ret);
> +	}
> +
>  	if (dio->end_io) {
>  		ret = dio->end_io(iocb,
>  				dio->error ? dio->error : dio->size,
> @@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  
>  	ret = iomap_dio_complete(dio);
>  
> -	/*
> -	 * Try again to invalidate clean pages which might have been cached by
> -	 * non-direct readahead, or faulted in by get_user_pages() if the source
> -	 * of the write was an mmap'ed region of the file we're writing.  Either
> -	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> -	 * this invalidation fails, tough, the write still worked...
> -	 */
> -	if (iov_iter_rw(iter) == WRITE) {
> -		int err = invalidate_inode_pages2_range(mapping,
> -				start >> PAGE_SHIFT, end >> PAGE_SHIFT);
> -		WARN_ON_ONCE(err);
> -	}
> -
>  	return ret;
>  
>  out_free_dio:
> diff --git a/mm/filemap.c b/mm/filemap.c
> index a497024..9440e02 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2885,9 +2885,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
>  	 * we're writing.  Either one is a pretty crazy thing to do,
>  	 * so we don't support it 100%.  If this invalidation
>  	 * fails, tough, the write still worked...
> +	 *
> +	 * Most of the time we do not need this since dio_complete() will do
> +	 * the invalidation for us. However there are some file systems that
> +	 * do not end up with dio_complete() being called, so let's not break
> +	 * them by removing it completely
>  	 */
> -	invalidate_inode_pages2_range(mapping,
> -				pos >> PAGE_SHIFT, end);
> +	if (mapping->nrpages)
> +		invalidate_inode_pages2_range(mapping,
> +					pos >> PAGE_SHIFT, end);
>  
>  	if (written > 0) {
>  		pos += written;
> -- 
> 2.7.5
>
Darrick J. Wong Aug. 16, 2017, 4:01 p.m. UTC | #2
On Tue, Aug 15, 2017 at 03:28:54PM +0200, Lukas Czerner wrote:
> Currently when mixing buffered reads and asynchronous direct writes it
> is possible to end up with the situation where we have stale data in the
> page cache while the new data is already written to disk. This is
> permanent until the affected pages are flushed away. Despite the fact
> that mixing buffered and direct IO is ill-advised it does pose a thread
> for a data integrity, is unexpected and should be fixed.
> 
> Fix this by deferring completion of asynchronous direct writes to a
> process context in the case that there are mapped pages to be found in
> the inode. Later before the completion in dio_complete() invalidate
> the pages in question. This ensures that after the completion the pages
> in the written area are either unmapped, or populated with up-to-date
> data. Also do the same for the iomap case which uses
> iomap_dio_complete() instead.
> 
> This has a side effect of deferring the completion to a process context
> for every AIO DIO that happens on inode that has pages mapped. However
> since the consensus is that this is ill-advised practice the performance
> implication should not be a problem.
> 
> This was based on proposal from Jeff Moyer, thanks!
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Cc: Jeff Moyer <jmoyer@redhat.com>

Looks ok to me,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> ---
> v2: Remove leftover ret variable from invalidate call in iomap_dio_complete
> v3: Do not invalidate in case of error. Add some coments
> v4: Remove unnecessary variable, remove unnecessary inner braces
> v5: Style changes
> v6: Remove redundant invalidatepage, add warning and comment
> v7: Run invalidateion conditionally from generic_file_direct_write()
> 
>  fs/direct-io.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
>  fs/iomap.c     | 29 ++++++++++++++++-------------
>  mm/filemap.c   | 10 ++++++++--
>  3 files changed, 67 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 08cf278..ffb9e19 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  {
>  	loff_t offset = dio->iocb->ki_pos;
>  	ssize_t transferred = 0;
> +	int err;
>  
>  	/*
>  	 * AIO submission can race with bio completion to get here while
> @@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  	if (ret == 0)
>  		ret = transferred;
>  
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> +	    dio->inode->i_mapping->nrpages) {
> +		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
> +					offset >> PAGE_SHIFT,
> +					(offset + ret - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(err);
> +	}
> +
>  	if (dio->end_io) {
> -		int err;
>  
>  		// XXX: ki_pos??
>  		err = dio->end_io(dio->iocb, offset, ret, dio->private);
> @@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio)
>  	struct dio *dio = bio->bi_private;
>  	unsigned long remaining;
>  	unsigned long flags;
> +	bool defer_completion = false;
>  
>  	/* cleanup the bio */
>  	dio_bio_complete(dio, bio);
> @@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio)
>  	spin_unlock_irqrestore(&dio->bio_lock, flags);
>  
>  	if (remaining == 0) {
> -		if (dio->result && dio->defer_completion) {
> +		/*
> +		 * Defer completion when defer_completion is set or
> +		 * when the inode has pages mapped and this is AIO write.
> +		 * We need to invalidate those pages because there is a
> +		 * chance they contain stale data in the case buffered IO
> +		 * went in between AIO submission and completion into the
> +		 * same region.
> +		 */
> +		if (dio->result)
> +			defer_completion = dio->defer_completion ||
> +					   (dio->op == REQ_OP_WRITE &&
> +					    dio->inode->i_mapping->nrpages);
> +		if (defer_completion) {
>  			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
>  			queue_work(dio->inode->i_sb->s_dio_done_wq,
>  				   &dio->complete_work);
> @@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
>  	 * so that we can call ->fsync.
>  	 */
> -	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
> -	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
> -	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
> -		retval = dio_set_defer_completion(dio);
> +	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
> +		retval = 0;
> +		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
> +		    IS_SYNC(iocb->ki_filp->f_mapping->host))
> +			retval = dio_set_defer_completion(dio);
> +		else if (!dio->inode->i_sb->s_dio_done_wq) {
> +			/*
> +			 * In case of AIO write racing with buffered read we
> +			 * need to defer completion. We can't decide this now,
> +			 * however the workqueue needs to be initialized here.
> +			 */
> +			retval = sb_init_dio_done_wq(dio->inode->i_sb);
> +		}
>  		if (retval) {
>  			/*
>  			 * We grab i_mutex only for reads so we don't have
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 0392661..c3e299a 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -713,8 +713,24 @@ struct iomap_dio {
>  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  {
>  	struct kiocb *iocb = dio->iocb;
> +	struct inode *inode = file_inode(iocb->ki_filp);
>  	ssize_t ret;
>  
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (!dio->error &&
> +	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
> +		ret = invalidate_inode_pages2_range(inode->i_mapping,
> +				iocb->ki_pos >> PAGE_SHIFT,
> +				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(ret);
> +	}
> +
>  	if (dio->end_io) {
>  		ret = dio->end_io(iocb,
>  				dio->error ? dio->error : dio->size,
> @@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  
>  	ret = iomap_dio_complete(dio);
>  
> -	/*
> -	 * Try again to invalidate clean pages which might have been cached by
> -	 * non-direct readahead, or faulted in by get_user_pages() if the source
> -	 * of the write was an mmap'ed region of the file we're writing.  Either
> -	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> -	 * this invalidation fails, tough, the write still worked...
> -	 */
> -	if (iov_iter_rw(iter) == WRITE) {
> -		int err = invalidate_inode_pages2_range(mapping,
> -				start >> PAGE_SHIFT, end >> PAGE_SHIFT);
> -		WARN_ON_ONCE(err);
> -	}
> -
>  	return ret;
>  
>  out_free_dio:
> diff --git a/mm/filemap.c b/mm/filemap.c
> index a497024..9440e02 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2885,9 +2885,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
>  	 * we're writing.  Either one is a pretty crazy thing to do,
>  	 * so we don't support it 100%.  If this invalidation
>  	 * fails, tough, the write still worked...
> +	 *
> +	 * Most of the time we do not need this since dio_complete() will do
> +	 * the invalidation for us. However there are some file systems that
> +	 * do not end up with dio_complete() being called, so let's not break
> +	 * them by removing it completely
>  	 */
> -	invalidate_inode_pages2_range(mapping,
> -				pos >> PAGE_SHIFT, end);
> +	if (mapping->nrpages)
> +		invalidate_inode_pages2_range(mapping,
> +					pos >> PAGE_SHIFT, end);
>  
>  	if (written > 0) {
>  		pos += written;
> -- 
> 2.7.5
>
Jeff Moyer Sept. 21, 2017, 1:44 p.m. UTC | #3
Lukas Czerner <lczerner@redhat.com> writes:

> Currently when mixing buffered reads and asynchronous direct writes it
> is possible to end up with the situation where we have stale data in the
> page cache while the new data is already written to disk. This is
> permanent until the affected pages are flushed away. Despite the fact
> that mixing buffered and direct IO is ill-advised it does pose a thread
> for a data integrity, is unexpected and should be fixed.
>
> Fix this by deferring completion of asynchronous direct writes to a
> process context in the case that there are mapped pages to be found in
> the inode. Later before the completion in dio_complete() invalidate
> the pages in question. This ensures that after the completion the pages
> in the written area are either unmapped, or populated with up-to-date
> data. Also do the same for the iomap case which uses
> iomap_dio_complete() instead.
>
> This has a side effect of deferring the completion to a process context
> for every AIO DIO that happens on inode that has pages mapped. However
> since the consensus is that this is ill-advised practice the performance
> implication should not be a problem.
>
> This was based on proposal from Jeff Moyer, thanks!
>
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Cc: Jeff Moyer <jmoyer@redhat.com>

Is this still in limbo?

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>

> ---
> v2: Remove leftover ret variable from invalidate call in iomap_dio_complete
> v3: Do not invalidate in case of error. Add some coments
> v4: Remove unnecessary variable, remove unnecessary inner braces
> v5: Style changes
> v6: Remove redundant invalidatepage, add warning and comment
> v7: Run invalidateion conditionally from generic_file_direct_write()
>
>  fs/direct-io.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
>  fs/iomap.c     | 29 ++++++++++++++++-------------
>  mm/filemap.c   | 10 ++++++++--
>  3 files changed, 67 insertions(+), 21 deletions(-)
>
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 08cf278..ffb9e19 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  {
>  	loff_t offset = dio->iocb->ki_pos;
>  	ssize_t transferred = 0;
> +	int err;
>  
>  	/*
>  	 * AIO submission can race with bio completion to get here while
> @@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  	if (ret == 0)
>  		ret = transferred;
>  
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> +	    dio->inode->i_mapping->nrpages) {
> +		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
> +					offset >> PAGE_SHIFT,
> +					(offset + ret - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(err);
> +	}
> +
>  	if (dio->end_io) {
> -		int err;
>  
>  		// XXX: ki_pos??
>  		err = dio->end_io(dio->iocb, offset, ret, dio->private);
> @@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio)
>  	struct dio *dio = bio->bi_private;
>  	unsigned long remaining;
>  	unsigned long flags;
> +	bool defer_completion = false;
>  
>  	/* cleanup the bio */
>  	dio_bio_complete(dio, bio);
> @@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio)
>  	spin_unlock_irqrestore(&dio->bio_lock, flags);
>  
>  	if (remaining == 0) {
> -		if (dio->result && dio->defer_completion) {
> +		/*
> +		 * Defer completion when defer_completion is set or
> +		 * when the inode has pages mapped and this is AIO write.
> +		 * We need to invalidate those pages because there is a
> +		 * chance they contain stale data in the case buffered IO
> +		 * went in between AIO submission and completion into the
> +		 * same region.
> +		 */
> +		if (dio->result)
> +			defer_completion = dio->defer_completion ||
> +					   (dio->op == REQ_OP_WRITE &&
> +					    dio->inode->i_mapping->nrpages);
> +		if (defer_completion) {
>  			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
>  			queue_work(dio->inode->i_sb->s_dio_done_wq,
>  				   &dio->complete_work);
> @@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
>  	 * so that we can call ->fsync.
>  	 */
> -	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
> -	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
> -	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
> -		retval = dio_set_defer_completion(dio);
> +	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
> +		retval = 0;
> +		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
> +		    IS_SYNC(iocb->ki_filp->f_mapping->host))
> +			retval = dio_set_defer_completion(dio);
> +		else if (!dio->inode->i_sb->s_dio_done_wq) {
> +			/*
> +			 * In case of AIO write racing with buffered read we
> +			 * need to defer completion. We can't decide this now,
> +			 * however the workqueue needs to be initialized here.
> +			 */
> +			retval = sb_init_dio_done_wq(dio->inode->i_sb);
> +		}
>  		if (retval) {
>  			/*
>  			 * We grab i_mutex only for reads so we don't have
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 0392661..c3e299a 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -713,8 +713,24 @@ struct iomap_dio {
>  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  {
>  	struct kiocb *iocb = dio->iocb;
> +	struct inode *inode = file_inode(iocb->ki_filp);
>  	ssize_t ret;
>  
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (!dio->error &&
> +	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
> +		ret = invalidate_inode_pages2_range(inode->i_mapping,
> +				iocb->ki_pos >> PAGE_SHIFT,
> +				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(ret);
> +	}
> +
>  	if (dio->end_io) {
>  		ret = dio->end_io(iocb,
>  				dio->error ? dio->error : dio->size,
> @@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  
>  	ret = iomap_dio_complete(dio);
>  
> -	/*
> -	 * Try again to invalidate clean pages which might have been cached by
> -	 * non-direct readahead, or faulted in by get_user_pages() if the source
> -	 * of the write was an mmap'ed region of the file we're writing.  Either
> -	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> -	 * this invalidation fails, tough, the write still worked...
> -	 */
> -	if (iov_iter_rw(iter) == WRITE) {
> -		int err = invalidate_inode_pages2_range(mapping,
> -				start >> PAGE_SHIFT, end >> PAGE_SHIFT);
> -		WARN_ON_ONCE(err);
> -	}
> -
>  	return ret;
>  
>  out_free_dio:
> diff --git a/mm/filemap.c b/mm/filemap.c
> index a497024..9440e02 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2885,9 +2885,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
>  	 * we're writing.  Either one is a pretty crazy thing to do,
>  	 * so we don't support it 100%.  If this invalidation
>  	 * fails, tough, the write still worked...
> +	 *
> +	 * Most of the time we do not need this since dio_complete() will do
> +	 * the invalidation for us. However there are some file systems that
> +	 * do not end up with dio_complete() being called, so let's not break
> +	 * them by removing it completely
>  	 */
> -	invalidate_inode_pages2_range(mapping,
> -				pos >> PAGE_SHIFT, end);
> +	if (mapping->nrpages)
> +		invalidate_inode_pages2_range(mapping,
> +					pos >> PAGE_SHIFT, end);
>  
>  	if (written > 0) {
>  		pos += written;
Lukas Czerner Sept. 21, 2017, 1:44 p.m. UTC | #4
Al, Jens,

can either of you please take this through your tree?

Thanks!
-Lukas

On Tue, Aug 15, 2017 at 03:28:54PM +0200, Lukas Czerner wrote:
> Currently when mixing buffered reads and asynchronous direct writes it
> is possible to end up with the situation where we have stale data in the
> page cache while the new data is already written to disk. This is
> permanent until the affected pages are flushed away. Despite the fact
> that mixing buffered and direct IO is ill-advised it does pose a thread
> for a data integrity, is unexpected and should be fixed.
> 
> Fix this by deferring completion of asynchronous direct writes to a
> process context in the case that there are mapped pages to be found in
> the inode. Later before the completion in dio_complete() invalidate
> the pages in question. This ensures that after the completion the pages
> in the written area are either unmapped, or populated with up-to-date
> data. Also do the same for the iomap case which uses
> iomap_dio_complete() instead.
> 
> This has a side effect of deferring the completion to a process context
> for every AIO DIO that happens on inode that has pages mapped. However
> since the consensus is that this is ill-advised practice the performance
> implication should not be a problem.
> 
> This was based on proposal from Jeff Moyer, thanks!
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Cc: Jeff Moyer <jmoyer@redhat.com>
> ---
> v2: Remove leftover ret variable from invalidate call in iomap_dio_complete
> v3: Do not invalidate in case of error. Add some coments
> v4: Remove unnecessary variable, remove unnecessary inner braces
> v5: Style changes
> v6: Remove redundant invalidatepage, add warning and comment
> v7: Run invalidateion conditionally from generic_file_direct_write()
> 
>  fs/direct-io.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
>  fs/iomap.c     | 29 ++++++++++++++++-------------
>  mm/filemap.c   | 10 ++++++++--
>  3 files changed, 67 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 08cf278..ffb9e19 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  {
>  	loff_t offset = dio->iocb->ki_pos;
>  	ssize_t transferred = 0;
> +	int err;
>  
>  	/*
>  	 * AIO submission can race with bio completion to get here while
> @@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  	if (ret == 0)
>  		ret = transferred;
>  
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> +	    dio->inode->i_mapping->nrpages) {
> +		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
> +					offset >> PAGE_SHIFT,
> +					(offset + ret - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(err);
> +	}
> +
>  	if (dio->end_io) {
> -		int err;
>  
>  		// XXX: ki_pos??
>  		err = dio->end_io(dio->iocb, offset, ret, dio->private);
> @@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio)
>  	struct dio *dio = bio->bi_private;
>  	unsigned long remaining;
>  	unsigned long flags;
> +	bool defer_completion = false;
>  
>  	/* cleanup the bio */
>  	dio_bio_complete(dio, bio);
> @@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio)
>  	spin_unlock_irqrestore(&dio->bio_lock, flags);
>  
>  	if (remaining == 0) {
> -		if (dio->result && dio->defer_completion) {
> +		/*
> +		 * Defer completion when defer_completion is set or
> +		 * when the inode has pages mapped and this is AIO write.
> +		 * We need to invalidate those pages because there is a
> +		 * chance they contain stale data in the case buffered IO
> +		 * went in between AIO submission and completion into the
> +		 * same region.
> +		 */
> +		if (dio->result)
> +			defer_completion = dio->defer_completion ||
> +					   (dio->op == REQ_OP_WRITE &&
> +					    dio->inode->i_mapping->nrpages);
> +		if (defer_completion) {
>  			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
>  			queue_work(dio->inode->i_sb->s_dio_done_wq,
>  				   &dio->complete_work);
> @@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
>  	 * so that we can call ->fsync.
>  	 */
> -	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
> -	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
> -	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
> -		retval = dio_set_defer_completion(dio);
> +	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
> +		retval = 0;
> +		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
> +		    IS_SYNC(iocb->ki_filp->f_mapping->host))
> +			retval = dio_set_defer_completion(dio);
> +		else if (!dio->inode->i_sb->s_dio_done_wq) {
> +			/*
> +			 * In case of AIO write racing with buffered read we
> +			 * need to defer completion. We can't decide this now,
> +			 * however the workqueue needs to be initialized here.
> +			 */
> +			retval = sb_init_dio_done_wq(dio->inode->i_sb);
> +		}
>  		if (retval) {
>  			/*
>  			 * We grab i_mutex only for reads so we don't have
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 0392661..c3e299a 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -713,8 +713,24 @@ struct iomap_dio {
>  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  {
>  	struct kiocb *iocb = dio->iocb;
> +	struct inode *inode = file_inode(iocb->ki_filp);
>  	ssize_t ret;
>  
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (!dio->error &&
> +	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
> +		ret = invalidate_inode_pages2_range(inode->i_mapping,
> +				iocb->ki_pos >> PAGE_SHIFT,
> +				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(ret);
> +	}
> +
>  	if (dio->end_io) {
>  		ret = dio->end_io(iocb,
>  				dio->error ? dio->error : dio->size,
> @@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  
>  	ret = iomap_dio_complete(dio);
>  
> -	/*
> -	 * Try again to invalidate clean pages which might have been cached by
> -	 * non-direct readahead, or faulted in by get_user_pages() if the source
> -	 * of the write was an mmap'ed region of the file we're writing.  Either
> -	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> -	 * this invalidation fails, tough, the write still worked...
> -	 */
> -	if (iov_iter_rw(iter) == WRITE) {
> -		int err = invalidate_inode_pages2_range(mapping,
> -				start >> PAGE_SHIFT, end >> PAGE_SHIFT);
> -		WARN_ON_ONCE(err);
> -	}
> -
>  	return ret;
>  
>  out_free_dio:
> diff --git a/mm/filemap.c b/mm/filemap.c
> index a497024..9440e02 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2885,9 +2885,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
>  	 * we're writing.  Either one is a pretty crazy thing to do,
>  	 * so we don't support it 100%.  If this invalidation
>  	 * fails, tough, the write still worked...
> +	 *
> +	 * Most of the time we do not need this since dio_complete() will do
> +	 * the invalidation for us. However there are some file systems that
> +	 * do not end up with dio_complete() being called, so let's not break
> +	 * them by removing it completely
>  	 */
> -	invalidate_inode_pages2_range(mapping,
> -				pos >> PAGE_SHIFT, end);
> +	if (mapping->nrpages)
> +		invalidate_inode_pages2_range(mapping,
> +					pos >> PAGE_SHIFT, end);
>  
>  	if (written > 0) {
>  		pos += written;
> -- 
> 2.7.5
>
Jens Axboe Sept. 21, 2017, 2:14 p.m. UTC | #5
On 09/21/2017 07:44 AM, Lukas Czerner wrote:
> Al, Jens,
> 
> can either of you please take this through your tree?

I can take it, it's well reviewed at this point (to say the least).
David Sterba Oct. 10, 2017, 2:34 p.m. UTC | #6
On Tue, Aug 15, 2017 at 03:28:54PM +0200, Lukas Czerner wrote:
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> +	    dio->inode->i_mapping->nrpages) {
> +		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
> +					offset >> PAGE_SHIFT,
> +					(offset + ret - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(err);

fstests/btrfs/062 reports this:

[ 6235.547298] ------------[ cut here ]------------
[ 6235.552098] WARNING: CPU: 7 PID: 24321 at fs/direct-io.c:274 dio_complete+0x16f/0x1f0
[ 6235.560858] Modules linked in: dm_flakey loop rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace sunrpc fscache af_packet br_netfilter bridge stp llc iscsi_ibft iscsi_boot_sysfs btrfs xor zstd_decompress zstd_compress i2c_algo_bit drm_kms_helper xxhash zlib_deflate raid6_pq syscopyarea sysfillrect sysimgblt fb_sys_fops ttm tg3 drm dm_mod dax ptp kvm_amd pps_core kvm libphy tpm_infineon mptctl shpchp k10temp tpm_tis tpm_tis_core button i2c_piix4 tpm pcspkr irqbypass acpi_cpufreq ext4 mbcache jbd2 sr_mod cdrom ohci_pci ehci_pci ohci_hcd mptsas ehci_hcd scsi_transport_sas ata_generic mptscsih serio_raw mptbase usbcore sata_svw pata_serverworks sg scsi_dh_rdac scsi_dh_emc scsi_dh_alua
[ 6235.560942] CPU: 7 PID: 24321 Comm: kworker/7:1 Not tainted 4.14.0-rc4-1.ge195904-vanilla+ #71
[ 6235.560944] Hardware name: empty empty/S3993, BIOS PAQEX0-3 02/24/2008
[ 6235.560950] Workqueue: dio/sdb6 dio_aio_complete_work
[ 6235.560953] task: ffff894fe0bd8300 task.stack: ffffb45742f7c000
[ 6235.560957] RIP: 0010:dio_complete+0x16f/0x1f0
[ 6235.560959] RSP: 0018:ffffb45742f7fde8 EFLAGS: 00010286
[ 6235.560968] RAX: 00000000fffffff0 RBX: ffff894fd1e3a680 RCX: ffff894fe0bd8300
[ 6235.560970] RDX: 0000000000000000 RSI: 00000000000002b4 RDI: ffffffffaba438e9
[ 6235.560971] RBP: ffffb45742f7fe10 R08: 0000000000000000 R09: 0000000000000025
[ 6235.560973] R10: 0000000000000000 R11: 0000000000000000 R12: 000000000001e000
[ 6235.560974] R13: 000000000001e000 R14: 0000000000007000 R15: 0000000000000001
[ 6235.560977] FS:  0000000000000000(0000) GS:ffff894fefdc0000(0000) knlGS:0000000000000000
[ 6235.560978] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 6235.560980] CR2: 00007fe1e1dfb610 CR3: 0000000213ee7000 CR4: 00000000000006e0
[ 6235.560982] Call Trace:
[ 6235.561075]  dio_aio_complete_work+0x1c/0x20
[ 6235.561082]  process_one_work+0x1d8/0x620
[ 6235.561085]  ? process_one_work+0x14b/0x620
[ 6235.561092]  worker_thread+0x4d/0x3c0
[ 6235.561097]  ? trace_hardirqs_on+0xd/0x10
[ 6235.561105]  kthread+0x152/0x190
[ 6235.561107]  ? process_one_work+0x620/0x620
[ 6235.561111]  ? kthread_create_on_node+0x40/0x40
[ 6235.561116]  ? do_syscall_64+0x69/0x180
[ 6235.561122]  ret_from_fork+0x2a/0x40
[ 6235.561131] Code: 48 83 bf 00 01 00 00 00 0f 84 37 ff ff ff 4b 8d 54 34 ff 4c 89 f6 48 c1 fe 0c 48 c1 fa 0c e8 49 82 f2 ff 85 c0 0f 84 1a ff ff ff <0f> ff e9 13 ff ff ff 48 81 c7 e0 00 00 00 be 09 00 00 00 e8 79
[ 6235.561179] ---[ end trace ba80cd81f19cb389 ]---

I've added Chris and Bo to CC in case they have more to say about the specifics of
dio and buffered writes as implemented in btrfs.
Lukas Czerner Oct. 11, 2017, 9:21 a.m. UTC | #7
On Tue, Oct 10, 2017 at 04:34:45PM +0200, David Sterba wrote:
> On Tue, Aug 15, 2017 at 03:28:54PM +0200, Lukas Czerner wrote:
> > +	/*
> > +	 * Try again to invalidate clean pages which might have been cached by
> > +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> > +	 * of the write was an mmap'ed region of the file we're writing.  Either
> > +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> > +	 * this invalidation fails, tough, the write still worked...
> > +	 */
> > +	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> > +	    dio->inode->i_mapping->nrpages) {
> > +		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
> > +					offset >> PAGE_SHIFT,
> > +					(offset + ret - 1) >> PAGE_SHIFT);
> > +		WARN_ON_ONCE(err);
> 
> fstests/btrfs/062 reports this:
> 
> [ 6235.547298] ------------[ cut here ]------------
> [ 6235.552098] WARNING: CPU: 7 PID: 24321 at fs/direct-io.c:274 dio_complete+0x16f/0x1f0
> [ 6235.560858] Modules linked in: dm_flakey loop rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace sunrpc fscache af_packet br_netfilter bridge stp llc iscsi_ibft iscsi_boot_sysfs btrfs xor zstd_decompress zstd_compress i2c_algo_bit drm_kms_helper xxhash zlib_deflate raid6_pq syscopyarea sysfillrect sysimgblt fb_sys_fops ttm tg3 drm dm_mod dax ptp kvm_amd pps_core kvm libphy tpm_infineon mptctl shpchp k10temp tpm_tis tpm_tis_core button i2c_piix4 tpm pcspkr irqbypass acpi_cpufreq ext4 mbcache jbd2 sr_mod cdrom ohci_pci ehci_pci ohci_hcd mptsas ehci_hcd scsi_transport_sas ata_generic mptscsih serio_raw mptbase usbcore sata_svw pata_serverworks sg scsi_dh_rdac scsi_dh_emc scsi_dh_alua
> [ 6235.560942] CPU: 7 PID: 24321 Comm: kworker/7:1 Not tainted 4.14.0-rc4-1.ge195904-vanilla+ #71
> [ 6235.560944] Hardware name: empty empty/S3993, BIOS PAQEX0-3 02/24/2008
> [ 6235.560950] Workqueue: dio/sdb6 dio_aio_complete_work
> [ 6235.560953] task: ffff894fe0bd8300 task.stack: ffffb45742f7c000
> [ 6235.560957] RIP: 0010:dio_complete+0x16f/0x1f0
> [ 6235.560959] RSP: 0018:ffffb45742f7fde8 EFLAGS: 00010286
> [ 6235.560968] RAX: 00000000fffffff0 RBX: ffff894fd1e3a680 RCX: ffff894fe0bd8300
> [ 6235.560970] RDX: 0000000000000000 RSI: 00000000000002b4 RDI: ffffffffaba438e9
> [ 6235.560971] RBP: ffffb45742f7fe10 R08: 0000000000000000 R09: 0000000000000025
> [ 6235.560973] R10: 0000000000000000 R11: 0000000000000000 R12: 000000000001e000
> [ 6235.560974] R13: 000000000001e000 R14: 0000000000007000 R15: 0000000000000001
> [ 6235.560977] FS:  0000000000000000(0000) GS:ffff894fefdc0000(0000) knlGS:0000000000000000
> [ 6235.560978] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 6235.560980] CR2: 00007fe1e1dfb610 CR3: 0000000213ee7000 CR4: 00000000000006e0
> [ 6235.560982] Call Trace:
> [ 6235.561075]  dio_aio_complete_work+0x1c/0x20
> [ 6235.561082]  process_one_work+0x1d8/0x620
> [ 6235.561085]  ? process_one_work+0x14b/0x620
> [ 6235.561092]  worker_thread+0x4d/0x3c0
> [ 6235.561097]  ? trace_hardirqs_on+0xd/0x10
> [ 6235.561105]  kthread+0x152/0x190
> [ 6235.561107]  ? process_one_work+0x620/0x620
> [ 6235.561111]  ? kthread_create_on_node+0x40/0x40
> [ 6235.561116]  ? do_syscall_64+0x69/0x180
> [ 6235.561122]  ret_from_fork+0x2a/0x40
> [ 6235.561131] Code: 48 83 bf 00 01 00 00 00 0f 84 37 ff ff ff 4b 8d 54 34 ff 4c 89 f6 48 c1 fe 0c 48 c1 fa 0c e8 49 82 f2 ff 85 c0 0f 84 1a ff ff ff <0f> ff e9 13 ff ff ff 48 81 c7 e0 00 00 00 be 09 00 00 00 e8 79
> [ 6235.561179] ---[ end trace ba80cd81f19cb389 ]---
> 
> I've added Chris and Bo to CC in case they have more to say about the specifics of
> dio and buffered writes as implemented in btrfs.

There are several places where we warn on invalidation failing. The
question is why invalidation failed on btrfs in this test.

-Lukas
diff mbox

Patch

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 08cf278..ffb9e19 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -229,6 +229,7 @@  static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 {
 	loff_t offset = dio->iocb->ki_pos;
 	ssize_t transferred = 0;
+	int err;
 
 	/*
 	 * AIO submission can race with bio completion to get here while
@@ -258,8 +259,22 @@  static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 	if (ret == 0)
 		ret = transferred;
 
+	/*
+	 * Try again to invalidate clean pages which might have been cached by
+	 * non-direct readahead, or faulted in by get_user_pages() if the source
+	 * of the write was an mmap'ed region of the file we're writing.  Either
+	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
+	 * this invalidation fails, tough, the write still worked...
+	 */
+	if (ret > 0 && dio->op == REQ_OP_WRITE &&
+	    dio->inode->i_mapping->nrpages) {
+		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
+					offset >> PAGE_SHIFT,
+					(offset + ret - 1) >> PAGE_SHIFT);
+		WARN_ON_ONCE(err);
+	}
+
 	if (dio->end_io) {
-		int err;
 
 		// XXX: ki_pos??
 		err = dio->end_io(dio->iocb, offset, ret, dio->private);
@@ -304,6 +319,7 @@  static void dio_bio_end_aio(struct bio *bio)
 	struct dio *dio = bio->bi_private;
 	unsigned long remaining;
 	unsigned long flags;
+	bool defer_completion = false;
 
 	/* cleanup the bio */
 	dio_bio_complete(dio, bio);
@@ -315,7 +331,19 @@  static void dio_bio_end_aio(struct bio *bio)
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
 	if (remaining == 0) {
-		if (dio->result && dio->defer_completion) {
+		/*
+		 * Defer completion when defer_completion is set or
+		 * when the inode has pages mapped and this is AIO write.
+		 * We need to invalidate those pages because there is a
+		 * chance they contain stale data in the case buffered IO
+		 * went in between AIO submission and completion into the
+		 * same region.
+		 */
+		if (dio->result)
+			defer_completion = dio->defer_completion ||
+					   (dio->op == REQ_OP_WRITE &&
+					    dio->inode->i_mapping->nrpages);
+		if (defer_completion) {
 			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
 			queue_work(dio->inode->i_sb->s_dio_done_wq,
 				   &dio->complete_work);
@@ -1210,10 +1238,19 @@  do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
 	 * so that we can call ->fsync.
 	 */
-	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
-	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
-	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
-		retval = dio_set_defer_completion(dio);
+	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
+		retval = 0;
+		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
+		    IS_SYNC(iocb->ki_filp->f_mapping->host))
+			retval = dio_set_defer_completion(dio);
+		else if (!dio->inode->i_sb->s_dio_done_wq) {
+			/*
+			 * In case of AIO write racing with buffered read we
+			 * need to defer completion. We can't decide this now,
+			 * however the workqueue needs to be initialized here.
+			 */
+			retval = sb_init_dio_done_wq(dio->inode->i_sb);
+		}
 		if (retval) {
 			/*
 			 * We grab i_mutex only for reads so we don't have
diff --git a/fs/iomap.c b/fs/iomap.c
index 0392661..c3e299a 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -713,8 +713,24 @@  struct iomap_dio {
 static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 {
 	struct kiocb *iocb = dio->iocb;
+	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
 
+	/*
+	 * Try again to invalidate clean pages which might have been cached by
+	 * non-direct readahead, or faulted in by get_user_pages() if the source
+	 * of the write was an mmap'ed region of the file we're writing.  Either
+	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
+	 * this invalidation fails, tough, the write still worked...
+	 */
+	if (!dio->error &&
+	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
+		ret = invalidate_inode_pages2_range(inode->i_mapping,
+				iocb->ki_pos >> PAGE_SHIFT,
+				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+	}
+
 	if (dio->end_io) {
 		ret = dio->end_io(iocb,
 				dio->error ? dio->error : dio->size,
@@ -1042,19 +1058,6 @@  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 
 	ret = iomap_dio_complete(dio);
 
-	/*
-	 * Try again to invalidate clean pages which might have been cached by
-	 * non-direct readahead, or faulted in by get_user_pages() if the source
-	 * of the write was an mmap'ed region of the file we're writing.  Either
-	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
-	 * this invalidation fails, tough, the write still worked...
-	 */
-	if (iov_iter_rw(iter) == WRITE) {
-		int err = invalidate_inode_pages2_range(mapping,
-				start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-		WARN_ON_ONCE(err);
-	}
-
 	return ret;
 
 out_free_dio:
diff --git a/mm/filemap.c b/mm/filemap.c
index a497024..9440e02 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2885,9 +2885,15 @@  generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	 * we're writing.  Either one is a pretty crazy thing to do,
 	 * so we don't support it 100%.  If this invalidation
 	 * fails, tough, the write still worked...
+	 *
+	 * Most of the time we do not need this since dio_complete() will do
+	 * the invalidation for us. However there are some file systems that
+	 * do not end up with dio_complete() being called, so let's not break
+	 * them by removing it completely
 	 */
-	invalidate_inode_pages2_range(mapping,
-				pos >> PAGE_SHIFT, end);
+	if (mapping->nrpages)
+		invalidate_inode_pages2_range(mapping,
+					pos >> PAGE_SHIFT, end);
 
 	if (written > 0) {
 		pos += written;