
[RFC] iomap: fix race between readahead and direct write

Message ID 20200116063601.39201-1-yukuai3@huawei.com (mailing list archive)
State New, archived
Series [RFC] iomap: fix race between readahead and direct write

Commit Message

Yu Kuai Jan. 16, 2020, 6:36 a.m. UTC
I noticed that the generic/418 test may fail with a small probability. With
further investigation, it can be reproduced with:

./src/dio-invalidate-cache -wp -b 4096 -n 8 -i 1 -f filename
./src/dio-invalidate-cache -wt -b 4096 -n 8 -i 1 -f filename

The failure is that the direct write wrote non-zero data but the buffered
read got zeros.

In the process of a buffered read, if the page does not exist, readahead
will be triggered.  __do_page_cache_readahead() will allocate the page first.
Next, if the file block is unwritten (or a hole), iomap_begin() will set
iomap->type to IOMAP_UNWRITTEN (or IOMAP_HOLE). Then, iomap_readpages_actor()
will add the page to the page cache. Finally, iomap_readpage_actor() will
zero the page.

However, there is no lock or other serialization against direct write
between initializing the iomap and adding the page to the page cache. If a
direct write happens to finish between the two, the iomap type should have
been IOMAP_MAPPED instead of IOMAP_UNWRITTEN or IOMAP_HOLE. The page then
ends up zeroed out in the page cache, while the blocks on disk hold the data
written by the direct write.

| thread 1                    | thread 2                   |
| --------------------------  | -------------------------- |
| generic_file_buffered_read  |                            |
|  ondemand_readahead         |                            |
|   read_pages                |                            |
|    iomap_readpages          |                            |
|     iomap_apply             |                            |
|      xfs_read_iomap_begin   |                            |
|                             | xfs_file_dio_aio_write     |
|                             |  iomap_dio_rw              |
|                             |   iomap_apply              |
|     iomap_readpages_actor   |                            |
|      iomap_next_page        |                            |
|       add_to_page_cache_lru |                            |
|      iomap_readpage_actor   |                            |
|       zero_user             |                            |
|    iomap_set_range_uptodate |                            |
|                             | generic_file_buffered_read |
|                             |  copy_page_to_iter         |

As a consequence, the content of the page in the page cache is zero while
the content on disk is not.
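
The racing access pattern boils down to something like the following
user-space sketch (only an illustration of the pattern, not the actual
reproducer; the file layout, sizes and the missing error handling are
simplifying assumptions):

#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define BS 4096

static const char *path;

static void *dio_writer(void *arg)
{
	void *buf;
	int fd = open(path, O_WRONLY | O_DIRECT);

	posix_memalign(&buf, BS, BS);
	memset(buf, 0xaa, BS);		/* non-zero pattern */
	pwrite(fd, buf, BS, 0);		/* direct write into the hole */
	close(fd);
	free(buf);
	return NULL;
}

static void *buffered_reader(void *arg)
{
	static char buf[BS];
	int fd = open(path, O_RDONLY);

	pread(fd, buf, BS, 0);		/* buffered read, triggers readahead */
	/* if the race hits, buf is all zeroes although disk holds 0xaa */
	close(fd);
	return NULL;
}

int main(int argc, char **argv)
{
	pthread_t w, r;
	int fd;

	path = argv[1];
	fd = open(path, O_CREAT | O_TRUNC | O_RDWR, 0644);
	ftruncate(fd, BS);		/* offset 0 starts out as a hole */
	close(fd);

	pthread_create(&r, NULL, buffered_reader, NULL);
	pthread_create(&w, NULL, dio_writer, NULL);
	pthread_join(r, NULL);
	pthread_join(w, NULL);
	return 0;
}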

I tried to fix the problem by moving "add to page cache" before
iomap_begin(). However, performance might be worse since iomap_begin() will
then be called for each page. I tested the performance of sequential reads
with fio:

kernel version: v5.5-rc6
platform: arm64, 96 cpu
fio version: fio-3.15-2
test cmd:
fio -filename=/mnt/testfile -rw=read -bs=4k -size=20g -direct=0 -fsync=0
-numjobs=1 / 32 -ioengine=libaio -name=test -ramp_time=10 -runtime=120
test result:
|                  | without patch MiB/s | with patch MiB/s |
| ---------------- | ------------------- | ---------------- |
| ssd, numjobs=1   | 512                 | 512              |
| ssd, numjobs=32  | 3615                | 3714             |
| nvme, numjobs=1  | 1167                | 1118             |
| nvme, numjobs=32 | 3679                | 3606             |

The test results show that the impact on performance is minimal.

Signed-off-by: yu kuai <yukuai3@huawei.com>
---
 fs/iomap/buffered-io.c | 104 ++++++++++++++++++++---------------------
 1 file changed, 52 insertions(+), 52 deletions(-)

Comments

Jan Kara Jan. 16, 2020, 3:32 p.m. UTC | #1
On Thu 16-01-20 14:36:01, yu kuai wrote:
> I noticed that the generic/418 test may fail with a small probability. With
> further investigation, it can be reproduced with:
> 
> ./src/dio-invalidate-cache -wp -b 4096 -n 8 -i 1 -f filename
> ./src/dio-invalidate-cache -wt -b 4096 -n 8 -i 1 -f filename
> 
> The failure is that the direct write wrote non-zero data but the buffered
> read got zeros.
> 
> In the process of a buffered read, if the page does not exist, readahead
> will be triggered.  __do_page_cache_readahead() will allocate the page first.
> Next, if the file block is unwritten (or a hole), iomap_begin() will set
> iomap->type to IOMAP_UNWRITTEN (or IOMAP_HOLE). Then, iomap_readpages_actor()
> will add the page to the page cache. Finally, iomap_readpage_actor() will
> zero the page.
> 
> However, there is no lock or other serialization against direct write
> between initializing the iomap and adding the page to the page cache. If a
> direct write happens to finish between the two, the iomap type should have
> been IOMAP_MAPPED instead of IOMAP_UNWRITTEN or IOMAP_HOLE. The page then
> ends up zeroed out in the page cache, while the blocks on disk hold the data
> written by the direct write.
> 
> | thread 1                    | thread 2                   |
> | --------------------------  | -------------------------- |
> | generic_file_buffered_read  |                            |
> |  ondemand_readahead         |                            |
> |   read_pages                |                            |
> |    iomap_readpages          |                            |
> |     iomap_apply             |                            |
> |      xfs_read_iomap_begin   |                            |
> |                             | xfs_file_dio_aio_write     |
> |                             |  iomap_dio_rw              |
> |                             |   iomap_apply              |
> |     iomap_readpages_actor   |                            |
> |      iomap_next_page        |                            |
> |       add_to_page_cache_lru |                            |
> |      iomap_readpage_actor   |                            |
> |       zero_user             |                            |
> |    iomap_set_range_uptodate |                            |
> |                             | generic_file_buffered_read |
> |                             |  copy_page_to_iter        |

Thanks for the report and the patch. But the data integrity when mixing
buffered and direct IO like this is best effort only. We definitely do not
want to sacrifice performance of common cases or code complexity to make
cases like this work reliably.

								Honza

> As a consequence, the content of the page in the page cache is zero while
> the content on disk is not.
> 
> I tried to fix the problem by moving "add to page cache" before
> iomap_begin(). However, performance might be worse since iomap_begin() will
> then be called for each page. I tested the performance of sequential reads
> with fio:
> 
> kernel version: v5.5-rc6
> platform: arm64, 96 cpu
> fio version: fio-3.15-2
> test cmd:
> fio -filename=/mnt/testfile -rw=read -bs=4k -size=20g -direct=0 -fsync=0
> -numjobs=1 / 32 -ioengine=libaio -name=test -ramp_time=10 -runtime=120
> test result:
> |                  | without patch MiB/s | with patch MiB/s |
> | ---------------- | ------------------- | ---------------- |
> | ssd, numjobs=1   | 512                 | 512              |
> | ssd, numjobs=32  | 3615                | 3714             |
> | nvme, numjobs=1  | 1167                | 1118             |
> | nvme, numjobs=32 | 3679                | 3606             |
> 
> The test results show that the impact on performance is minimal.
> 
> Signed-off-by: yu kuai <yukuai3@huawei.com>
> ---
>  fs/iomap/buffered-io.c | 104 ++++++++++++++++++++---------------------
>  1 file changed, 52 insertions(+), 52 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 828444e14d09..ccfa1a52d966 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -329,26 +329,44 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  	return pos - orig_pos + plen;
>  }
>  
> -int
> -iomap_readpage(struct page *page, const struct iomap_ops *ops)
> +static int
> +do_iomap_readpage_apply(
> +	loff_t				offset,
> +	int				flag,
> +	const struct iomap_ops		*ops,
> +	struct iomap_readpage_ctx	*ctx,
> +	iomap_actor_t			actor,
> +	bool				fatal)
>  {
> -	struct iomap_readpage_ctx ctx = { .cur_page = page };
> -	struct inode *inode = page->mapping->host;
> -	unsigned poff;
> -	loff_t ret;
> -
> -	trace_iomap_readpage(page->mapping->host, 1);
> +	unsigned int			poff;
> +	loff_t				ret;
> +	struct page			*page = ctx->cur_page;
> +	struct inode			*inode = page->mapping->host;
>  
>  	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
> -		ret = iomap_apply(inode, page_offset(page) + poff,
> -				PAGE_SIZE - poff, 0, ops, &ctx,
> -				iomap_readpage_actor);
> +		ret = iomap_apply(inode, offset + poff, PAGE_SIZE - poff,
> +				  flag, ops, ctx, actor);
>  		if (ret <= 0) {
>  			WARN_ON_ONCE(ret == 0);
> +			if (fatal)
> +				return ret;
>  			SetPageError(page);
> -			break;
> +			return 0;
>  		}
>  	}
> +	return ret;
> +}
> +
> +
> +int
> +iomap_readpage(struct page *page, const struct iomap_ops *ops)
> +{
> +	struct iomap_readpage_ctx ctx = { .cur_page = page };
> +
> +	trace_iomap_readpage(page->mapping->host, 1);
> +
> +	do_iomap_readpage_apply(page_offset(page), 0, ops, &ctx,
> +				iomap_readpage_actor, false);
>  
>  	if (ctx.bio) {
>  		submit_bio(ctx.bio);
> @@ -395,34 +413,6 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
>  	return NULL;
>  }
>  
> -static loff_t
> -iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
> -		void *data, struct iomap *iomap, struct iomap *srcmap)
> -{
> -	struct iomap_readpage_ctx *ctx = data;
> -	loff_t done, ret;
> -
> -	for (done = 0; done < length; done += ret) {
> -		if (ctx->cur_page && offset_in_page(pos + done) == 0) {
> -			if (!ctx->cur_page_in_bio)
> -				unlock_page(ctx->cur_page);
> -			put_page(ctx->cur_page);
> -			ctx->cur_page = NULL;
> -		}
> -		if (!ctx->cur_page) {
> -			ctx->cur_page = iomap_next_page(inode, ctx->pages,
> -					pos, length, &done);
> -			if (!ctx->cur_page)
> -				break;
> -			ctx->cur_page_in_bio = false;
> -		}
> -		ret = iomap_readpage_actor(inode, pos + done, length - done,
> -				ctx, iomap, srcmap);
> -	}
> -
> -	return done;
> -}
> -
>  int
>  iomap_readpages(struct address_space *mapping, struct list_head *pages,
>  		unsigned nr_pages, const struct iomap_ops *ops)
> @@ -433,22 +423,32 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
>  	};
>  	loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
>  	loff_t last = page_offset(list_entry(pages->next, struct page, lru));
> -	loff_t length = last - pos + PAGE_SIZE, ret = 0;
> +	loff_t length = last - pos + PAGE_SIZE, ret = 0, done;
>  
>  	trace_iomap_readpages(mapping->host, nr_pages);
>  
> -	while (length > 0) {
> -		ret = iomap_apply(mapping->host, pos, length, 0, ops,
> -				&ctx, iomap_readpages_actor);
> +	for (done = 0; done < length; done += PAGE_SIZE) {
> +		if (ctx.cur_page) {
> +			if (!ctx.cur_page_in_bio)
> +				unlock_page(ctx.cur_page);
> +			put_page(ctx.cur_page);
> +			ctx.cur_page = NULL;
> +		}
> +		ctx.cur_page = iomap_next_page(mapping->host, ctx.pages,
> +					       pos, length, &done);
> +		if (!ctx.cur_page)
> +			break;
> +		ctx.cur_page_in_bio = false;
> +
> +		ret = do_iomap_readpage_apply(pos+done, 0, ops, &ctx,
> +					      iomap_readpage_actor, true);
>  		if (ret <= 0) {
> -			WARN_ON_ONCE(ret == 0);
> -			goto done;
> +			done = ret;
> +			break;
>  		}
> -		pos += ret;
> -		length -= ret;
> +
>  	}
> -	ret = 0;
> -done:
> +
>  	if (ctx.bio)
>  		submit_bio(ctx.bio);
>  	if (ctx.cur_page) {
> @@ -461,8 +461,8 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
>  	 * Check that we didn't lose a page due to the arcance calling
>  	 * conventions..
>  	 */
> -	WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
> -	return ret;
> +	WARN_ON_ONCE((done == length) && !list_empty(ctx.pages));
> +	return done;
>  }
>  EXPORT_SYMBOL_GPL(iomap_readpages);
>  
> -- 
> 2.17.2
>
Yu Kuai Jan. 17, 2020, 9:39 a.m. UTC | #2
On 2020/1/16 23:32, Jan Kara wrote:
> Thanks for the report and the patch. But the data integrity when mixing
> buffered and direct IO like this is best effort only. We definitely do not
> want to sacrifice performance of common cases or code complexity to make
> cases like this work reliably.

In the patch, the only thing that is different is that iomap_begin() will
be called for each page. However, it seems the performance of sequential
reads didn't get worse. Is there a specific case where the performance
will get worse?

Thanks!
Yu Kuai
Jan Kara Jan. 17, 2020, 11:05 a.m. UTC | #3
On Fri 17-01-20 17:39:03, yukuai (C) wrote:
> On 2020/1/16 23:32, Jan Kara wrote:
> > Thanks for the report and the patch. But the data integrity when mixing
> > buffered and direct IO like this is best effort only. We definitely do not
> > want to sacrifice performance of common cases or code complexity to make
> > cases like this work reliably.
> 
> In the patch, the only thing that is different is that iomap_begin() will
> be called for each page. However, it seems the performance of sequential
> reads didn't get worse. Is there a specific case where the performance
> will get worse?

Well, one of the big points of iomap infrastructure is that you call
filesystem once to give you large extent instead of calling it to provide
allocation for each page separately. The additional CPU overhead will be
visible if you push the machine hard enough. So IMHO the overhead just is
not worth it for a corner-case like you presented. But that's just my
opinion, Darrick and Christoph are definitive arbiters here...

								Honza
Darrick J. Wong Jan. 17, 2020, 4:24 p.m. UTC | #4
On Fri, Jan 17, 2020 at 12:05:36PM +0100, Jan Kara wrote:
> On Fri 17-01-20 17:39:03, yukuai (C) wrote:
> > On 2020/1/16 23:32, Jan Kara wrote:
> > > Thanks for the report and the patch. But the data integrity when mixing
> > > buffered and direct IO like this is best effort only. We definitely do not
> > > want to sacrifice performance of common cases or code complexity to make
> > > cases like this work reliably.
> > 
> > In the patch, the only thing that is different is that iomap_begin() will
> > be called for each page. However, it seems the performance of sequential
> > reads didn't get worse. Is there a specific case where the performance
> > will get worse?
> 
> Well, one of the big points of iomap infrastructure is that you call
> filesystem once to give you large extent instead of calling it to provide
> allocation for each page separately. The additional CPU overhead will be
> visible if you push the machine hard enough. So IMHO the overhead just is
> not worth it for a corner-case like you presented. But that's just my
> opinion, Darrick and Christoph are definitive arbiters here...

Does the problem go away if you apply[1]?  If I understand the race
correctly, marking the extents unwritten and leaving them that way until
after we've written the disk should eliminate the exposure vector...? :)

[1] https://lore.kernel.org/linux-xfs/157915535059.2406747.264640456606868955.stgit@magnolia/

--D

> 								Honza
> 
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Matthew Wilcox Jan. 18, 2020, 11:08 p.m. UTC | #5
On Thu, Jan 16, 2020 at 02:36:01PM +0800, yu kuai wrote:
> I noticed that the generic/418 test may fail with a small probability. With
> further investigation, it can be reproduced with:
> 
> ./src/dio-invalidate-cache -wp -b 4096 -n 8 -i 1 -f filename
> ./src/dio-invalidate-cache -wt -b 4096 -n 8 -i 1 -f filename
> 
> The failure is that the direct write wrote non-zero data but the buffered
> read got zeros.
> 
> In the process of a buffered read, if the page does not exist, readahead
> will be triggered.  __do_page_cache_readahead() will allocate the page first.
> Next, if the file block is unwritten (or a hole), iomap_begin() will set
> iomap->type to IOMAP_UNWRITTEN (or IOMAP_HOLE). Then, iomap_readpages_actor()
> will add the page to the page cache. Finally, iomap_readpage_actor() will
> zero the page.
> 
> However, there is no lock or other serialization against direct write
> between initializing the iomap and adding the page to the page cache. If a
> direct write happens to finish between the two, the iomap type should have
> been IOMAP_MAPPED instead of IOMAP_UNWRITTEN or IOMAP_HOLE. The page then
> ends up zeroed out in the page cache, while the blocks on disk hold the data
> written by the direct write.

It's worth noting that my patch series from earlier this week to
redesign the readahead API will fix this problem.  Direct write will block
on the locked pages in the page cache.
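
For reference, the direct write side takes roughly this path before issuing
its I/O (a sketch, not the exact code):

	/* iomap_dio_rw(), before submitting the direct I/O */
	filemap_write_and_wait_range(mapping, start, end);
	invalidate_inode_pages2_range(mapping, start >> PAGE_SHIFT,
				      end >> PAGE_SHIFT);
	/*
	 * invalidate_inode_pages2_range() locks each page it finds, so it
	 * waits for pages that readahead has already inserted and locked.
	 */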
Yu Kuai Jan. 19, 2020, 1:17 a.m. UTC | #6
On 2020/1/17 19:05, Jan Kara wrote:
> provide
> allocation for each page separately

Thank you for your response!

I do understand there will be additional CPU overhead. But the page is
allocated in __do_page_cache_readahead(), which is called before
iomap_begin(), and I did not change that.

Thanks!
Yu Kuai
Yu Kuai Jan. 19, 2020, 1:25 a.m. UTC | #7
On 2020/1/18 0:24, Darrick J. Wong wrote:
> Does the problem go away if you apply[1]?  If I understand the race
> correctly, marking the extents unwritten and leaving them that way until
> after we've written the disk should eliminate the exposure vector...?:)

Thank you for your response.

The patch [1] fixed the stale data exposure problem, which is tested by
generic/042. I'm afraid generic/418 is a completely different case.

In generic/418, the direct write wrote some data to disk successfully, while
the buffered read left the corresponding page uptodate in the page cache with
zeroed content.

Thanks!
Yu Kuai
Yu Kuai Jan. 19, 2020, 1:34 a.m. UTC | #8
On 2020/1/19 7:08, Matthew Wilcox wrote:
> It's worth noting that my patch series from earlier this week to
> redesign the readahead API will fix this problem.  Direct write will block
> on the locked pages in the page cache.

Thank you for your response!

In this case, the direct write finishes while the page does not exist in the
page cache. This is the fundamental condition of the race, because readahead
won't allocate a page if the page already exists in the page cache.

By the way, in the current logic, if the page exists in the page cache, the
direct write needs to hold the page lock in invalidate_inode_pages2_range().

Thanks!
Yu Kuai
Matthew Wilcox Jan. 19, 2020, 1:42 a.m. UTC | #9
On Sun, Jan 19, 2020 at 09:34:32AM +0800, yukuai (C) wrote:
> 
> 
> On 2020/1/19 7:08, Matthew Wilcox wrote:
> > It's worth noting that my patch series from earlier this week to
> > redesign the readahead API will fix this problem.  Direct write will block
> > on the locked pages in the page cache.
> 
> Thank you for your response!
> 
> In this case, the direct write finishes while the page does not exist in the
> page cache. This is the fundamental condition of the race, because readahead
> won't allocate a page if the page already exists in the page cache.
>
> By the way, in the current logic, if the page exists in the page cache, the
> direct write needs to hold the page lock in invalidate_inode_pages2_range().

Did you read my patch series?  The current code allocates pages,
but does not put them in the page cache until after iomap is called.
My patch series changes that to put the pages in the page cache as soon
as they're allocated, and before iomap is called.
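
Roughly, the ordering difference looks like this (a heavily simplified
sketch, ignoring the ->readpages/use_list path and all error handling):

	/* current: the page is only allocated here; it becomes visible in
	 * the page cache much later, inside the ->readpages actor */
	page = __page_cache_alloc(gfp_mask);
	list_add(&page->lru, &page_pool);
	...
	read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
	/* -> ->readpages -> iomap_begin(), then add_to_page_cache_lru()
	 * for each page inside the actor */

	/* with the series: the page is in the page cache, locked, before
	 * the filesystem is asked for the mapping */
	page = __page_cache_alloc(gfp_mask);
	add_to_page_cache_lru(page, mapping, page_offset, gfp_mask);
	...
	/* ->readahead -> iomap_begin() runs only after that, so a direct
	 * write sees and blocks on the locked pages */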
Yu Kuai Jan. 19, 2020, 1:57 a.m. UTC | #10
On 2020/1/19 9:42, Matthew Wilcox wrote:
> Did you read my patch series?  The current code allocates pages,
> but does not put them in the page cache until after iomap is called.
> My patch series changes that to put the pages in the page cache as soon
> as they're allocated, and before iomap is called.

The current code that determines whether a page needs to be allocated:

     page = xa_load(&mapping->i_pages, page_offset);
     if (page && !xa_is_value(page)) {
       /*
        * Page already present?  Kick off the current batch of
        * contiguous pages before continuing with the next
        * batch.
        */
       if (nr_pages)
         read_pages(mapping, filp, &page_pool, nr_pages,
             gfp_mask);
       nr_pages = 0;
       continue;
     }

     page = __page_cache_alloc(gfp_mask);
     if (!page)
       break;

A page will be allocated if it does not exist in the page cache, and I
don't see your patch series changing that. Or am I missing something?

Thanks!
Yu Kuai
Yu Kuai Jan. 19, 2020, 2:51 a.m. UTC | #11
On 2020/1/19 9:42, Matthew Wilcox wrote:
> Did you read my patch series?  The current code allocates pages,
> but does not put them in the page cache until after iomap is called.
> My patch series changes that to put the pages in the page cache as soon
> as they're allocated, and before iomap is called.

I just read your patch series again.

First, if you add all pages to the page cache and lock them before
iomap_begin(): I thought about that before, but threw the idea away because
all other operations that lock the page will need to wait for readahead to
finish, and that might cause a performance overhead. And if you instead add
each page to the page cache and call iomap before adding the next page, then
we are facing the same CPU overhead issue.

Then, there might be a problem in your implementation.
if 'use_list' is set to true here:
+	bool use_list = mapping->a_ops->readpages;

Your code does not call add_to_page_cache_lru() for the page.
+		if (use_list) {
+			page->index = page_offset;
+			list_add(&page->lru, &page_pool);
+		} else if (!add_to_page_cache_lru(page, mapping, page_offset,
+					gfp_mask)) {
+			if (nr_pages)
+				read_pages(mapping, filp, &page_pool,
+						page_offset - nr_pages,
+						nr_pages);
+			nr_pages = 0;
+			continue;
+		}

And later, you replace 'iomap_next_page' with 'readahead_page'
+static inline
+struct page *readahead_page(struct address_space *mapping, loff_t pos)
+{
+	struct page *page = xa_load(&mapping->i_pages, pos / PAGE_SIZE);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+	return page;
+}
+

It seems that the page will never be added to the page cache.

Thanks!
Yu Kuai
Gao Xiang Jan. 19, 2020, 3:01 a.m. UTC | #12
On Sun, Jan 19, 2020 at 10:51:37AM +0800, yukuai (C) wrote:
> 
> 
> Then, there might be a problem in your implementation.
> if 'use_list' is set to true here:
> +	bool use_list = mapping->a_ops->readpages;
> 
> Your code does not call add_to_page_cache_lru() for the page.

IMO, if use_list == true, it will call read_pages -> .readpages
and go just like the current implementation.

->readahead just does alloc_page and then add_to_page_cache_lru
right afterwards, and in principle it saves some extra page
allocation.

Thanks,
Gao Xiang

> +		if (use_list) {
> +			page->index = page_offset;
> +			list_add(&page->lru, &page_pool);
> +		} else if (!add_to_page_cache_lru(page, mapping, page_offset,
> +					gfp_mask)) {
> +			if (nr_pages)
> +				read_pages(mapping, filp, &page_pool,
> +						page_offset - nr_pages,
> +						nr_pages);
> +			nr_pages = 0;
> +			continue;
> +		}
>
Yu Kuai Jan. 19, 2020, 3:15 a.m. UTC | #13
On 2020/1/19 11:01, Gao Xiang wrote:
> IMO, if use_list == true, it will call read_pages -> .readpages
> and go just like the current implementation.

The problem is that in the current implementation, iomap_next_page()
adds the page to the page cache, and it is replaced by readahead_page().

Thanks!
Yu Kuai
Matthew Wilcox Jan. 19, 2020, 6:14 a.m. UTC | #14
On Sun, Jan 19, 2020 at 10:51:37AM +0800, yukuai (C) wrote:
> First, if you add all pages to the page cache and lock them before
> iomap_begin(): I thought about that before, but threw the idea away because
> all other operations that lock the page will need to wait for readahead to
> finish, and that might cause a performance overhead. And if you instead add
> each page to the page cache and call iomap before adding the next page, then
> we are facing the same CPU overhead issue.

I don't understand your reasoning here.  If another process wants to
access a page of the file which isn't currently in cache, it would have
to first read the page in from storage.  If it's under readahead, it
has to wait for the read to finish.  Why is the second case worse than
the first?  It seems better to me.

The implementation doesn't call iomap for each page.  It allocates all
the pages and then calls iomap for the range.

> Then, there might be a problem in your implementation.
> if 'use_list' is set to true here:
> +	bool use_list = mapping->a_ops->readpages;
> 
> Your code does not call add_to_page_cache_lru() for the page.

It can't.  The readpages implementation has to call add_to_page_cache_lru.
But for filesystems which use readpage or readahead, we can put the pages
in the page cache before calling readahead.

> And later, you replace 'iomap_next_page' with 'readahead_page'
> +static inline
> +struct page *readahead_page(struct address_space *mapping, loff_t pos)
> +{
> +	struct page *page = xa_load(&mapping->i_pages, pos / PAGE_SIZE);
> +	VM_BUG_ON_PAGE(!PageLocked(page), page);
> +
> +	return page;
> +}
> +
> 
> It seems that the page will never be added to the page cache.

At the same time, the iomap code is switched from ->readpages to
->readahead, so yes, the pages are added to the page cache.
Yu Kuai Jan. 19, 2020, 6:55 a.m. UTC | #15
On 2020/1/19 14:14, Matthew Wilcox wrote:
> I don't understand your reasoning here.  If another process wants to
> access a page of the file which isn't currently in cache, it would have
> to first read the page in from storage.  If it's under readahead, it
> has to wait for the read to finish.  Why is the second case worse than
> the first?  It seems better to me.

Thanks for your response! My worry is that, for example:

We read page 0, and trigger readahead to read n pages (0 to n-1), while in
another thread, we read page n-1.

In the current implementation, if readahead is in the process of reading
pages 0 to n-2, the later operation doesn't need to wait for the former one
to finish. However, the later operation will have to wait if we add all
pages to the page cache first. And that is why I said it might cause a
performance overhead.

> At the same time, the iomap code is switched from ->readpages to
> ->readahead, so yes, the pages are added to the page cache.

Yes, it's not a problem in your implementation.

Thanks!
Yu Kuai
Matthew Wilcox Jan. 19, 2020, 7:58 a.m. UTC | #16
On Sun, Jan 19, 2020 at 02:55:14PM +0800, yukuai (C) wrote:
> On 2020/1/19 14:14, Matthew Wilcox wrote:
> > I don't understand your reasoning here.  If another process wants to
> > access a page of the file which isn't currently in cache, it would have
> > to first read the page in from storage.  If it's under readahead, it
> > has to wait for the read to finish.  Why is the second case worse than
> > the second?  It seems better to me.
> 
> Thanks for your response! My worry is that, for example:
>
> We read page 0, and trigger readahead to read n pages (0 to n-1), while in
> another thread, we read page n-1.
>
> In the current implementation, if readahead is in the process of reading
> pages 0 to n-2, the later operation doesn't need to wait for the former one
> to finish. However, the later operation will have to wait if we add all
> pages to the page cache first. And that is why I said it might cause a
> performance overhead.

OK, but let's put some numbers on that.  Imagine that we're using high
performance spinning rust so we have an access latency of 5ms (200
IOPS), we're accessing 20 consecutive pages which happen to have their
data contiguous on disk.  Our CPU is running at 2GHz and takes about
100,000 cycles to submit an I/O, plus 1,000 cycles to add an extra page
to the I/O.

Current implementation: Allocate 20 pages, place 19 of them in the cache,
fail to place the last one in the cache.  The later thread actually gets
to jump the queue and submit its bio first.  Its latency will be 100,000
cycles (20us) plus the 5ms access time.  But it only has 20,000 cycles
(4us) to hit this race, or it will end up behaving the same way as below.

New implementation: Allocate 20 pages, place them all in the cache,
then takes 120,000 cycles to build & submit the I/O, and wait 5ms for
the I/O to complete.

But look how much more likely it is that it'll hit during the window
where we're waiting for the I/O to complete -- 5ms is 1250 times longer
than 4us.

If it _does_ get the latency benefit of jumping the queue, the readahead
will create one or two I/Os.  If it hit page 18 instead of page 19, we'd
end up doing three I/Os; the first for page 18, then one for pages 0-17,
and one for page 19.  And that means the disk is going to be busy for
15ms, delaying the next I/O for up to 10ms.  It's actually beneficial in
the long term for the second thread to wait for the readahead to finish.

Oh, and the current ->readpages code has a race where if the page tagged
with PageReadahead ends up not being inserted, we'll lose that bit,
which means the readahead will just stop and have to restart (because
it will look to the readahead code like it's not being effective).
That's a far worse performance problem.
Yu Kuai Jan. 19, 2020, 11:21 a.m. UTC | #17
On 2020/1/19 15:58, Matthew Wilcox wrote:
> On Sun, Jan 19, 2020 at 02:55:14PM +0800, yukuai (C) wrote:
>> On 2020/1/19 14:14, Matthew Wilcox wrote:
>>> I don't understand your reasoning here.  If another process wants to
>>> access a page of the file which isn't currently in cache, it would have
>>> to first read the page in from storage.  If it's under readahead, it
>>> has to wait for the read to finish.  Why is the second case worse than
>>> the first?  It seems better to me.
>>
>> Thanks for your response! My worry is that, for example:
>>
>> We read page 0, and trigger readahead to read n pages (0 to n-1), while in
>> another thread, we read page n-1.
>>
>> In the current implementation, if readahead is in the process of reading
>> pages 0 to n-2, the later operation doesn't need to wait for the former one
>> to finish. However, the later operation will have to wait if we add all
>> pages to the page cache first. And that is why I said it might cause a
>> performance overhead.
> 
> OK, but let's put some numbers on that.  Imagine that we're using high
> performance spinning rust so we have an access latency of 5ms (200
> IOPS), we're accessing 20 consecutive pages which happen to have their
> data contiguous on disk.  Our CPU is running at 2GHz and takes about
> 100,000 cycles to submit an I/O, plus 1,000 cycles to add an extra page
> to the I/O.
> 
> Current implementation: Allocate 20 pages, place 19 of them in the cache,
> fail to place the last one in the cache.  The later thread actually gets
> to jump the queue and submit its bio first.  Its latency will be 100,000
> cycles (20us) plus the 5ms access time.  But it only has 20,000 cycles
> (4us) to hit this race, or it will end up behaving the same way as below.
> 
> New implementation: Allocate 20 pages, place them all in the cache,
> then takes 120,000 cycles to build & submit the I/O, and wait 5ms for
> the I/O to complete.
> 
> But look how much more likely it is that it'll hit during the window
> where we're waiting for the I/O to complete -- 5ms is 1250 times longer
> than 4us.
> 
> If it _does_ get the latency benefit of jumping the queue, the readahead
> will create one or two I/Os.  If it hit page 18 instead of page 19, we'd
> end up doing three I/Os; the first for page 18, then one for pages 0-17,
> and one for page 19.  And that means the disk is going to be busy for
> 15ms, delaying the next I/O for up to 10ms.  It's actually beneficial in
> the long term for the second thread to wait for the readahead to finish.
> 

Thank you very much for your detailed explanation; I was blinded by my
one-sided view. And I do agree that your patch series is a better
solution for the problem.

Yu Kuai
Jan Kara Jan. 20, 2020, 11:42 a.m. UTC | #18
On Sun 19-01-20 09:17:00, yukuai (C) wrote:
> 
> 
> On 2020/1/17 19:05, Jan Kara wrote:
> > provide
> > allocation for each page separately
> 
> Thank you for your response!
> 
> I do understand there will be additional CPU overhead. But the page is
> allocated in __do_page_cache_readahead(), which is called before
> iomap_begin(), and I did not change that.

Sorry, I didn't express myself clearly. In "...one of the big points of iomap
infrastructure is that you call filesystem once to give you large extent
instead of calling it to provide allocation for each page separately." I
meant that with your patch, you call into filesystem to provide "block
allocation information" for each page separately. And it seems we both
agree this is going to cause additional CPU usage in the common case to
improve mostly unsupported corner case.
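
To put it concretely, the call pattern changes roughly like this (condensed
from the loops in the patch, not literal code):

	/* before: one iomap_apply() call can cover many pages of an extent */
	while (length > 0) {
		ret = iomap_apply(mapping->host, pos, length, 0, ops,
				&ctx, iomap_readpages_actor);
		pos += ret;
		length -= ret;
	}

	/* after: at least one iomap_apply() call per page, via
	 * do_iomap_readpage_apply() */
	for (done = 0; done < length; done += PAGE_SIZE)
		do_iomap_readpage_apply(pos + done, 0, ops, &ctx,
					iomap_readpage_actor, true);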

								Honza

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 828444e14d09..ccfa1a52d966 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -329,26 +329,44 @@  iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	return pos - orig_pos + plen;
 }
 
-int
-iomap_readpage(struct page *page, const struct iomap_ops *ops)
+static int
+do_iomap_readpage_apply(
+	loff_t				offset,
+	int				flag,
+	const struct iomap_ops		*ops,
+	struct iomap_readpage_ctx	*ctx,
+	iomap_actor_t			actor,
+	bool				fatal)
 {
-	struct iomap_readpage_ctx ctx = { .cur_page = page };
-	struct inode *inode = page->mapping->host;
-	unsigned poff;
-	loff_t ret;
-
-	trace_iomap_readpage(page->mapping->host, 1);
+	unsigned int			poff;
+	loff_t				ret;
+	struct page			*page = ctx->cur_page;
+	struct inode			*inode = page->mapping->host;
 
 	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
-		ret = iomap_apply(inode, page_offset(page) + poff,
-				PAGE_SIZE - poff, 0, ops, &ctx,
-				iomap_readpage_actor);
+		ret = iomap_apply(inode, offset + poff, PAGE_SIZE - poff,
+				  flag, ops, ctx, actor);
 		if (ret <= 0) {
 			WARN_ON_ONCE(ret == 0);
+			if (fatal)
+				return ret;
 			SetPageError(page);
-			break;
+			return 0;
 		}
 	}
+	return ret;
+}
+
+
+int
+iomap_readpage(struct page *page, const struct iomap_ops *ops)
+{
+	struct iomap_readpage_ctx ctx = { .cur_page = page };
+
+	trace_iomap_readpage(page->mapping->host, 1);
+
+	do_iomap_readpage_apply(page_offset(page), 0, ops, &ctx,
+				iomap_readpage_actor, false);
 
 	if (ctx.bio) {
 		submit_bio(ctx.bio);
@@ -395,34 +413,6 @@  iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
 	return NULL;
 }
 
-static loff_t
-iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
-		void *data, struct iomap *iomap, struct iomap *srcmap)
-{
-	struct iomap_readpage_ctx *ctx = data;
-	loff_t done, ret;
-
-	for (done = 0; done < length; done += ret) {
-		if (ctx->cur_page && offset_in_page(pos + done) == 0) {
-			if (!ctx->cur_page_in_bio)
-				unlock_page(ctx->cur_page);
-			put_page(ctx->cur_page);
-			ctx->cur_page = NULL;
-		}
-		if (!ctx->cur_page) {
-			ctx->cur_page = iomap_next_page(inode, ctx->pages,
-					pos, length, &done);
-			if (!ctx->cur_page)
-				break;
-			ctx->cur_page_in_bio = false;
-		}
-		ret = iomap_readpage_actor(inode, pos + done, length - done,
-				ctx, iomap, srcmap);
-	}
-
-	return done;
-}
-
 int
 iomap_readpages(struct address_space *mapping, struct list_head *pages,
 		unsigned nr_pages, const struct iomap_ops *ops)
@@ -433,22 +423,32 @@  iomap_readpages(struct address_space *mapping, struct list_head *pages,
 	};
 	loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
 	loff_t last = page_offset(list_entry(pages->next, struct page, lru));
-	loff_t length = last - pos + PAGE_SIZE, ret = 0;
+	loff_t length = last - pos + PAGE_SIZE, ret = 0, done;
 
 	trace_iomap_readpages(mapping->host, nr_pages);
 
-	while (length > 0) {
-		ret = iomap_apply(mapping->host, pos, length, 0, ops,
-				&ctx, iomap_readpages_actor);
+	for (done = 0; done < length; done += PAGE_SIZE) {
+		if (ctx.cur_page) {
+			if (!ctx.cur_page_in_bio)
+				unlock_page(ctx.cur_page);
+			put_page(ctx.cur_page);
+			ctx.cur_page = NULL;
+		}
+		ctx.cur_page = iomap_next_page(mapping->host, ctx.pages,
+					       pos, length, &done);
+		if (!ctx.cur_page)
+			break;
+		ctx.cur_page_in_bio = false;
+
+		ret = do_iomap_readpage_apply(pos+done, 0, ops, &ctx,
+					      iomap_readpage_actor, true);
 		if (ret <= 0) {
-			WARN_ON_ONCE(ret == 0);
-			goto done;
+			done = ret;
+			break;
 		}
-		pos += ret;
-		length -= ret;
+
 	}
-	ret = 0;
-done:
+
 	if (ctx.bio)
 		submit_bio(ctx.bio);
 	if (ctx.cur_page) {
@@ -461,8 +461,8 @@  iomap_readpages(struct address_space *mapping, struct list_head *pages,
 	 * Check that we didn't lose a page due to the arcance calling
 	 * conventions..
 	 */
-	WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
-	return ret;
+	WARN_ON_ONCE((done == length) && !list_empty(ctx.pages));
+	return done;
 }
 EXPORT_SYMBOL_GPL(iomap_readpages);