diff mbox series

[v2] mm: fix long time stall from mm_populate

Message ID 20200303002638.206421-1-minchan@kernel.org (mailing list archive)
State New, archived
Headers show
Series [v2] mm: fix long time stall from mm_populate | expand

Commit Message

Minchan Kim March 3, 2020, 12:26 a.m. UTC
Basically, fault handler releases mmap_sem before requesting readahead
and then it is supposed to retry lookup the page from page cache with
FAULT_FLAG_TRIED so that it avoids the live lock of infinite retry.

However, what happens if the fault handler find a page from page
cache and the page has readahead marker but are waiting under
writeback? Plus one more condition, it happens under mm_populate
which repeats faulting unless it encounters error. So let's assemble
conditions below.

       CPU 1                                                        CPU 2

- first loop
    mm_populate
     for ()
       ..
       ret = populate_vma_page_range
         __get_user_pages
           faultin_page
             handle_mm_fault
               filemap_fault
                 do_async_mmap_readahead
                   if (PageReadahead(pageA))
                     maybe_unlock_mmap_for_io
                       up_read(mmap_sem)
					                    shrink_page_list
                                                              pageout
                                                                SetPageReclaim(=SetPageReadahead)(pageA)
                                                                writepage
                                                                  SetPageWriteback(pageA)

                     page_cache_async_readahead()
		       ClearPageReadahead(pageA)
                 do_async_mmap_readahead
		 lock_page_maybe_drop_mmap
		   goto out_retry

					                    the pageA is reclaimed
							    and new pageB is populated to the file offset
							    and finally has become PG_readahead

- second loop

	  __get_user_pages
           faultin_page
             handle_mm_fault
               filemap_fault
                 do_async_mmap_readahead
                   if (PageReadahead(pageB))
                     maybe_unlock_mmap_for_io
                       up_read(mmap_sem)
					                    shrink_page_list
                                                              pageout
                                                                SetPageReclaim(=SetPageReadahead)(pageB)
                                                                writepage
                                                                  SetPageWriteback(pageB)

                     page_cache_async_readahead()
		       ClearPageReadahead(pageB)
                 do_async_mmap_readahead
		 lock_page_maybe_drop_mmap
		   goto out_retry

It could be repeated forever so it's livelock. Without involving reclaim,
it could happens if ra_pages become zero by fadvise/other threads who
have same fd one doing randome while the other one is sequential
because page_cache_async_readahead has following condition check like
PageWriteback and ra_pages are never synchrnized with fadvise and
shrink_readahead_size_eio from other threads.

page_cache_async_readahead(struct address_space *mapping,
                           unsigned long req_size)
{
        /* no read-ahead */
        if (!ra->ra_pages)
                return;

Thus, we need to limit fault retry from mm_populate like page
fault handler.

Fixes: 6b4c9f446981 ("filemap: drop the mmap_sem for all blocking operations")
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/gup.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

Comments

Matthew Wilcox (Oracle) March 3, 2020, 1:22 a.m. UTC | #1
On Mon, Mar 02, 2020 at 04:26:38PM -0800, Minchan Kim wrote:
> @@ -1196,6 +1196,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
>  	struct vm_area_struct *vma = NULL;
>  	int locked = 0;
>  	long ret = 0;
> +	bool tried = false;

How about ...

	int *lockedp = &locked;

>  
>  	end = start + len;
>  
> @@ -1226,14 +1227,18 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
>  		 * double checks the vma flags, so that it won't mlock pages
>  		 * if the vma was already munlocked.
>  		 */
> -		ret = populate_vma_page_range(vma, nstart, nend, &locked);
> +		ret = populate_vma_page_range(vma, nstart, nend,
> +						tried ? NULL : &locked);

		ret = populate_vma_page_range(vma, nstart, nend, lockedp);

>  		if (ret < 0) {
>  			if (ignore_errors) {
>  				ret = 0;
>  				continue;	/* continue at next VMA */
>  			}
>  			break;
> -		}
> +		} else if (ret == 0)
> +			tried = true;
> +		else
> +			tried = false;

		} else if (ret == 0)
			lockedp = NULL;

Maybe there's a better name than lockedp.
Minchan Kim March 3, 2020, 9:18 p.m. UTC | #2
On Mon, Mar 02, 2020 at 05:22:03PM -0800, Matthew Wilcox wrote:
> On Mon, Mar 02, 2020 at 04:26:38PM -0800, Minchan Kim wrote:
> > @@ -1196,6 +1196,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
> >  	struct vm_area_struct *vma = NULL;
> >  	int locked = 0;
> >  	long ret = 0;
> > +	bool tried = false;
> 
> How about ...
> 
> 	int *lockedp = &locked;
> 
> >  
> >  	end = start + len;
> >  
> > @@ -1226,14 +1227,18 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
> >  		 * double checks the vma flags, so that it won't mlock pages
> >  		 * if the vma was already munlocked.
> >  		 */
> > -		ret = populate_vma_page_range(vma, nstart, nend, &locked);
> > +		ret = populate_vma_page_range(vma, nstart, nend,
> > +						tried ? NULL : &locked);
> 
> 		ret = populate_vma_page_range(vma, nstart, nend, lockedp);
> 
> >  		if (ret < 0) {
> >  			if (ignore_errors) {
> >  				ret = 0;
> >  				continue;	/* continue at next VMA */
> >  			}
> >  			break;
> > -		}
> > +		} else if (ret == 0)
> > +			tried = true;
> > +		else
> > +			tried = false;
> 
> 		} else if (ret == 0)
> 			lockedp = NULL;
> 
> Maybe there's a better name than lockedp.

Thanks for the review, Matthew.

It changes the behavior from mine in that it never set lockedp as "non-NULL"
since it had retried so that we lose fault retrying optimization for
successive addresses

I understand the code is not pretty there so that hard to follow it but
I am not sure the your suggestion makes it better.
Minchan Kim March 13, 2020, 9:14 p.m. UTC | #3
Hi Andrew,

Any chance to take a look?

On Mon, Mar 02, 2020 at 04:26:38PM -0800, Minchan Kim wrote:
> Basically, fault handler releases mmap_sem before requesting readahead
> and then it is supposed to retry lookup the page from page cache with
> FAULT_FLAG_TRIED so that it avoids the live lock of infinite retry.
> 
> However, what happens if the fault handler find a page from page
> cache and the page has readahead marker but are waiting under
> writeback? Plus one more condition, it happens under mm_populate
> which repeats faulting unless it encounters error. So let's assemble
> conditions below.
> 
>        CPU 1                                                        CPU 2
> 
> - first loop
>     mm_populate
>      for ()
>        ..
>        ret = populate_vma_page_range
>          __get_user_pages
>            faultin_page
>              handle_mm_fault
>                filemap_fault
>                  do_async_mmap_readahead
>                    if (PageReadahead(pageA))
>                      maybe_unlock_mmap_for_io
>                        up_read(mmap_sem)
> 					                    shrink_page_list
>                                                               pageout
>                                                                 SetPageReclaim(=SetPageReadahead)(pageA)
>                                                                 writepage
>                                                                   SetPageWriteback(pageA)
> 
>                      page_cache_async_readahead()
> 		       ClearPageReadahead(pageA)
>                  do_async_mmap_readahead
> 		 lock_page_maybe_drop_mmap
> 		   goto out_retry
> 
> 					                    the pageA is reclaimed
> 							    and new pageB is populated to the file offset
> 							    and finally has become PG_readahead
> 
> - second loop
> 
> 	  __get_user_pages
>            faultin_page
>              handle_mm_fault
>                filemap_fault
>                  do_async_mmap_readahead
>                    if (PageReadahead(pageB))
>                      maybe_unlock_mmap_for_io
>                        up_read(mmap_sem)
> 					                    shrink_page_list
>                                                               pageout
>                                                                 SetPageReclaim(=SetPageReadahead)(pageB)
>                                                                 writepage
>                                                                   SetPageWriteback(pageB)
> 
>                      page_cache_async_readahead()
> 		       ClearPageReadahead(pageB)
>                  do_async_mmap_readahead
> 		 lock_page_maybe_drop_mmap
> 		   goto out_retry
> 
> It could be repeated forever so it's livelock. Without involving reclaim,
> it could happens if ra_pages become zero by fadvise/other threads who
> have same fd one doing randome while the other one is sequential
> because page_cache_async_readahead has following condition check like
> PageWriteback and ra_pages are never synchrnized with fadvise and
> shrink_readahead_size_eio from other threads.
> 
> page_cache_async_readahead(struct address_space *mapping,
>                            unsigned long req_size)
> {
>         /* no read-ahead */
>         if (!ra->ra_pages)
>                 return;
> 
> Thus, we need to limit fault retry from mm_populate like page
> fault handler.
> 
> Fixes: 6b4c9f446981 ("filemap: drop the mmap_sem for all blocking operations")
> Reviewed-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  mm/gup.c | 11 ++++++++---
>  1 file changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/gup.c b/mm/gup.c
> index 1b521e0ac1de..6f6548c63ad5 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1133,7 +1133,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
>   *
>   * This takes care of mlocking the pages too if VM_LOCKED is set.
>   *
> - * return 0 on success, negative error code on error.
> + * return number of pages pinned on success, negative error code on error.
>   *
>   * vma->vm_mm->mmap_sem must be held.
>   *
> @@ -1196,6 +1196,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
>  	struct vm_area_struct *vma = NULL;
>  	int locked = 0;
>  	long ret = 0;
> +	bool tried = false;
>  
>  	end = start + len;
>  
> @@ -1226,14 +1227,18 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
>  		 * double checks the vma flags, so that it won't mlock pages
>  		 * if the vma was already munlocked.
>  		 */
> -		ret = populate_vma_page_range(vma, nstart, nend, &locked);
> +		ret = populate_vma_page_range(vma, nstart, nend,
> +						tried ? NULL : &locked);
>  		if (ret < 0) {
>  			if (ignore_errors) {
>  				ret = 0;
>  				continue;	/* continue at next VMA */
>  			}
>  			break;
> -		}
> +		} else if (ret == 0)
> +			tried = true;
> +		else
> +			tried = false;
>  		nend = nstart + ret * PAGE_SIZE;
>  		ret = 0;
>  	}
> -- 
> 2.25.0.265.gbab2e86ba0-goog
>
diff mbox series

Patch

diff --git a/mm/gup.c b/mm/gup.c
index 1b521e0ac1de..6f6548c63ad5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1133,7 +1133,7 @@  static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
  *
  * This takes care of mlocking the pages too if VM_LOCKED is set.
  *
- * return 0 on success, negative error code on error.
+ * return number of pages pinned on success, negative error code on error.
  *
  * vma->vm_mm->mmap_sem must be held.
  *
@@ -1196,6 +1196,7 @@  int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
 	struct vm_area_struct *vma = NULL;
 	int locked = 0;
 	long ret = 0;
+	bool tried = false;
 
 	end = start + len;
 
@@ -1226,14 +1227,18 @@  int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
 		 * double checks the vma flags, so that it won't mlock pages
 		 * if the vma was already munlocked.
 		 */
-		ret = populate_vma_page_range(vma, nstart, nend, &locked);
+		ret = populate_vma_page_range(vma, nstart, nend,
+						tried ? NULL : &locked);
 		if (ret < 0) {
 			if (ignore_errors) {
 				ret = 0;
 				continue;	/* continue at next VMA */
 			}
 			break;
-		}
+		} else if (ret == 0)
+			tried = true;
+		else
+			tried = false;
 		nend = nstart + ret * PAGE_SIZE;
 		ret = 0;
 	}