[v3,5/5] try_to_unmap_one: batched remove rmap, update folio refcount

Message ID 20230306092259.3507807-6-fengwei.yin@intel.com (mailing list archive)
State New
Series batched remove rmap in try_to_unmap_one()

Commit Message

Yin Fengwei March 6, 2023, 9:22 a.m. UTC
If unmapping one page fails, or the vma walk will skip the next pte,
or the vma walk will end on the next pte, do a batched rmap removal
and update the folio refcount.

Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
 include/linux/rmap.h |  1 +
 mm/page_vma_mapped.c | 30 +++++++++++++++++++++++++++
 mm/rmap.c            | 48 ++++++++++++++++++++++++++++++++++----------
 3 files changed, 68 insertions(+), 11 deletions(-)
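
In short, the change turns the per-pte rmap removal in try_to_unmap_one()
into a per-batch one. Below is a minimal sketch of the resulting loop, using
only names from the patch, with the pvmw setup, TTU flag handling and error
details elided; it is a reading aid for the thread, not compilable code:

	while (page_vma_mapped_walk(&pvmw)) {
		/* Remember the first subpage of the current batch. */
		if (!start)
			start = folio_page(folio,
					pte_pfn(*pvmw.pte) - folio_pfn(folio));

		if (!try_to_unmap_one_page(folio, vma, range, pvmw,
					   address, flags)) {
			/* Flush whatever was batched, then bail out. */
			folio_remove_rmap_and_update_count(folio, start, vma, count);
			page_vma_mapped_walk_done(&pvmw);
			break;
		}
		count++;

		/*
		 * Flush while the pte lock is still held, i.e. before the
		 * walk would move on to a pte it skips or stops at.
		 */
		if (pvmw_walk_skip_or_end_on_next(&pvmw)) {
			folio_remove_rmap_and_update_count(folio, start, vma, count);
			count = 0;
			start = NULL;
		}
	}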

Comments

haoxin March 6, 2023, 12:39 p.m. UTC | #1
On 2023/3/6 5:22 PM, Yin Fengwei wrote:
> If unmapping one page fails, or the vma walk will skip the next pte,
> or the vma walk will end on the next pte, do a batched rmap removal
> and update the folio refcount.
>
> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
> ---
>   include/linux/rmap.h |  1 +
>   mm/page_vma_mapped.c | 30 +++++++++++++++++++++++++++
>   mm/rmap.c            | 48 ++++++++++++++++++++++++++++++++++----------
>   3 files changed, 68 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index d2569b42e21a..18193d1d5a8e 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -424,6 +424,7 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
>   }
>   
>   bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
> +bool pvmw_walk_skip_or_end_on_next(struct page_vma_mapped_walk *pvmw);
>   
>   /*
>    * Used by swapoff to help locate where page is expected in vma.
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index 4e448cfbc6ef..19e997dfb5c6 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -291,6 +291,36 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>   	return false;
>   }
>   
> +/**
> + * pvmw_walk_skip_or_end_on_next - check if next pte will be skipped or
> + *                                 end the walk
> + * @pvmw: pointer to struct page_vma_mapped_walk.
> + *
> + * This function can only be called with the correct pte lock held
> + */
> +bool pvmw_walk_skip_or_end_on_next(struct page_vma_mapped_walk *pvmw)
> +{
> +       unsigned long address = pvmw->address + PAGE_SIZE;
> +
> +       if (address >= vma_address_end(pvmw))
> +               return true;

If vma_address_end is exactly equal to the next address (pvmw->address +
PAGE_SIZE), does this mean we skip unmapping the last page here? If so,
could this just use '>' instead: 'if (address > vma_address_end(pvmw))'?

I may have misunderstood, please correct me.

> +
> +       if ((address & (PMD_SIZE - PAGE_SIZE)) == 0)
> +               return true;
> +
> +       if (pte_none(*pvmw->pte))
> +               return true;
> +
> +       pvmw->pte++;
> +       if (!check_pte(pvmw)) {
> +               pvmw->pte--;
> +               return true;
> +       }
> +       pvmw->pte--;
> +
> +       return false;
> +}
> +
>   /**
>    * page_mapped_in_vma - check whether a page is really mapped in a VMA
>    * @page: the page to test
> diff --git a/mm/rmap.c b/mm/rmap.c
> index bb3fcb8df579..a64e9cbb52dd 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1741,6 +1741,26 @@ static bool try_to_unmap_one_page(struct folio *folio,
>   	return false;
>   }
>   
> +static void folio_remove_rmap_and_update_count(struct folio *folio,
> +		struct page *start, struct vm_area_struct *vma, int count)
> +{
> +	if (count == 0)
> +		return;
> +
> +	/*
> +	 * No need to call mmu_notifier_invalidate_range() as it has been
> +	 * done above for all cases requiring it to happen under page
> +	 * table lock before mmu_notifier_invalidate_range_end()
> +	 *
> +	 * See Documentation/mm/mmu_notifier.rst
> +	 */
> +	folio_remove_rmap_range(folio, start, count, vma,
> +					folio_test_hugetlb(folio));
> +	if (vma->vm_flags & VM_LOCKED)
> +		mlock_drain_local();
> +	folio_ref_sub(folio, count);
> +}
> +
>   /*
>    * @arg: enum ttu_flags will be passed to this argument
>    */
> @@ -1748,10 +1768,11 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>   		     unsigned long address, void *arg)
>   {
>   	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
> -	struct page *subpage;
> +	struct page *start = NULL;
>   	bool ret = true;
>   	struct mmu_notifier_range range;
>   	enum ttu_flags flags = (enum ttu_flags)(long)arg;
> +	int count = 0;
>   
>   	/*
>   	 * When racing against e.g. zap_pte_range() on another cpu,
> @@ -1812,26 +1833,31 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>   			break;
>   		}
>   
> -		subpage = folio_page(folio,
> +		if (!start)
> +			start = folio_page(folio,
>   					pte_pfn(*pvmw.pte) - folio_pfn(folio));
>   		ret = try_to_unmap_one_page(folio, vma,
>   						range, pvmw, address, flags);
>   		if (!ret) {
> +			folio_remove_rmap_and_update_count(folio,
> +							start, vma, count);
>   			page_vma_mapped_walk_done(&pvmw);
>   			break;
>   		}
> +		count++;
>   
>   		/*
> -		 * No need to call mmu_notifier_invalidate_range() it has be
> -		 * done above for all cases requiring it to happen under page
> -		 * table lock before mmu_notifier_invalidate_range_end()
> -		 *
> -		 * See Documentation/mm/mmu_notifier.rst
> +		 * If the next pte will be skipped in page_vma_mapped_walk() or
> +		 * the walk will end at it, do a batched rmap removal and update
> +		 * the page refcount. We can't do it after page_vma_mapped_walk()
> +		 * returns false because the pte lock will no longer be held.
>   		 */
> -		page_remove_rmap(subpage, vma, false);
> -		if (vma->vm_flags & VM_LOCKED)
> -			mlock_drain_local();
> -		folio_put(folio);
> +		if (pvmw_walk_skip_or_end_on_next(&pvmw)) {
> +			folio_remove_rmap_and_update_count(folio,
> +							start, vma, count);
> +			count = 0;
> +			start = NULL;
> +		}
>   	}
>   
>   	mmu_notifier_invalidate_range_end(&range);
Yin Fengwei March 7, 2023, 2:45 a.m. UTC | #2
On Mon, 2023-03-06 at 20:39 +0800, haoxin wrote:
> 
> On 2023/3/6 5:22 PM, Yin Fengwei wrote:
> > If unmapping one page fails, or the vma walk will skip the next pte,
> > or the vma walk will end on the next pte, do a batched rmap removal
> > and update the folio refcount.
> > 
> > Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
> > ---
> >   include/linux/rmap.h |  1 +
> >   mm/page_vma_mapped.c | 30 +++++++++++++++++++++++++++
> >   mm/rmap.c            | 48 ++++++++++++++++++++++++++++++++++-----
> > -----
> >   3 files changed, 68 insertions(+), 11 deletions(-)
> > 
> > diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> > index d2569b42e21a..18193d1d5a8e 100644
> > --- a/include/linux/rmap.h
> > +++ b/include/linux/rmap.h
> > @@ -424,6 +424,7 @@ static inline void
> > page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
> >   }
> >   
> >   bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
> > +bool pvmw_walk_skip_or_end_on_next(struct page_vma_mapped_walk
> > *pvmw);
> >   
> >   /*
> >    * Used by swapoff to help locate where page is expected in vma.
> > diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> > index 4e448cfbc6ef..19e997dfb5c6 100644
> > --- a/mm/page_vma_mapped.c
> > +++ b/mm/page_vma_mapped.c
> > @@ -291,6 +291,36 @@ bool page_vma_mapped_walk(struct
> > page_vma_mapped_walk *pvmw)
> >         return false;
> >   }
> >   
> > +/**
> > + * pvmw_walk_skip_or_end_on_next - check if next pte will be
> > skipped or
> > + *                                 end the walk
> > + * @pvmw: pointer to struct page_vma_mapped_walk.
> > + *
> > + * This function can only be called with correct pte lock hold
> > + */
> > +bool pvmw_walk_skip_or_end_on_next(struct page_vma_mapped_walk
> > *pvmw)
> > +{
> > +       unsigned long address = pvmw->address + PAGE_SIZE;
> > +
> > +       if (address >= vma_address_end(pvmw))
> > +               return true;
> 
> If vma_address_end is exactly equal to the next address (pvmw->address +
> PAGE_SIZE), does this mean we skip unmapping the last page here? If so,
> could this just use '>' instead: 'if (address > vma_address_end(pvmw))'?
This check looks at the next PTE and is done after the current (possibly
last) PTE has already been handled, so the last page is not skipped;
returning true here only flushes the batch. Thanks.


Regards
Yin, Fengwei

> 
> I may have misunderstood, please correct me.
> 
> > +
> > +       if ((address & (PMD_SIZE - PAGE_SIZE)) == 0)
> > +               return true;
> > +
> > +       if (pte_none(*pvmw->pte))
> > +               return true;
> > +
> > +       pvmw->pte++;
> > +       if (!check_pte(pvmw)) {
> > +               pvmw->pte--;
> > +               return true;
> > +       }
> > +       pvmw->pte--;
> > +
> > +       return false;
> > +}
> > +
> >   /**
> >    * page_mapped_in_vma - check whether a page is really mapped in
> > a VMA
> >    * @page: the page to test
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index bb3fcb8df579..a64e9cbb52dd 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1741,6 +1741,26 @@ static bool try_to_unmap_one_page(struct
> > folio *folio,
> >         return false;
> >   }
> >   
> > +static void folio_remove_rmap_and_update_count(struct folio
> > *folio,
> > +               struct page *start, struct vm_area_struct *vma, int
> > count)
> > +{
> > +       if (count == 0)
> > +               return;
> > +
> > +       /*
> > +        * No need to call mmu_notifier_invalidate_range() it has
> > be
> > +        * done above for all cases requiring it to happen under
> > page
> > +        * table lock before mmu_notifier_invalidate_range_end()
> > +        *
> > +        * See Documentation/mm/mmu_notifier.rst
> > +        */
> > +       folio_remove_rmap_range(folio, start, count, vma,
> > +                                       folio_test_hugetlb(folio));
> > +       if (vma->vm_flags & VM_LOCKED)
> > +               mlock_drain_local();
> > +       folio_ref_sub(folio, count);
> > +}
> > +
> >   /*
> >    * @arg: enum ttu_flags will be passed to this argument
> >    */
> > @@ -1748,10 +1768,11 @@ static bool try_to_unmap_one(struct folio
> > *folio, struct vm_area_struct *vma,
> >                      unsigned long address, void *arg)
> >   {
> >         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
> > -       struct page *subpage;
> > +       struct page *start = NULL;
> >         bool ret = true;
> >         struct mmu_notifier_range range;
> >         enum ttu_flags flags = (enum ttu_flags)(long)arg;
> > +       int count = 0;
> >   
> >         /*
> >          * When racing against e.g. zap_pte_range() on another cpu,
> > @@ -1812,26 +1833,31 @@ static bool try_to_unmap_one(struct folio
> > *folio, struct vm_area_struct *vma,
> >                         break;
> >                 }
> >   
> > -               subpage = folio_page(folio,
> > +               if (!start)
> > +                       start = folio_page(folio,
> >                                         pte_pfn(*pvmw.pte) -
> > folio_pfn(folio));
> >                 ret = try_to_unmap_one_page(folio, vma,
> >                                                 range, pvmw,
> > address, flags);
> >                 if (!ret) {
> > +                       folio_remove_rmap_and_update_count(folio,
> > +                                                       start, vma,
> > count);
> >                         page_vma_mapped_walk_done(&pvmw);
> >                         break;
> >                 }
> > +               count++;
> >   
> >                 /*
> > -                * No need to call mmu_notifier_invalidate_range()
> > it has be
> > -                * done above for all cases requiring it to happen
> > under page
> > -                * table lock before
> > mmu_notifier_invalidate_range_end()
> > -                *
> > -                * See Documentation/mm/mmu_notifier.rst
> > +                * If next pte will be skipped in
> > page_vma_mapped_walk() or
> > +                * the walk will end at it, batched remove rmap and
> > update
> > +                * page refcount. We can't do it after
> > page_vma_mapped_walk()
> > +                * return false because the pte lock will not be
> > hold.
> >                  */
> > -               page_remove_rmap(subpage, vma, false);
> > -               if (vma->vm_flags & VM_LOCKED)
> > -                       mlock_drain_local();
> > -               folio_put(folio);
> > +               if (pvmw_walk_skip_or_end_on_next(&pvmw)) {
> > +                       folio_remove_rmap_and_update_count(folio,
> > +                                                       start, vma,
> > count);
> > +                       count = 0;
> > +                       start = NULL;
> > +               }
> >         }
> >   
> >         mmu_notifier_invalidate_range_end(&range);
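
To illustrate the two address checks discussed above: the '>=' comparison
runs only after the current pte has already been handled, so reaching
vma_address_end merely flushes the batch; the second check flushes whenever
the next address is the first page of a pmd, because pvmw->pte points into
the current pmd's page table and cannot simply be incremented across it.
A small stand-alone sketch of that mask arithmetic, using typical x86-64
4K-page constants and a made-up helper name, none of which come from the
patch:

	#include <stdio.h>
	#include <stdbool.h>

	/* Illustrative values (x86-64, 4K pages); not taken from the patch. */
	#define PAGE_SIZE 0x1000UL	/* 4 KiB */
	#define PMD_SIZE  0x200000UL	/* 2 MiB: one page table worth of ptes */

	/*
	 * True when address is the first page of a pmd, i.e. its pte lives
	 * in a different page table than the pte of the preceding page.
	 */
	static bool first_page_of_pmd(unsigned long address)
	{
		return (address & (PMD_SIZE - PAGE_SIZE)) == 0;
	}

	int main(void)
	{
		unsigned long last = PMD_SIZE - PAGE_SIZE;	/* 0x1ff000 */

		printf("%d\n", first_page_of_pmd(last));		/* 0: same pmd */
		printf("%d\n", first_page_of_pmd(last + PAGE_SIZE));	/* 1: next pmd */
		return 0;
	}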

Patch

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d2569b42e21a..18193d1d5a8e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -424,6 +424,7 @@  static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
 }
 
 bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
+bool pvmw_walk_skip_or_end_on_next(struct page_vma_mapped_walk *pvmw);
 
 /*
  * Used by swapoff to help locate where page is expected in vma.
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 4e448cfbc6ef..19e997dfb5c6 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -291,6 +291,36 @@  bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 	return false;
 }
 
+/**
+ * pvmw_walk_skip_or_end_on_next - check if next pte will be skipped or
+ *                                 end the walk
+ * @pvmw: pointer to struct page_vma_mapped_walk.
+ *
+ * This function can only be called with the correct pte lock held
+ */
+bool pvmw_walk_skip_or_end_on_next(struct page_vma_mapped_walk *pvmw)
+{
+       unsigned long address = pvmw->address + PAGE_SIZE;
+
+       if (address >= vma_address_end(pvmw))
+               return true;
+
+       if ((address & (PMD_SIZE - PAGE_SIZE)) == 0)
+               return true;
+
+       if (pte_none(*pvmw->pte))
+               return true;
+
+       pvmw->pte++;
+       if (!check_pte(pvmw)) {
+               pvmw->pte--;
+               return true;
+       }
+       pvmw->pte--;
+
+       return false;
+}
+
 /**
  * page_mapped_in_vma - check whether a page is really mapped in a VMA
  * @page: the page to test
diff --git a/mm/rmap.c b/mm/rmap.c
index bb3fcb8df579..a64e9cbb52dd 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1741,6 +1741,26 @@  static bool try_to_unmap_one_page(struct folio *folio,
 	return false;
 }
 
+static void folio_remove_rmap_and_update_count(struct folio *folio,
+		struct page *start, struct vm_area_struct *vma, int count)
+{
+	if (count == 0)
+		return;
+
+	/*
+	 * No need to call mmu_notifier_invalidate_range() as it has been
+	 * done above for all cases requiring it to happen under page
+	 * table lock before mmu_notifier_invalidate_range_end()
+	 *
+	 * See Documentation/mm/mmu_notifier.rst
+	 */
+	folio_remove_rmap_range(folio, start, count, vma,
+					folio_test_hugetlb(folio));
+	if (vma->vm_flags & VM_LOCKED)
+		mlock_drain_local();
+	folio_ref_sub(folio, count);
+}
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
@@ -1748,10 +1768,11 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 		     unsigned long address, void *arg)
 {
 	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
-	struct page *subpage;
+	struct page *start = NULL;
 	bool ret = true;
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
+	int count = 0;
 
 	/*
 	 * When racing against e.g. zap_pte_range() on another cpu,
@@ -1812,26 +1833,31 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			break;
 		}
 
-		subpage = folio_page(folio,
+		if (!start)
+			start = folio_page(folio,
 					pte_pfn(*pvmw.pte) - folio_pfn(folio));
 		ret = try_to_unmap_one_page(folio, vma,
 						range, pvmw, address, flags);
 		if (!ret) {
+			folio_remove_rmap_and_update_count(folio,
+							start, vma, count);
 			page_vma_mapped_walk_done(&pvmw);
 			break;
 		}
+		count++;
 
 		/*
-		 * No need to call mmu_notifier_invalidate_range() it has be
-		 * done above for all cases requiring it to happen under page
-		 * table lock before mmu_notifier_invalidate_range_end()
-		 *
-		 * See Documentation/mm/mmu_notifier.rst
+		 * If the next pte will be skipped in page_vma_mapped_walk() or
+		 * the walk will end at it, do a batched rmap removal and update
+		 * the page refcount. We can't do it after page_vma_mapped_walk()
+		 * returns false because the pte lock will no longer be held.
 		 */
-		page_remove_rmap(subpage, vma, false);
-		if (vma->vm_flags & VM_LOCKED)
-			mlock_drain_local();
-		folio_put(folio);
+		if (pvmw_walk_skip_or_end_on_next(&pvmw)) {
+			folio_remove_rmap_and_update_count(folio,
+							start, vma, count);
+			count = 0;
+			start = NULL;
+		}
 	}
 
 	mmu_notifier_invalidate_range_end(&range);
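
For comparison, the per-page teardown that the new helper replaces, side by
side; this is just a condensed reading of the hunks above, not code added by
the patch:

	/* Before: once per pte, while the pte lock is held. */
	page_remove_rmap(subpage, vma, false);
	if (vma->vm_flags & VM_LOCKED)
		mlock_drain_local();
	folio_put(folio);

	/* After: once per batch of 'count' consecutive ptes. */
	folio_remove_rmap_range(folio, start, count, vma,
				folio_test_hugetlb(folio));
	if (vma->vm_flags & VM_LOCKED)
		mlock_drain_local();
	folio_ref_sub(folio, count);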