thp: Simplify splitting PMD mapping huge zero page

Message ID	20200327170353.17734-1-kirill.shutemov@linux.intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=rYxz=5M=kvack.org=owner-linux-mm@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 76F2920838 From: "Kirill A. Shutemov" <kirill@shutemov.name> To: akpm@linux-foundation.org Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Subject: [PATCH] thp: Simplify splitting PMD mapping huge zero page Date: Fri, 27 Mar 2020 20:03:53 +0300 Message-Id: <20200327170353.17734-1-kirill.shutemov@linux.intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	thp: Simplify splitting PMD mapping huge zero page \| expand thp: Simplify splitting PMD mapping huge zero page

Message ID

20200327170353.17734-1-kirill.shutemov@linux.intel.com (mailing list archive)

State

New, archived

Headers

DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 76F2920838
From: "Kirill A. Shutemov" <kirill@shutemov.name>
To: akpm@linux-foundation.org
Cc: linux-mm@kvack.org,
	linux-kernel@vger.kernel.org,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: [PATCH] thp: Simplify splitting PMD mapping huge zero page
Date: Fri, 27 Mar 2020 20:03:53 +0300
Message-Id: <20200327170353.17734-1-kirill.shutemov@linux.intel.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Sender: owner-linux-mm@kvack.org
Precedence: bulk

Series

thp: Simplify splitting PMD mapping huge zero page | expand

Commit Message

Kirill A. Shutemov March 27, 2020, 5:03 p.m. UTC

Splitting a PMD that maps the huge zero page can be simplified a lot: we
can just unmap it and fall back to PTE handling.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/huge_memory.c | 57 ++++--------------------------------------------
 1 file changed, 4 insertions(+), 53 deletions(-)

Comments

Zi Yan March 27, 2020, 5:23 p.m. UTC | #1

On 27 Mar 2020, at 13:03, Kirill A. Shutemov wrote:

> Splitting PMD mapping huge zero page can be simplified a lot: we can
> just unmap it and fallback to PTE handling.

So we will have an extra page fault for the first read to each subpage, but nothing changes if the first access to a subpage is a write, right? BTW, what is the motivation for this code simplification?

Thanks.

>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/huge_memory.c | 57 ++++--------------------------------------------
>  1 file changed, 4 insertions(+), 53 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 42407e16bd80..ef6a6bcb291f 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2114,40 +2114,6 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
>  }
>  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>
> -static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
> -		unsigned long haddr, pmd_t *pmd)
> -{
> -	struct mm_struct *mm = vma->vm_mm;
> -	pgtable_t pgtable;
> -	pmd_t _pmd;
> -	int i;
> -
> -	/*
> -	 * Leave pmd empty until pte is filled note that it is fine to delay
> -	 * notification until mmu_notifier_invalidate_range_end() as we are
> -	 * replacing a zero pmd write protected page with a zero pte write
> -	 * protected page.
> -	 *
> -	 * See Documentation/vm/mmu_notifier.rst
> -	 */
> -	pmdp_huge_clear_flush(vma, haddr, pmd);
> -
> -	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
> -	pmd_populate(mm, &_pmd, pgtable);
> -
> -	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> -		pte_t *pte, entry;
> -		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
> -		entry = pte_mkspecial(entry);
> -		pte = pte_offset_map(&_pmd, haddr);
> -		VM_BUG_ON(!pte_none(*pte));
> -		set_pte_at(mm, haddr, pte, entry);
> -		pte_unmap(pte);
> -	}
> -	smp_wmb(); /* make pte visible before pmd */
> -	pmd_populate(mm, pmd, pgtable);
> -}
> -
>  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		unsigned long haddr, bool freeze)
>  {
> @@ -2167,7 +2133,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>
>  	count_vm_event(THP_SPLIT_PMD);
>
> -	if (!vma_is_anonymous(vma)) {
> +	if (!vma_is_anonymous(vma) || is_huge_zero_pmd(*pmd)) {
>  		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
>  		/*
>  		 * We are going to unmap this huge page. So
> @@ -2175,7 +2141,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		 */
>  		if (arch_needs_pgtable_deposit())
>  			zap_deposited_table(mm, pmd);
> -		if (vma_is_dax(vma))
> +		if (vma_is_dax(vma) || is_huge_zero_pmd(*pmd))
>  			return;
>  		page = pmd_page(_pmd);
>  		if (!PageDirty(page) && pmd_dirty(_pmd))
> @@ -2186,17 +2152,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		put_page(page);
>  		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
>  		return;
> -	} else if (is_huge_zero_pmd(*pmd)) {
> -		/*
> -		 * FIXME: Do we want to invalidate secondary mmu by calling
> -		 * mmu_notifier_invalidate_range() see comments below inside
> -		 * __split_huge_pmd() ?
> -		 *
> -		 * We are going from a zero huge page write protected to zero
> -		 * small page also write protected so it does not seems useful
> -		 * to invalidate secondary mmu at this time.
> -		 */
> -		return __split_huge_zero_page_pmd(vma, haddr, pmd);
>  	}
>
>  	/*
> @@ -2339,13 +2294,9 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  	spin_unlock(ptl);
>  	/*
>  	 * No need to double call mmu_notifier->invalidate_range() callback.
> -	 * They are 3 cases to consider inside __split_huge_pmd_locked():
> +	 * They are 2 cases to consider inside __split_huge_pmd_locked():
>  	 *  1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
> -	 *  2) __split_huge_zero_page_pmd() read only zero page and any write
> -	 *    fault will trigger a flush_notify before pointing to a new page
> -	 *    (it is fine if the secondary mmu keeps pointing to the old zero
> -	 *    page in the meantime)
> -	 *  3) Split a huge pmd into pte pointing to the same page. No need
> +	 *  2) Split a huge pmd into pte pointing to the same page. No need
>  	 *     to invalidate secondary tlb entry they are all still valid.
>  	 *     any further changes to individual pte will notify. So no need
>  	 *     to call mmu_notifier->invalidate_range()
> -- 
> 2.26.0


—
Best Regards,
Yan Zi

Kirill A. Shutemov March 28, 2020, 12:19 a.m. UTC | #2

On Fri, Mar 27, 2020 at 01:23:07PM -0400, Zi Yan wrote:
> On 27 Mar 2020, at 13:03, Kirill A. Shutemov wrote:
> 
> > Splitting PMD mapping huge zero page can be simplified a lot: we can
> > just unmap it and fallback to PTE handling.
> 
> So we will have an extra page fault for the first read to each subpage,
> but nothing changes if the first access to a subpage is a write, right?
> BTW, what is the motivation for this code simplification?

Match what we do for file-THP.

I found a problem with the patch. Ignore it for now.

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 42407e16bd80..ef6a6bcb291f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2114,40 +2114,6 @@  void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
-		unsigned long haddr, pmd_t *pmd)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	pgtable_t pgtable;
-	pmd_t _pmd;
-	int i;
-
-	/*
-	 * Leave pmd empty until pte is filled note that it is fine to delay
-	 * notification until mmu_notifier_invalidate_range_end() as we are
-	 * replacing a zero pmd write protected page with a zero pte write
-	 * protected page.
-	 *
-	 * See Documentation/vm/mmu_notifier.rst
-	 */
-	pmdp_huge_clear_flush(vma, haddr, pmd);
-
-	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
-	pmd_populate(mm, &_pmd, pgtable);
-
-	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-		pte_t *pte, entry;
-		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
-		entry = pte_mkspecial(entry);
-		pte = pte_offset_map(&_pmd, haddr);
-		VM_BUG_ON(!pte_none(*pte));
-		set_pte_at(mm, haddr, pte, entry);
-		pte_unmap(pte);
-	}
-	smp_wmb(); /* make pte visible before pmd */
-	pmd_populate(mm, pmd, pgtable);
-}
-
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long haddr, bool freeze)
 {
@@ -2167,7 +2133,7 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	count_vm_event(THP_SPLIT_PMD);
 
-	if (!vma_is_anonymous(vma)) {
+	if (!vma_is_anonymous(vma) || is_huge_zero_pmd(*pmd)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 		/*
 		 * We are going to unmap this huge page. So
@@ -2175,7 +2141,7 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		 */
 		if (arch_needs_pgtable_deposit())
 			zap_deposited_table(mm, pmd);
-		if (vma_is_dax(vma))
+		if (vma_is_dax(vma) || is_huge_zero_pmd(*pmd))
 			return;
 		page = pmd_page(_pmd);
 		if (!PageDirty(page) && pmd_dirty(_pmd))
@@ -2186,17 +2152,6 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		put_page(page);
 		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
 		return;
-	} else if (is_huge_zero_pmd(*pmd)) {
-		/*
-		 * FIXME: Do we want to invalidate secondary mmu by calling
-		 * mmu_notifier_invalidate_range() see comments below inside
-		 * __split_huge_pmd() ?
-		 *
-		 * We are going from a zero huge page write protected to zero
-		 * small page also write protected so it does not seems useful
-		 * to invalidate secondary mmu at this time.
-		 */
-		return __split_huge_zero_page_pmd(vma, haddr, pmd);
 	}
 
 	/*
@@ -2339,13 +2294,9 @@  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	spin_unlock(ptl);
 	/*
 	 * No need to double call mmu_notifier->invalidate_range() callback.
-	 * They are 3 cases to consider inside __split_huge_pmd_locked():
+	 * They are 2 cases to consider inside __split_huge_pmd_locked():
 	 *  1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
-	 *  2) __split_huge_zero_page_pmd() read only zero page and any write
-	 *    fault will trigger a flush_notify before pointing to a new page
-	 *    (it is fine if the secondary mmu keeps pointing to the old zero
-	 *    page in the meantime)
-	 *  3) Split a huge pmd into pte pointing to the same page. No need
+	 *  2) Split a huge pmd into pte pointing to the same page. No need
 	 *     to invalidate secondary tlb entry they are all still valid.
 	 *     any further changes to individual pte will notify. So no need
 	 *     to call mmu_notifier->invalidate_range()

thp: Simplify splitting PMD mapping huge zero page

Commit Message

Comments

Patch