
[4/4] mm/migration: fix potential pte_unmap on a not mapped pte

Message ID 20220409073846.22286-5-linmiaohe@huawei.com (mailing list archive)
State New
Series A few cleanup and fixup patches for migration

Commit Message

Miaohe Lin April 9, 2022, 7:38 a.m. UTC
__migration_entry_wait and migration_entry_wait_on_locked assume the pte is
always mapped by the caller. But this is not the case when they are called
from migration_entry_wait_huge and follow_huge_pmd. Add an unmap parameter
to indicate whether the pte needs to be unmapped, to fix this issue.
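
For reference, a minimal sketch of how the two kinds of callers differ
(simplified from the call sites in the diff below; pte_unmap() must only be
paired with a pointer returned by pte_offset_map(), which e.g. with
CONFIG_HIGHPTE undoes a kmap):

	/* migration_entry_wait(): ptep really is mapped, an unmap is owed. */
	pte_t *ptep = pte_offset_map(pmd, address);
	__migration_entry_wait(mm, ptep, ptl, true);

	/*
	 * migration_entry_wait_huge()/follow_huge_pmd(): "pte" is a huge pte
	 * (a pmd cast to pte_t *) that was never mapped via pte_offset_map(),
	 * so no pte_unmap() may be issued on it.
	 */
	__migration_entry_wait(mm, (pte_t *)pmd, ptl, false);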

Fixes: 30dad30922cc ("mm: migration: add migrate_entry_wait_huge()")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
---
 include/linux/migrate.h |  2 +-
 include/linux/swapops.h |  4 ++--
 mm/filemap.c            | 10 +++++-----
 mm/hugetlb.c            |  2 +-
 mm/migrate.c            | 14 ++++++++------
 5 files changed, 17 insertions(+), 15 deletions(-)

Comments

kernel test robot April 9, 2022, 11:38 a.m. UTC | #1
Hi Miaohe,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on hnaz-mm/master]
[also build test ERROR on linus/master v5.18-rc1 next-20220408]
[cannot apply to linux/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Miaohe-Lin/A-few-cleanup-and-fixup-patches-for-migration/20220409-154028
base:   https://github.com/hnaz/linux-mm master
config: powerpc64-randconfig-r015-20220408 (https://download.01.org/0day-ci/archive/20220409/202204091914.psVO4TrI-lkp@intel.com/config)
compiler: powerpc64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/73a982570cc62313c55cc5cbc2dd7acf40601474
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Miaohe-Lin/A-few-cleanup-and-fixup-patches-for-migration/20220409-154028
        git checkout 73a982570cc62313c55cc5cbc2dd7acf40601474
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/mm/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   arch/powerpc/mm/hugetlbpage.c: In function 'follow_huge_pd':
>> arch/powerpc/mm/hugetlbpage.c:537:25: error: too few arguments to function '__migration_entry_wait'
     537 |                         __migration_entry_wait(mm, ptep, ptl);
         |                         ^~~~~~~~~~~~~~~~~~~~~~
   In file included from arch/powerpc/mm/hugetlbpage.c:20:
   include/linux/swapops.h:233:13: note: declared here
     233 | extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
         |             ^~~~~~~~~~~~~~~~~~~~~~


vim +/__migration_entry_wait +537 arch/powerpc/mm/hugetlbpage.c

^1da177e4c3f41 arch/ppc64/mm/hugetlbpage.c   Linus Torvalds   2005-04-16  507  
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  508  struct page *follow_huge_pd(struct vm_area_struct *vma,
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  509  			    unsigned long address, hugepd_t hpd,
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  510  			    int flags, int pdshift)
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  511  {
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  512  	pte_t *ptep;
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  513  	spinlock_t *ptl;
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  514  	struct page *page = NULL;
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  515  	unsigned long mask;
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  516  	int shift = hugepd_shift(hpd);
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  517  	struct mm_struct *mm = vma->vm_mm;
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  518  
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  519  retry:
ed515b6898c367 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2018-06-01  520  	/*
ed515b6898c367 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2018-06-01  521  	 * hugepage directory entries are protected by mm->page_table_lock
ed515b6898c367 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2018-06-01  522  	 * Use this instead of huge_pte_lockptr
ed515b6898c367 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2018-06-01  523  	 */
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  524  	ptl = &mm->page_table_lock;
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  525  	spin_lock(ptl);
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  526  
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  527  	ptep = hugepte_offset(hpd, address, pdshift);
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  528  	if (pte_present(*ptep)) {
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  529  		mask = (1UL << shift) - 1;
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  530  		page = pte_page(*ptep);
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  531  		page += ((address & mask) >> PAGE_SHIFT);
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  532  		if (flags & FOLL_GET)
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  533  			get_page(page);
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  534  	} else {
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  535  		if (is_hugetlb_entry_migration(*ptep)) {
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  536  			spin_unlock(ptl);
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06 @537  			__migration_entry_wait(mm, ptep, ptl);
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  538  			goto retry;
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  539  		}
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  540  	}
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  541  	spin_unlock(ptl);
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  542  	return page;
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  543  }
50791e6de0b5f2 arch/powerpc/mm/hugetlbpage.c Aneesh Kumar K.V 2017-07-06  544
Miaohe Lin April 11, 2022, 1:54 a.m. UTC | #2
On 2022/4/9 19:38, kernel test robot wrote:
> Hi Miaohe,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on hnaz-mm/master]
> [also build test ERROR on linus/master v5.18-rc1 next-20220408]
> [cannot apply to linux/master]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Miaohe-Lin/A-few-cleanup-and-fixup-patches-for-migration/20220409-154028
> base:   https://github.com/hnaz/linux-mm master
> config: powerpc64-randconfig-r015-20220408 (https://download.01.org/0day-ci/archive/20220409/202204091914.psVO4TrI-lkp@intel.com/config)
> compiler: powerpc64-linux-gcc (GCC) 11.2.0
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # https://github.com/intel-lab-lkp/linux/commit/73a982570cc62313c55cc5cbc2dd7acf40601474
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Miaohe-Lin/A-few-cleanup-and-fixup-patches-for-migration/20220409-154028
>         git checkout 73a982570cc62313c55cc5cbc2dd7acf40601474
>         # save the config file to linux build tree
>         mkdir build_dir
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/mm/
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>

Oh, powerpc variant is left undone. Will fix it. Thanks!
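
The powerpc side would presumably just pass the new flag at the call site the
robot flagged; an untested sketch against this patch's signature (hugepd
entries are not mapped via pte_offset_map(), so unmap would be false):

--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -534,7 +534,7 @@ struct page *follow_huge_pd(struct vm_area_struct *vma,
 	} else {
 		if (is_hugetlb_entry_migration(*ptep)) {
 			spin_unlock(ptl);
-			__migration_entry_wait(mm, ptep, ptl);
+			__migration_entry_wait(mm, ptep, ptl, false);
 			goto retry;
 		}
 	}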

David Hildenbrand April 11, 2022, 11:41 a.m. UTC | #3
On 09.04.22 09:38, Miaohe Lin wrote:
> __migration_entry_wait and migration_entry_wait_on_locked assume the pte is
> always mapped by the caller. But this is not the case when they are called
> from migration_entry_wait_huge and follow_huge_pmd. Add an unmap parameter
> to indicate whether the pte needs to be unmapped, to fix this issue.

Hm.


migration_entry_wait_on_locked documents

"@ptep: mapped pte pointer. Will return with the ptep unmapped. Only
required for pte entries, pass NULL for pmd entries."

Setting ptep implies that we have a *mapped pte* pointer that requires unmap.
If some code sets that although that's not guaranteed, that calling code
is wrong and needs to be fixed to not pass a ptep.


hugetlbfs never requires a map/unmap. I really don't see why there is a need to
adjust migration_entry_wait_on_locked(): just don't pass a ptep, as documented.

What's really nasty here is that hugetlbfs actually mostly works on PMD/PUD,
but we call them PTEs. One corner case might be CONT PTEs, but they are also
accessed without a map+unmap.

Regarding __migration_entry_wait(), I think we should just stop using it for
hugetlbfs and have a proper hugetlbfs variant that calls
hugetlb_migration_entry_wait(ptep == NULL), and knows that although we're
handling ptes, we're usually not actually holding ptes in our hands
that need a map+unmap.


Something like the following (including some cleanup of the mm parameter):


diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 32d517a28969..898c407ad8f7 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -234,8 +234,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 					spinlock_t *ptl);
 extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					unsigned long address);
-extern void migration_entry_wait_huge(struct vm_area_struct *vma,
-		struct mm_struct *mm, pte_t *pte);
+extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
+extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
 #else
 static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
 {
@@ -261,8 +261,9 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 					spinlock_t *ptl) { }
 static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					 unsigned long address) { }
+static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
 static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
-		struct mm_struct *mm, pte_t *pte) { }
+					     pte_t *pte) { }
 static inline int is_writable_migration_entry(swp_entry_t entry)
 {
 	return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 48740e6c3476..2b38eaaa2e60 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5622,7 +5622,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
-			migration_entry_wait_huge(vma, mm, ptep);
+			migration_entry_wait_huge(vma, ptep);
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
@@ -6770,7 +6770,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	} else {
 		if (is_hugetlb_entry_migration(pte)) {
 			spin_unlock(ptl);
-			__migration_entry_wait(mm, (pte_t *)pmd, ptl);
+			__migration_entry_wait_huge((pte_t *)pmd, ptl);
 			goto retry;
 		}
 		/*
diff --git a/mm/migrate.c b/mm/migrate.c
index 231907e89b93..84b685a235fe 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -315,11 +315,24 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 	__migration_entry_wait(mm, ptep, ptl);
 }
 
+void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
+{
+	pte_t pte;
+
+	spin_lock(ptl);
+	pte = huge_ptep_get(ptep);
+
+	if (unlikely(!is_hugetlb_entry_migration(pte)))
+		spin_unlock(ptl);
+	else
+		migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
+}
+
-void migration_entry_wait_huge(struct vm_area_struct *vma,
-		struct mm_struct *mm, pte_t *pte)
+void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
 {
-	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
-	__migration_entry_wait(mm, pte, ptl);
+	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
+
+	__migration_entry_wait_huge(pte, ptl);
 }
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
Miaohe Lin April 12, 2022, 2:55 a.m. UTC | #4
On 2022/4/11 19:41, David Hildenbrand wrote:
> On 09.04.22 09:38, Miaohe Lin wrote:
>> __migration_entry_wait and migration_entry_wait_on_locked assume the pte is
>> always mapped by the caller. But this is not the case when they are called
>> from migration_entry_wait_huge and follow_huge_pmd. Add an unmap parameter
>> to indicate whether the pte needs to be unmapped, to fix this issue.
> 
> Hm.
> 
> 
> migration_entry_wait_on_locked documents
> 
> "@ptep: mapped pte pointer. Will return with the ptep unmapped. Only
> required for pte entries, pass NULL for pmd entries."
> 
> Setting ptep implies that we have a *mapped pte* pointer that requires unmap.
> If some code sets that although that's not guaranteed, that calling code
> is wrong and needs to be fixed to not pass a ptep.
> 
> 
> hugetlbfs never requires a map/unmap. I really don't see why there is a need to
> adjust migration_entry_wait_on_locked(): just don't pass a ptep, as documented.
> 
> What's really nasty here is that hugetlbfs actually mostly works on PMD/PUD,
> but we call them PTEs. One corner case might be CONT PTEs, but they are also
> accessed without a map+unmap.
> 
> Regarding __migration_entry_wait(), I think we should just stop using it for
> hugetlbfs and have a proper hugetlbfs variant that calls
> hugetlb_migration_entry_wait(ptep == NULL), and knows that although we're
> handling ptes, we're usually not actually holding ptes in our hands
> that need a map+unmap.
> 
> 
> Something like the following (including some cleanup of the mm parameter):

This really helps! Many thanks! Will try to do this in next version. :)


Patch

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 2707bfd43a0d..2c4de1972f99 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -41,7 +41,7 @@  extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 extern int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page, int extra_count);
 void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
-				spinlock_t *ptl);
+				spinlock_t *ptl, bool unmap);
 void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
 void folio_migrate_copy(struct folio *newfolio, struct folio *folio);
 int folio_migrate_mapping(struct address_space *mapping,
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 32d517a28969..3e6a293f88e0 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -231,7 +231,7 @@  static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
 }
 
 extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
-					spinlock_t *ptl);
+					spinlock_t *ptl, bool unmap);
 extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					unsigned long address);
 extern void migration_entry_wait_huge(struct vm_area_struct *vma,
@@ -258,7 +258,7 @@  static inline int is_migration_entry(swp_entry_t swp)
 }
 
 static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
-					spinlock_t *ptl) { }
+					spinlock_t *ptl, bool unmap) { }
 static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					 unsigned long address) { }
 static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
diff --git a/mm/filemap.c b/mm/filemap.c
index 3a5ffb5587cd..02f2d920c8cf 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1392,6 +1392,7 @@  static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
  * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required
  *        for pte entries, pass NULL for pmd entries.
  * @ptl: already locked ptl. This function will drop the lock.
+ * @unmap: indicates whether ptep needs to be unmapped.
  *
  * Wait for a migration entry referencing the given page to be removed. This is
  * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
@@ -1405,7 +1406,7 @@  static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
  * there.
  */
 void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
-				spinlock_t *ptl)
+				spinlock_t *ptl, bool unmap)
 {
 	struct wait_page_queue wait_page;
 	wait_queue_entry_t *wait = &wait_page.wait;
@@ -1442,10 +1443,9 @@  void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
 	 * a valid reference to the page, and it must take the ptl to remove the
 	 * migration entry. So the page is valid until the ptl is dropped.
 	 */
-	if (ptep)
-		pte_unmap_unlock(ptep, ptl);
-	else
-		spin_unlock(ptl);
+	spin_unlock(ptl);
+	if (unmap && ptep)
+		pte_unmap(ptep);
 
 	for (;;) {
 		unsigned int flags;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fb5a549169ce..3fc61a437c2a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6778,7 +6778,7 @@  follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	} else {
 		if (is_hugetlb_entry_migration(pte)) {
 			spin_unlock(ptl);
-			__migration_entry_wait(mm, (pte_t *)pmd, ptl);
+			__migration_entry_wait(mm, (pte_t *)pmd, ptl, false);
 			goto retry;
 		}
 		/*
diff --git a/mm/migrate.c b/mm/migrate.c
index 044656a14ae2..0bdf27fbc45b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -287,7 +287,7 @@  void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
  * When we return from this function the fault will be retried.
  */
 void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
-				spinlock_t *ptl)
+				spinlock_t *ptl, bool unmap)
 {
 	pte_t pte;
 	swp_entry_t entry;
@@ -301,10 +301,12 @@  void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 	if (!is_migration_entry(entry))
 		goto out;
 
-	migration_entry_wait_on_locked(entry, ptep, ptl);
+	migration_entry_wait_on_locked(entry, ptep, ptl, unmap);
 	return;
 out:
-	pte_unmap_unlock(ptep, ptl);
+	spin_unlock(ptl);
+	if (unmap)
+		pte_unmap(ptep);
 }
 
 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
@@ -312,14 +314,14 @@  void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 {
 	spinlock_t *ptl = pte_lockptr(mm, pmd);
 	pte_t *ptep = pte_offset_map(pmd, address);
-	__migration_entry_wait(mm, ptep, ptl);
+	__migration_entry_wait(mm, ptep, ptl, true);
 }
 
 void migration_entry_wait_huge(struct vm_area_struct *vma,
 		struct mm_struct *mm, pte_t *pte)
 {
 	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
-	__migration_entry_wait(mm, pte, ptl);
+	__migration_entry_wait(mm, pte, ptl, false);
 }
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -330,7 +332,7 @@  void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
 	ptl = pmd_lock(mm, pmd);
 	if (!is_pmd_migration_entry(*pmd))
 		goto unlock;
-	migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
+	migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl, false);
 	return;
 unlock:
 	spin_unlock(ptl);