
[v1,1/7] mm: khugepaged: retract_page_tables() use pte_offset_map_lock()

Message ID 258de4356bdcc01bce0ff1f6c29b2b64a4211494.1729157502.git.zhengqi.arch@bytedance.com
State New
Series synchronously scan and reclaim empty user PTE pages

Commit Message

Qi Zheng Oct. 17, 2024, 9:47 a.m. UTC
In retract_page_tables(), we may modify the pmd entry after acquiring the
pml and ptl, so we should also check whether the pmd entry is stable.
Use pte_offset_map_lock() to do this check; that also lets us remove the
call to pte_lockptr().

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 mm/khugepaged.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
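
For readers less familiar with the helper, the pattern the commit message
relies on looks roughly like the sketch below. This is an illustrative
fragment, not part of the patch: example_walk_one_pte() is a made-up name,
while pte_offset_map_lock() and pte_unmap_unlock() are the real kernel
helpers.

	/*
	 * pte_offset_map_lock() rechecks the pmd entry and returns NULL if it
	 * no longer points to a stable PTE table; on success it returns the
	 * mapped PTE with the PTE lock (ptl) already held, so the caller does
	 * not need a separate pte_lockptr() lookup.
	 */
	static bool example_walk_one_pte(struct mm_struct *mm, pmd_t *pmd,
					 unsigned long addr)
	{
		spinlock_t *ptl;
		pte_t *pte;

		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
		if (!pte)
			return false;	/* pmd changed under us: skip or retry */

		/* ... inspect or modify *pte while ptl is held ... */

		pte_unmap_unlock(pte, ptl);	/* drops ptl and unmaps the PTE page */
		return true;
	}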

Comments

Jann Horn Oct. 17, 2024, 6 p.m. UTC | #1
On Thu, Oct 17, 2024 at 11:47 AM Qi Zheng <zhengqi.arch@bytedance.com> wrote:
> In retract_page_tables(), we may modify the pmd entry after acquiring the
> pml and ptl, so we should also check whether the pmd entry is stable.
> Use pte_offset_map_lock() to do this check; that also lets us remove the
> call to pte_lockptr().
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
>  mm/khugepaged.c | 9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 94feb85ce996c..b4f49d323c8d9 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1721,6 +1721,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>                 spinlock_t *pml;
>                 spinlock_t *ptl;
>                 bool skipped_uffd = false;
> +               pte_t *pte;
>
>                 /*
>                  * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
> @@ -1757,9 +1758,15 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>                 mmu_notifier_invalidate_range_start(&range);
>
>                 pml = pmd_lock(mm, pmd);
> -               ptl = pte_lockptr(mm, pmd);
> +               pte = pte_offset_map_lock(mm, pmd, addr, &ptl);

This takes the lock "ptl" on the success path...

> +               if (!pte) {
> +                       spin_unlock(pml);
> +                       mmu_notifier_invalidate_range_end(&range);
> +                       continue;
> +               }
>                 if (ptl != pml)
>                         spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

... and this takes the same lock again, right? I think this will
deadlock on kernels with CONFIG_SPLIT_PTE_PTLOCKS=y. Did you test this
on a machine with less than 4 CPU cores, or something like that? Or am
I missing something?
Qi Zheng Oct. 18, 2024, 2:15 a.m. UTC | #2
On 2024/10/18 02:00, Jann Horn wrote:
> On Thu, Oct 17, 2024 at 11:47 AM Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>> In retract_page_tables(), we may modify the pmd entry after acquiring the
>> pml and ptl, so we should also check whether the pmd entry is stable.
>> Use pte_offset_map_lock() to do this check; that also lets us remove the
>> call to pte_lockptr().
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>>   mm/khugepaged.c | 9 ++++++++-
>>   1 file changed, 8 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 94feb85ce996c..b4f49d323c8d9 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1721,6 +1721,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>>                  spinlock_t *pml;
>>                  spinlock_t *ptl;
>>                  bool skipped_uffd = false;
>> +               pte_t *pte;
>>
>>                  /*
>>                   * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
>> @@ -1757,9 +1758,15 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>>                  mmu_notifier_invalidate_range_start(&range);
>>
>>                  pml = pmd_lock(mm, pmd);
>> -               ptl = pte_lockptr(mm, pmd);
>> +               pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> 
> This takes the lock "ptl" on the success path...
> 
>> +               if (!pte) {
>> +                       spin_unlock(pml);
>> +                       mmu_notifier_invalidate_range_end(&range);
>> +                       continue;
>> +               }
>>                  if (ptl != pml)
>>                          spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
> 
> ... and this takes the same lock again, right? I think this will

Oh my god, my mistake. I used pte_offset_map_rw_nolock() at first, then
changed it to pte_offset_map_lock() but forgot to delete this, and
because my test did not trigger retract_page_tables(), I did not notice
the error.

Will change in v2.

Thanks!

> deadlock on kernels with CONFIG_SPLIT_PTE_PTLOCKS=y. Did you test this
> on a machine with less than 4 CPU cores, or something like that? Or am
> I missing something?
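
To spell out the problem: on the success path of the v1 hunk, ptl ends up
being acquired twice. Here is that hunk again with editorial annotations
added (the comments marked (1) and (2) are not from the patch):

	pml = pmd_lock(mm, pmd);
	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);	/* (1) takes ptl on success */
	if (!pte) {
		spin_unlock(pml);
		mmu_notifier_invalidate_range_end(&range);
		continue;
	}
	if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);	/* (2) takes ptl again */
	pte_unmap(pte);

With CONFIG_SPLIT_PTE_PTLOCKS=y, pml and ptl are usually different locks, so
(2) is reached and the same non-recursive spinlock is taken a second time by
the same task, which is the deadlock Jann describes. As Qi explains, the
spin_lock_nested() block is a leftover from an earlier version that used
pte_offset_map_rw_nolock() and was supposed to be deleted in the conversion
to pte_offset_map_lock().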

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 94feb85ce996c..b4f49d323c8d9 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1721,6 +1721,7 @@  static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		spinlock_t *pml;
 		spinlock_t *ptl;
 		bool skipped_uffd = false;
+		pte_t *pte;
 
 		/*
 		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
@@ -1757,9 +1758,15 @@  static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		mmu_notifier_invalidate_range_start(&range);
 
 		pml = pmd_lock(mm, pmd);
-		ptl = pte_lockptr(mm, pmd);
+		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+		if (!pte) {
+			spin_unlock(pml);
+			mmu_notifier_invalidate_range_end(&range);
+			continue;
+		}
 		if (ptl != pml)
 			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+		pte_unmap(pte);
 
 		/*
 		 * Huge page lock is still held, so normally the page table