diff mbox series

mm/mremap: Fix move_normal_pmd/retract_page_tables race

Message ID 20241007-move_normal_pmd-vs-collapse-fix-2-v1-1-5ead9631f2ea@google.com (mailing list archive)
State New
Headers show
Series mm/mremap: Fix move_normal_pmd/retract_page_tables race | expand

Commit Message

Jann Horn Oct. 7, 2024, 9:42 p.m. UTC
In mremap(), move_page_tables() looks at the type of the PMD entry and the
specified address range to figure out by which method the next chunk of
page table entries should be moved.
At that point, the mmap_lock is held in write mode, but no rmap locks are
held yet. For PMD entries that point to page tables and are fully covered
by the source address range, move_pgt_entry(NORMAL_PMD, ...) is called,
which first takes rmap locks, then does move_normal_pmd().
move_normal_pmd() takes the necessary page table locks at source and
destination, then moves an entire page table from the source to the
destination.

The problem is: The rmap locks, which protect against concurrent page table
removal by retract_page_tables() in the THP code, are only taken after the
PMD entry has been read and it has been decided how to move it.
So we can race as follows (with two processes that have mappings of the
same tmpfs file that is stored on a tmpfs mount with huge=advise); note
that process A accesses page tables through the MM while process B does it
through the file rmap:


process A                      process B
=========                      =========
mremap
  mremap_to
    move_vma
      move_page_tables
        get_old_pmd
        alloc_new_pmd
                      *** PREEMPT ***
                               madvise(MADV_COLLAPSE)
                                 do_madvise
                                   madvise_walk_vmas
                                     madvise_vma_behavior
                                       madvise_collapse
                                         hpage_collapse_scan_file
                                           collapse_file
                                             retract_page_tables
                                               i_mmap_lock_read(mapping)
                                               pmdp_collapse_flush
                                               i_mmap_unlock_read(mapping)
        move_pgt_entry(NORMAL_PMD, ...)
          take_rmap_locks
          move_normal_pmd
          drop_rmap_locks

When this happens, move_normal_pmd() can end up creating bogus PMD entries
in the line `pmd_populate(mm, new_pmd, pmd_pgtable(pmd))`.
The effect depends on arch-specific and machine-specific details; on x86,
you can end up with physical page 0 mapped as a page table, which is likely
exploitable for user->kernel privilege escalation.


Fix the race by letting process B recheck that the PMD still points to a
page table after the rmap locks have been taken. Otherwise, we bail and let
the caller fall back to the PTE-level copying path, which will then bail
immediately at the pmd_none() check.

Bug reachability: Reaching this bug requires that you can create shmem/file
THP mappings - anonymous THP uses different code that doesn't zap stuff
under rmap locks. File THP is gated on an experimental config flag
(CONFIG_READ_ONLY_THP_FOR_FS), so on normal distro kernels you need shmem
THP to hit this bug. As far as I know, getting shmem THP normally requires
that you can mount your own tmpfs with the right mount flags, which would
require creating your own user+mount namespace; though I don't know if some
distros maybe enable shmem THP by default or something like that.

Bug impact: This issue can likely be used for user->kernel privilege
escalation when it is reachable.

Cc: stable@vger.kernel.org
Fixes: 1d65b771bc08 ("mm/khugepaged: retract_page_tables() without mmap or vma lock")
Closes: https://project-zero.issues.chromium.org/371047675
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Jann Horn <jannh@google.com>
---
@David: please confirm we can add your Signed-off-by to this patch after
the Co-developed-by.
(Context: David basically wrote the entire patch except for the commit
message.)

@akpm: This replaces the previous "[PATCH] mm/mremap: Prevent racing
change of old pmd type".
---
 mm/mremap.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)


---
base-commit: 8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b
change-id: 20241007-move_normal_pmd-vs-collapse-fix-2-387e9a68c7d6

Comments

Qi Zheng Oct. 8, 2024, 3:53 a.m. UTC | #1
Hi Jann,

On 2024/10/8 05:42, Jann Horn wrote:

[...]

> 
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 24712f8dbb6b..dda09e957a5d 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -238,6 +238,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>   {
>   	spinlock_t *old_ptl, *new_ptl;
>   	struct mm_struct *mm = vma->vm_mm;
> +	bool res = false;
>   	pmd_t pmd;
>   
>   	if (!arch_supports_page_table_move())
> @@ -277,19 +278,25 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>   	if (new_ptl != old_ptl)
>   		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>   
> -	/* Clear the pmd */
>   	pmd = *old_pmd;
> +
> +	/* Racing with collapse? */
> +	if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))

Since we already hold the exclusive mmap lock, after a racing
with collapse occurs, the pmd entry cannot be refilled with
new content by page fault. So maybe we only need to recheck
pmd_none(pmd) here?

Thanks,
Qi

> +		goto out_unlock;
> +	/* Clear the pmd */
>   	pmd_clear(old_pmd);
> +	res = true;
>   
>   	VM_BUG_ON(!pmd_none(*new_pmd));
>   
>   	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
>   	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
> +out_unlock:
>   	if (new_ptl != old_ptl)
>   		spin_unlock(new_ptl);
>   	spin_unlock(old_ptl);
>   
> -	return true;
> +	return res;
>   }
>   #else
>   static inline bool move_normal_pmd(struct vm_area_struct *vma,
> 
> ---
> base-commit: 8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b
> change-id: 20241007-move_normal_pmd-vs-collapse-fix-2-387e9a68c7d6
David Hildenbrand Oct. 8, 2024, 7:50 a.m. UTC | #2
On 07.10.24 23:42, Jann Horn wrote:
> In mremap(), move_page_tables() looks at the type of the PMD entry and the
> specified address range to figure out by which method the next chunk of
> page table entries should be moved.
> At that point, the mmap_lock is held in write mode, but no rmap locks are
> held yet. For PMD entries that point to page tables and are fully covered
> by the source address range, move_pgt_entry(NORMAL_PMD, ...) is called,
> which first takes rmap locks, then does move_normal_pmd().
> move_normal_pmd() takes the necessary page table locks at source and
> destination, then moves an entire page table from the source to the
> destination.
> 
> The problem is: The rmap locks, which protect against concurrent page table
> removal by retract_page_tables() in the THP code, are only taken after the
> PMD entry has been read and it has been decided how to move it.
> So we can race as follows (with two processes that have mappings of the
> same tmpfs file that is stored on a tmpfs mount with huge=advise); note
> that process A accesses page tables through the MM while process B does it
> through the file rmap:
> 
> 
> process A                      process B
> =========                      =========
> mremap
>    mremap_to
>      move_vma
>        move_page_tables
>          get_old_pmd
>          alloc_new_pmd
>                        *** PREEMPT ***
>                                 madvise(MADV_COLLAPSE)
>                                   do_madvise
>                                     madvise_walk_vmas
>                                       madvise_vma_behavior
>                                         madvise_collapse
>                                           hpage_collapse_scan_file
>                                             collapse_file
>                                               retract_page_tables
>                                                 i_mmap_lock_read(mapping)
>                                                 pmdp_collapse_flush
>                                                 i_mmap_unlock_read(mapping)
>          move_pgt_entry(NORMAL_PMD, ...)
>            take_rmap_locks
>            move_normal_pmd
>            drop_rmap_locks
> 
> When this happens, move_normal_pmd() can end up creating bogus PMD entries
> in the line `pmd_populate(mm, new_pmd, pmd_pgtable(pmd))`.
> The effect depends on arch-specific and machine-specific details; on x86,
> you can end up with physical page 0 mapped as a page table, which is likely
> exploitable for user->kernel privilege escalation.
> 
> 
> Fix the race by letting process B recheck that the PMD still points to a
> page table after the rmap locks have been taken. Otherwise, we bail and let
> the caller fall back to the PTE-level copying path, which will then bail
> immediately at the pmd_none() check.
> 
> Bug reachability: Reaching this bug requires that you can create shmem/file
> THP mappings - anonymous THP uses different code that doesn't zap stuff
> under rmap locks. File THP is gated on an experimental config flag
> (CONFIG_READ_ONLY_THP_FOR_FS), so on normal distro kernels you need shmem
> THP to hit this bug. As far as I know, getting shmem THP normally requires
> that you can mount your own tmpfs with the right mount flags, which would
> require creating your own user+mount namespace; though I don't know if some
> distros maybe enable shmem THP by default or something like that.
> 
> Bug impact: This issue can likely be used for user->kernel privilege
> escalation when it is reachable.
> 
> Cc: stable@vger.kernel.org
> Fixes: 1d65b771bc08 ("mm/khugepaged: retract_page_tables() without mmap or vma lock")
> Closes: https://project-zero.issues.chromium.org/371047675
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Jann Horn <jannh@google.com>
> ---
> @David: please confirm we can add your Signed-off-by to this patch after
> the Co-developed-by.

Sure, thanks for sending this out!

Signed-off-by: David Hildenbrand <david@redhat.com>
David Hildenbrand Oct. 8, 2024, 7:52 a.m. UTC | #3
On 08.10.24 05:53, Qi Zheng wrote:
> Hi Jann,
> 
> On 2024/10/8 05:42, Jann Horn wrote:
> 
> [...]
> 
>>
>> diff --git a/mm/mremap.c b/mm/mremap.c
>> index 24712f8dbb6b..dda09e957a5d 100644
>> --- a/mm/mremap.c
>> +++ b/mm/mremap.c
>> @@ -238,6 +238,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>>    {
>>    	spinlock_t *old_ptl, *new_ptl;
>>    	struct mm_struct *mm = vma->vm_mm;
>> +	bool res = false;
>>    	pmd_t pmd;
>>    
>>    	if (!arch_supports_page_table_move())
>> @@ -277,19 +278,25 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>>    	if (new_ptl != old_ptl)
>>    		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>>    
>> -	/* Clear the pmd */
>>    	pmd = *old_pmd;
>> +
>> +	/* Racing with collapse? */
>> +	if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
> 
> Since we already hold the exclusive mmap lock, after a racing
> with collapse occurs, the pmd entry cannot be refilled with
> new content by page fault. So maybe we only need to recheck
> pmd_none(pmd) here?

My thinking was that it is cheap and more future proof to check that we 
really still have a page table here. For example, what if collapse code 
is ever changed to replace the page table by the collapsed PMD?

So unless there is a good reason not to have this check here, I would 
keep it like that.

Thanks!
Qi Zheng Oct. 8, 2024, 7:58 a.m. UTC | #4
On 2024/10/8 15:52, David Hildenbrand wrote:
> On 08.10.24 05:53, Qi Zheng wrote:
>> Hi Jann,
>>
>> On 2024/10/8 05:42, Jann Horn wrote:
>>
>> [...]
>>
>>>
>>> diff --git a/mm/mremap.c b/mm/mremap.c
>>> index 24712f8dbb6b..dda09e957a5d 100644
>>> --- a/mm/mremap.c
>>> +++ b/mm/mremap.c
>>> @@ -238,6 +238,7 @@ static bool move_normal_pmd(struct vm_area_struct 
>>> *vma, unsigned long old_addr,
>>>    {
>>>        spinlock_t *old_ptl, *new_ptl;
>>>        struct mm_struct *mm = vma->vm_mm;
>>> +    bool res = false;
>>>        pmd_t pmd;
>>>        if (!arch_supports_page_table_move())
>>> @@ -277,19 +278,25 @@ static bool move_normal_pmd(struct 
>>> vm_area_struct *vma, unsigned long old_addr,
>>>        if (new_ptl != old_ptl)
>>>            spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>>> -    /* Clear the pmd */
>>>        pmd = *old_pmd;
>>> +
>>> +    /* Racing with collapse? */
>>> +    if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
>>
>> Since we already hold the exclusive mmap lock, after a racing
>> with collapse occurs, the pmd entry cannot be refilled with
>> new content by page fault. So maybe we only need to recheck
>> pmd_none(pmd) here?
> 
> My thinking was that it is cheap and more future proof to check that we 
> really still have a page table here. For example, what if collapse code 
> is ever changed to replace the page table by the collapsed PMD?

Ah, make sense.

Acked-by: Qi Zheng <zhengqi.arch@bytedance.com>

> 
> So unless there is a good reason not to have this check here, I would 
> keep it like that.
> 
> Thanks!
>
David Hildenbrand Oct. 8, 2024, 8:21 a.m. UTC | #5
On 08.10.24 09:58, Qi Zheng wrote:
> 
> 
> On 2024/10/8 15:52, David Hildenbrand wrote:
>> On 08.10.24 05:53, Qi Zheng wrote:
>>> Hi Jann,
>>>
>>> On 2024/10/8 05:42, Jann Horn wrote:
>>>
>>> [...]
>>>
>>>>
>>>> diff --git a/mm/mremap.c b/mm/mremap.c
>>>> index 24712f8dbb6b..dda09e957a5d 100644
>>>> --- a/mm/mremap.c
>>>> +++ b/mm/mremap.c
>>>> @@ -238,6 +238,7 @@ static bool move_normal_pmd(struct vm_area_struct
>>>> *vma, unsigned long old_addr,
>>>>     {
>>>>         spinlock_t *old_ptl, *new_ptl;
>>>>         struct mm_struct *mm = vma->vm_mm;
>>>> +    bool res = false;
>>>>         pmd_t pmd;
>>>>         if (!arch_supports_page_table_move())
>>>> @@ -277,19 +278,25 @@ static bool move_normal_pmd(struct
>>>> vm_area_struct *vma, unsigned long old_addr,
>>>>         if (new_ptl != old_ptl)
>>>>             spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>>>> -    /* Clear the pmd */
>>>>         pmd = *old_pmd;
>>>> +
>>>> +    /* Racing with collapse? */
>>>> +    if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
>>>
>>> Since we already hold the exclusive mmap lock, after a racing
>>> with collapse occurs, the pmd entry cannot be refilled with
>>> new content by page fault. So maybe we only need to recheck
>>> pmd_none(pmd) here?
>>
>> My thinking was that it is cheap and more future proof to check that we
>> really still have a page table here. For example, what if collapse code
>> is ever changed to replace the page table by the collapsed PMD?
> 
> Ah, make sense.
> 
> Acked-by: Qi Zheng <zhengqi.arch@bytedance.com>

Thanks a lot for your review!
Joel Fernandes Oct. 8, 2024, 11:58 p.m. UTC | #6
On Mon, Oct 7, 2024 at 5:42 PM Jann Horn <jannh@google.com> wrote:
>
> In mremap(), move_page_tables() looks at the type of the PMD entry and the
> specified address range to figure out by which method the next chunk of
> page table entries should be moved.
> At that point, the mmap_lock is held in write mode, but no rmap locks are
> held yet. For PMD entries that point to page tables and are fully covered
> by the source address range, move_pgt_entry(NORMAL_PMD, ...) is called,
> which first takes rmap locks, then does move_normal_pmd().
> move_normal_pmd() takes the necessary page table locks at source and
> destination, then moves an entire page table from the source to the
> destination.
>
> The problem is: The rmap locks, which protect against concurrent page table
> removal by retract_page_tables() in the THP code, are only taken after the
> PMD entry has been read and it has been decided how to move it.
> So we can race as follows (with two processes that have mappings of the
> same tmpfs file that is stored on a tmpfs mount with huge=advise); note
> that process A accesses page tables through the MM while process B does it
> through the file rmap:
>
>
> process A                      process B
> =========                      =========
> mremap
>   mremap_to
>     move_vma
>       move_page_tables
>         get_old_pmd
>         alloc_new_pmd
>                       *** PREEMPT ***
>                                madvise(MADV_COLLAPSE)
>                                  do_madvise
>                                    madvise_walk_vmas
>                                      madvise_vma_behavior
>                                        madvise_collapse
>                                          hpage_collapse_scan_file
>                                            collapse_file
>                                              retract_page_tables
>                                                i_mmap_lock_read(mapping)
>                                                pmdp_collapse_flush
>                                                i_mmap_unlock_read(mapping)
>         move_pgt_entry(NORMAL_PMD, ...)
>           take_rmap_locks
>           move_normal_pmd
>           drop_rmap_locks
>
> When this happens, move_normal_pmd() can end up creating bogus PMD entries
> in the line `pmd_populate(mm, new_pmd, pmd_pgtable(pmd))`.
> The effect depends on arch-specific and machine-specific details; on x86,
> you can end up with physical page 0 mapped as a page table, which is likely
> exploitable for user->kernel privilege escalation.
>
>
> Fix the race by letting process B recheck that the PMD still points to a
> page table after the rmap locks have been taken. Otherwise, we bail and let
> the caller fall back to the PTE-level copying path, which will then bail
> immediately at the pmd_none() check.
>
> Bug reachability: Reaching this bug requires that you can create shmem/file
> THP mappings - anonymous THP uses different code that doesn't zap stuff
> under rmap locks. File THP is gated on an experimental config flag
> (CONFIG_READ_ONLY_THP_FOR_FS), so on normal distro kernels you need shmem
> THP to hit this bug. As far as I know, getting shmem THP normally requires
> that you can mount your own tmpfs with the right mount flags, which would
> require creating your own user+mount namespace; though I don't know if some
> distros maybe enable shmem THP by default or something like that.

Not to overthink it, but do you have any insight into why copy_vma()
only requires the rmap lock under this condition?

*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);

Could a collapse still occur when need_rmap_locks is false,
potentially triggering the bug you described? My assumption is no, but
I wanted to double-check.

The patch looks good to me overall. I was also curious if
move_normal_pud() would require a similar change, though I’m inclined
to think that path doesn't lead to a bug.

thanks,

- Joel


>
> Bug impact: This issue can likely be used for user->kernel privilege
> escalation when it is reachable.
>
> Cc: stable@vger.kernel.org
> Fixes: 1d65b771bc08 ("mm/khugepaged: retract_page_tables() without mmap or vma lock")
> Closes: https://project-zero.issues.chromium.org/371047675
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Jann Horn <jannh@google.com>
> ---
> @David: please confirm we can add your Signed-off-by to this patch after
> the Co-developed-by.
> (Context: David basically wrote the entire patch except for the commit
> message.)
>
> @akpm: This replaces the previous "[PATCH] mm/mremap: Prevent racing
> change of old pmd type".
> ---
>  mm/mremap.c | 11 +++++++++--
>  1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 24712f8dbb6b..dda09e957a5d 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -238,6 +238,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>  {
>         spinlock_t *old_ptl, *new_ptl;
>         struct mm_struct *mm = vma->vm_mm;
> +       bool res = false;
>         pmd_t pmd;
>
>         if (!arch_supports_page_table_move())
> @@ -277,19 +278,25 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>         if (new_ptl != old_ptl)
>                 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>
> -       /* Clear the pmd */
>         pmd = *old_pmd;
> +
> +       /* Racing with collapse? */
> +       if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
> +               goto out_unlock;
> +       /* Clear the pmd */
>         pmd_clear(old_pmd);
> +       res = true;
>
>         VM_BUG_ON(!pmd_none(*new_pmd));
>
>         pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
>         flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
> +out_unlock:
>         if (new_ptl != old_ptl)
>                 spin_unlock(new_ptl);
>         spin_unlock(old_ptl);
>
> -       return true;
> +       return res;
>  }
>  #else
>  static inline bool move_normal_pmd(struct vm_area_struct *vma,
>
> ---
> base-commit: 8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b
> change-id: 20241007-move_normal_pmd-vs-collapse-fix-2-387e9a68c7d6
> --
> Jann Horn <jannh@google.com>
>
Jann Horn Oct. 9, 2024, 12:49 a.m. UTC | #7
On Wed, Oct 9, 2024 at 1:58 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> On Mon, Oct 7, 2024 at 5:42 PM Jann Horn <jannh@google.com> wrote:
> Not to overthink it, but do you have any insight into why copy_vma()
> only requires the rmap lock under this condition?
>
> *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
>
> Could a collapse still occur when need_rmap_locks is false,
> potentially triggering the bug you described? My assumption is no, but
> I wanted to double-check.

Ah, that code is a bit confusing. There are actually two circumstances
under which we take rmap locks, and that condition only captures (part
of) the first one:

1. when we might move PTEs against rmap traversal order (we need the
lock so that concurrent rmap traversal can't miss the PTEs)
2. when we move page tables (otherwise concurrent rmap traversal could
race with page table changes)

If you look at the four callsites of move_pgt_entry(), you can see
that its parameter "need_rmap_locks" sometimes comes from the caller's
"need_rmap_locks" variable (in the HPAGE_PUD and HPAGE_PMD cases), but
other times it is just hardcoded to true (in the NORMAL_PUD and
NORMAL_PMD cases).
So move_normal_pmd() always holds rmap locks.
(This code would probably be a bit clearer if we moved the rmap
locking into the helpers move_{normal,huge}_{pmd,pud} and got rid of
the helper move_pgt_entry()...)

(Also, note that when undoing the PTE moves with the second
move_page_tables() call, the "need_rmap_locks" parameter to
move_page_tables() is hardcoded to true.)

> The patch looks good to me overall. I was also curious if
> move_normal_pud() would require a similar change, though I’m inclined
> to think that path doesn't lead to a bug.

Yeah, there is no path that would remove PUD entries pointing to page
tables through the rmap, that's a special PMD entry thing. (Well, at
least not in non-hugetlb code, I haven't looked at hugetlb in a long
time - but hugetlb has an entirely separate codepath for moving page
tables, move_hugetlb_page_tables().)
Joel Fernandes Oct. 9, 2024, 1:46 a.m. UTC | #8
On Tue, Oct 8, 2024 at 8:50 PM Jann Horn <jannh@google.com> wrote:
>
> On Wed, Oct 9, 2024 at 1:58 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > On Mon, Oct 7, 2024 at 5:42 PM Jann Horn <jannh@google.com> wrote:
> > Not to overthink it, but do you have any insight into why copy_vma()
> > only requires the rmap lock under this condition?
> >
> > *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
> >
> > Could a collapse still occur when need_rmap_locks is false,
> > potentially triggering the bug you described? My assumption is no, but
> > I wanted to double-check.
>
> Ah, that code is a bit confusing. There are actually two circumstances
> under which we take rmap locks, and that condition only captures (part
> of) the first one:
>
> 1. when we might move PTEs against rmap traversal order (we need the
> lock so that concurrent rmap traversal can't miss the PTEs).

Ah ok, I see this mentioned in move_ptes(). Thanks for clarifying.

> 2. when we move page tables (otherwise concurrent rmap traversal could
> race with page table changes)

Ah ok, and these are the 4 call sites you mention below. Makes sense.

> If you look at the four callsites of move_pgt_entry(), you can see
> that its parameter "need_rmap_locks" sometimes comes from the caller's
> "need_rmap_locks" variable (in the HPAGE_PUD and HPAGE_PMD cases), but
> other times it is just hardcoded to true (in the NORMAL_PUD and
> NORMAL_PMD cases).
> So move_normal_pmd() always holds rmap locks.
> (This code would probably be a bit clearer if we moved the rmap
> locking into the helpers move_{normal,huge}_{pmd,pud} and got rid of
> the helper move_pgt_entry()...)

Thanks for clarifying, this makes a lot of sense now. So the
optimization is when we move ptes forward, we don't need the rmap lock
because the rmap code is cool with that due to traversal order. Ok.
And that definitely doesn't apply to move_normal_pmd() as you
mentioned I guess :).

> (Also, note that when undoing the PTE moves with the second
> move_page_tables() call, the "need_rmap_locks" parameter to
> move_page_tables() is hardcoded to true.)

Sure.

> > The patch looks good to me overall. I was also curious if
> > move_normal_pud() would require a similar change, though I’m inclined
> > to think that path doesn't lead to a bug.
>
> Yeah, there is no path that would remove PUD entries pointing to page
> tables through the rmap, that's a special PMD entry thing. (Well, at
> least not in non-hugetlb code, I haven't looked at hugetlb in a long
> time - but hugetlb has an entirely separate codepath for moving page
> tables, move_hugetlb_page_tables().)

Makes sense. TIL ;-)

thanks!

 - Joel
Lorenzo Stoakes Oct. 9, 2024, 2:44 p.m. UTC | #9
On Mon, Oct 07, 2024 at 11:42:04PM +0200, Jann Horn wrote:
> In mremap(), move_page_tables() looks at the type of the PMD entry and the
> specified address range to figure out by which method the next chunk of
> page table entries should be moved.
> At that point, the mmap_lock is held in write mode, but no rmap locks are
> held yet. For PMD entries that point to page tables and are fully covered
> by the source address range, move_pgt_entry(NORMAL_PMD, ...) is called,
> which first takes rmap locks, then does move_normal_pmd().
> move_normal_pmd() takes the necessary page table locks at source and
> destination, then moves an entire page table from the source to the
> destination.
>
> The problem is: The rmap locks, which protect against concurrent page table
> removal by retract_page_tables() in the THP code, are only taken after the
> PMD entry has been read and it has been decided how to move it.
> So we can race as follows (with two processes that have mappings of the
> same tmpfs file that is stored on a tmpfs mount with huge=advise); note
> that process A accesses page tables through the MM while process B does it
> through the file rmap:
>
>
> process A                      process B
> =========                      =========
> mremap
>   mremap_to
>     move_vma
>       move_page_tables
>         get_old_pmd
>         alloc_new_pmd
>                       *** PREEMPT ***
>                                madvise(MADV_COLLAPSE)
>                                  do_madvise
>                                    madvise_walk_vmas
>                                      madvise_vma_behavior
>                                        madvise_collapse
>                                          hpage_collapse_scan_file
>                                            collapse_file
>                                              retract_page_tables
>                                                i_mmap_lock_read(mapping)
>                                                pmdp_collapse_flush
>                                                i_mmap_unlock_read(mapping)
>         move_pgt_entry(NORMAL_PMD, ...)
>           take_rmap_locks
>           move_normal_pmd
>           drop_rmap_locks
>
> When this happens, move_normal_pmd() can end up creating bogus PMD entries
> in the line `pmd_populate(mm, new_pmd, pmd_pgtable(pmd))`.
> The effect depends on arch-specific and machine-specific details; on x86,
> you can end up with physical page 0 mapped as a page table, which is likely
> exploitable for user->kernel privilege escalation.
>
>
> Fix the race by letting process B recheck that the PMD still points to a
> page table after the rmap locks have been taken. Otherwise, we bail and let
> the caller fall back to the PTE-level copying path, which will then bail
> immediately at the pmd_none() check.
>
> Bug reachability: Reaching this bug requires that you can create shmem/file
> THP mappings - anonymous THP uses different code that doesn't zap stuff
> under rmap locks. File THP is gated on an experimental config flag
> (CONFIG_READ_ONLY_THP_FOR_FS), so on normal distro kernels you need shmem
> THP to hit this bug. As far as I know, getting shmem THP normally requires
> that you can mount your own tmpfs with the right mount flags, which would
> require creating your own user+mount namespace; though I don't know if some
> distros maybe enable shmem THP by default or something like that.

Any repro?

>
> Bug impact: This issue can likely be used for user->kernel privilege
> escalation when it is reachable.
>
> Cc: stable@vger.kernel.org
> Fixes: 1d65b771bc08 ("mm/khugepaged: retract_page_tables() without mmap or vma lock")
> Closes: https://project-zero.issues.chromium.org/371047675
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Jann Horn <jannh@google.com>

Ugh man this PMD locking thing is horrid. This is subtle and deeply painful and
I feel like we need some better way of expressing this locking.

Documenting this stuff, or at least VMA side remains on my todo list...

Anyway this patch looks sane:

Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

> ---
> @David: please confirm we can add your Signed-off-by to this patch after
> the Co-developed-by.
> (Context: David basically wrote the entire patch except for the commit
> message.)

The fact David did that automatically gives me confidence in this change
from mm side. :)

>
> @akpm: This replaces the previous "[PATCH] mm/mremap: Prevent racing
> change of old pmd type".
> ---
>  mm/mremap.c | 11 +++++++++--
>  1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 24712f8dbb6b..dda09e957a5d 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -238,6 +238,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>  {
>  	spinlock_t *old_ptl, *new_ptl;
>  	struct mm_struct *mm = vma->vm_mm;
> +	bool res = false;
>  	pmd_t pmd;
>
>  	if (!arch_supports_page_table_move())
> @@ -277,19 +278,25 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>  	if (new_ptl != old_ptl)
>  		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>
> -	/* Clear the pmd */
>  	pmd = *old_pmd;
> +
> +	/* Racing with collapse? */
> +	if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
> +		goto out_unlock;
> +	/* Clear the pmd */
>  	pmd_clear(old_pmd);
> +	res = true;
>
>  	VM_BUG_ON(!pmd_none(*new_pmd));
>
>  	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
>  	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
> +out_unlock:
>  	if (new_ptl != old_ptl)
>  		spin_unlock(new_ptl);
>  	spin_unlock(old_ptl);
>
> -	return true;
> +	return res;
>  }
>  #else
>  static inline bool move_normal_pmd(struct vm_area_struct *vma,
>
> ---
> base-commit: 8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b
> change-id: 20241007-move_normal_pmd-vs-collapse-fix-2-387e9a68c7d6
> --
> Jann Horn <jannh@google.com>
>
diff mbox series

Patch

diff --git a/mm/mremap.c b/mm/mremap.c
index 24712f8dbb6b..dda09e957a5d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -238,6 +238,7 @@  static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 {
 	spinlock_t *old_ptl, *new_ptl;
 	struct mm_struct *mm = vma->vm_mm;
+	bool res = false;
 	pmd_t pmd;
 
 	if (!arch_supports_page_table_move())
@@ -277,19 +278,25 @@  static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 
-	/* Clear the pmd */
 	pmd = *old_pmd;
+
+	/* Racing with collapse? */
+	if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
+		goto out_unlock;
+	/* Clear the pmd */
 	pmd_clear(old_pmd);
+	res = true;
 
 	VM_BUG_ON(!pmd_none(*new_pmd));
 
 	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
 	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+out_unlock:
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
 	spin_unlock(old_ptl);
 
-	return true;
+	return res;
 }
 #else
 static inline bool move_normal_pmd(struct vm_area_struct *vma,