diff mbox series

[v1] mm: Fix race between __split_huge_pmd_locked() and GUP-fast

Message ID 20240425170704.3379492-1-ryan.roberts@arm.com (mailing list archive)
State New
Headers show
Series [v1] mm: Fix race between __split_huge_pmd_locked() and GUP-fast | expand

Commit Message

Ryan Roberts April 25, 2024, 5:07 p.m. UTC
__split_huge_pmd_locked() can be called for a present THP, devmap or
(non-present) migration entry. It calls pmdp_invalidate()
unconditionally on the pmdp and only determines if it is present or not
based on the returned old pmd. This is a problem for the migration entry
case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
called for a present pmd.

On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
future call to pmd_present() will return true. And therefore any
lockless pgtable walker could see the migration entry pmd in this state
and start interpreting the fields as if it were present, leading to
BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
I suspect the same is possible on other architectures.

Fix this by only calling pmdp_invalidate() for a present pmd. And for
good measure let's add a warning to the generic implementation of
pmdp_invalidate(). I've manually reviewed all other
pmdp_invalidate[_ad]() call sites and believe all others to be
conformant.

This is a theoretical bug found during code review. I don't have any
test case to trigger it in practice.

Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---

Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.

Thanks,
Ryan


 mm/huge_memory.c     | 5 +++--
 mm/pgtable-generic.c | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)

--
2.25.1

Comments

Zi Yan April 25, 2024, 6:58 p.m. UTC | #1
+Anshuman, who changed pmd_present() semantics. See:
https://lore.kernel.org/all/1599627183-14453-2-git-send-email-anshuman.khandual@arm.com/ and commit b65399f6111b ("arm64/mm: Change
THP helpers to comply with generic MM semantics")

On 25 Apr 2024, at 13:07, Ryan Roberts wrote:

> __split_huge_pmd_locked() can be called for a present THP, devmap or
> (non-present) migration entry. It calls pmdp_invalidate()
> unconditionally on the pmdp and only determines if it is present or not
> based on the returned old pmd. This is a problem for the migration entry
> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
> called for a present pmd.
>
> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
> future call to pmd_present() will return true. And therefore any

But pmd_mkinvalid() on x86 does not behave this way. Maybe we should fix
pmd_mkinvalid() on arm64 by not setting PMD_PRESENT_INVALID when the
entry is already invalid. And add a test in mm/debug_vm_pgtable.c.

I notice that x86, risc-v, mips behave the same. loongarch also
has the _PAGE_PRESENT_INVALID bit set during pmd_mkinvalid(), but its
pmd_present() makes sure _PAGE_HUGE is set before checking _PAGE_PRESENT_INVALID.
So it is not a problem for loongarch. Adding Huacai to confirm this.

Maybe pmd_present() on arm64 can do that too?
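
For reference, the shape of the loongarch check is roughly the following
(a paraphrased sketch from memory, not the verbatim source; the elided
non-huge path and the exact bit set would need checking against the tree):

	static inline int pmd_present(pmd_t pmd)
	{
		if (pmd_val(pmd) & _PAGE_HUGE)
			return !!(pmd_val(pmd) &
				  (_PAGE_PRESENT | _PAGE_PRESENT_INVALID));
		/* ... non-huge (table/swap) handling elided ... */
	}

i.e. _PAGE_PRESENT_INVALID is only honoured when _PAGE_HUGE identifies the
entry as a huge mapping.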

> lockless pgtable walker could see the migration entry pmd in this state
> and start interpreting the fields as if it were present, leading to
> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
> I suspect the same is possible on other architectures.
>
> Fix this by only calling pmdp_invalidate() for a present pmd. And for
> good measure let's add a warning to the generic implementation of
> pmdp_invalidate(). I've manually reviewed all other
> pmdp_invalidate[_ad]() call sites and believe all others to be
> conformant.
>
> This is a theoretical bug found during code review. I don't have any
> test case to trigger it in practice.
>
> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
>
> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>
> Thanks,
> Ryan
>
>
>  mm/huge_memory.c     | 5 +++--
>  mm/pgtable-generic.c | 2 ++
>  2 files changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 89f58c7603b2..80939ad00718 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	 * for this pmd), then we flush the SMP TLB and finally we write the
>  	 * non-huge version of the pmd entry with pmd_populate.
>  	 */
> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>
> -	pmd_migration = is_pmd_migration_entry(old_pmd);
> +	pmd_migration = is_pmd_migration_entry(*pmd);
>  	if (unlikely(pmd_migration)) {
>  		swp_entry_t entry;
>
> +		old_pmd = *pmd;
>  		entry = pmd_to_swp_entry(old_pmd);
>  		page = pfn_swap_entry_to_page(entry);
>  		write = is_writable_migration_entry(entry);
> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>  	} else {
> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>  		page = pmd_page(old_pmd);
>  		folio = page_folio(page);
>  		if (pmd_dirty(old_pmd)) {
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 4fcd959dcc4d..74e34ea90656 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>  		     pmd_t *pmdp)
>  {
> +	VM_WARN_ON(!pmd_present(*pmdp));
>  	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>  	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>  	return old;
> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>  			 pmd_t *pmdp)
>  {
> +	VM_WARN_ON(!pmd_present(*pmdp));
>  	return pmdp_invalidate(vma, address, pmdp);
>  }
>  #endif
> --
> 2.25.1


--
Best Regards,
Yan, Zi
Anshuman Khandual April 26, 2024, 4:19 a.m. UTC | #2
On 4/25/24 22:37, Ryan Roberts wrote:
> __split_huge_pmd_locked() can be called for a present THP, devmap or
> (non-present) migration entry. It calls pmdp_invalidate()
> unconditionally on the pmdp and only determines if it is present or not
> based on the returned old pmd. This is a problem for the migration entry
> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
> called for a present pmd.

pmdp_invalidate() must be called only for a present PMD - is this expected
by core MM? Does this cause any problem otherwise?

> 
> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
> future call to pmd_present() will return true. And therefore any

IIRC the following semantics needs to be followed as expected by core MM.

-------------------------------------------------------------------------
|	PMD states	|	pmd_present	|	pmd_trans_huge	|
-------------------------------------------------------------------------
|	Mapped		|	Yes		|	Yes		|
-------------------------------------------------------------------------
|	Splitting	|	Yes		|	Yes		|
-------------------------------------------------------------------------
|	Migration/Swap	|	No		|	No		|
-------------------------------------------------------------------------


> lockless pgtable walker could see the migration entry pmd in this state
> and start interpreting the fields as if it were present, leading to
> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.

Could you please explain how bad things might happen?
 
> I suspect the same is possible on other architectures.
> 
> Fix this by only calling pmdp_invalidate() for a present pmd. And for
> good measure let's add a warning to the generic implementation of
> pmdp_invalidate(). I've manually reviewed all other
> pmdp_invalidate[_ad]() call sites and believe all others to be
> conformant.
> 
> This is a theoretical bug found during code review. I don't have any
> test case to trigger it in practice.
> 
> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
> 
> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
> 
> Thanks,
> Ryan
> 
> 
>  mm/huge_memory.c     | 5 +++--
>  mm/pgtable-generic.c | 2 ++
>  2 files changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 89f58c7603b2..80939ad00718 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	 * for this pmd), then we flush the SMP TLB and finally we write the
>  	 * non-huge version of the pmd entry with pmd_populate.
>  	 */
> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
> 
> -	pmd_migration = is_pmd_migration_entry(old_pmd);
> +	pmd_migration = is_pmd_migration_entry(*pmd);
>  	if (unlikely(pmd_migration)) {
>  		swp_entry_t entry;
> 
> +		old_pmd = *pmd;
>  		entry = pmd_to_swp_entry(old_pmd);
>  		page = pfn_swap_entry_to_page(entry);
>  		write = is_writable_migration_entry(entry);
> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>  	} else {
> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>  		page = pmd_page(old_pmd);
>  		folio = page_folio(page);
>  		if (pmd_dirty(old_pmd)) {
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 4fcd959dcc4d..74e34ea90656 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>  		     pmd_t *pmdp)
>  {
> +	VM_WARN_ON(!pmd_present(*pmdp));
>  	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>  	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>  	return old;
> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>  			 pmd_t *pmdp)
>  {
> +	VM_WARN_ON(!pmd_present(*pmdp));
>  	return pmdp_invalidate(vma, address, pmdp);
>  }
>  #endif
> --
> 2.25.1
> 
>
Anshuman Khandual April 26, 2024, 4:50 a.m. UTC | #3
On 4/26/24 00:28, Zi Yan wrote:
> +Anshuman, who changed pmd_present() semantics. See:
> https://lore.kernel.org/all/1599627183-14453-2-git-send-email-anshuman.khandual@arm.com/ and commit b65399f6111b ("arm64/mm: Change
> THP helpers to comply with generic MM semantics")
> 
> On 25 Apr 2024, at 13:07, Ryan Roberts wrote:
> 
>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>> (non-present) migration entry. It calls pmdp_invalidate()
>> unconditionally on the pmdp and only determines if it is present or not
>> based on the returned old pmd. This is a problem for the migration entry
>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>> called for a present pmd.
>>
>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>> future call to pmd_present() will return true. And therefore any
> 
> But pmd_mkinvalid() on x86 does not behave this way. Maybe we should fix
> pmd_mkinvalid() on arm64 by not setting PMD_PRESENT_INVALID when the
> entry is already invalid. And add a test in mm/debug_vm_pgtable.c.
> 
> I notice that x86, risc-v, mips behave the same. loongarch also
> has the _PAGE_PRESENT_INVALID bit set during pmd_mkinvalid(), but its
> pmd_present() makes sure _PAGE_HUGE is set before checking _PAGE_PRESENT_INVALID.
> So it is not a problem for loongarch. Adding Huacai to confirm this.
> 
> Maybe pmd_present() on arm64 can do that too?

pmd_present() should return true even for a splitting PMD which is not
currently mapped. IIRC in all other architectures, there is a distinct
identification bit for a huge page which remains set even when the entry
becomes unmapped. That bit helps pmd_present() return true during the
PMD splitting process.

But on arm64 platform

#define PTE_VALID                (_AT(pteval_t, 1) << 0)
#define PMD_SECT_VALID           (_AT(pmdval_t, 1) << 0)
#define PMD_TABLE_BIT            (_AT(pmdval_t, 1) << 1)

When the PMD entry becomes unmapped, PTE_VALID gets cleared, and so does
PMD_SECT_VALID, thus erasing its identity as a huge mapping. A software
bit PMD_PRESENT_INVALID was added which helps preserve that huge page
mapping identity once the entry becomes unmapped.
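
To summarise the states (a sketch, not verbatim arm64 code):

	/*
	 * arm64 block-mapped PMD states:
	 *
	 *   mapped THP:      PMD_SECT_VALID=1                         pmd_present() == true
	 *   splitting THP:   PMD_SECT_VALID=0, PMD_PRESENT_INVALID=1  pmd_present() == true
	 *   migration entry: PMD_SECT_VALID=0, PMD_PRESENT_INVALID=0  pmd_present() == false
	 *
	 * pmd_mkinvalid() sets PMD_PRESENT_INVALID unconditionally, so a
	 * migration entry passed through it becomes indistinguishable from
	 * a splitting THP as far as pmd_present() is concerned.
	 */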

> 
>> lockless pgtable walker could see the migration entry pmd in this state
>> and start interpreting the fields as if it were present, leading to
>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>> I suspect the same is possible on other architectures.
>>
>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>> good measure let's add a warning to the generic implementation of
>> pmdp_invalidate(). I've manually reviewed all other
>> pmdp_invalidate[_ad]() call sites and believe all others to be
>> conformant.
>>
>> This is a theoretical bug found during code review. I don't have any
>> test case to trigger it in practice.
>>
>> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>>
>> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>>
>> Thanks,
>> Ryan
>>
>>
>>  mm/huge_memory.c     | 5 +++--
>>  mm/pgtable-generic.c | 2 ++
>>  2 files changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 89f58c7603b2..80939ad00718 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>  	 * for this pmd), then we flush the SMP TLB and finally we write the
>>  	 * non-huge version of the pmd entry with pmd_populate.
>>  	 */
>> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>
>> -	pmd_migration = is_pmd_migration_entry(old_pmd);
>> +	pmd_migration = is_pmd_migration_entry(*pmd);
>>  	if (unlikely(pmd_migration)) {
>>  		swp_entry_t entry;
>>
>> +		old_pmd = *pmd;
>>  		entry = pmd_to_swp_entry(old_pmd);
>>  		page = pfn_swap_entry_to_page(entry);
>>  		write = is_writable_migration_entry(entry);
>> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>  	} else {
>> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>  		page = pmd_page(old_pmd);
>>  		folio = page_folio(page);
>>  		if (pmd_dirty(old_pmd)) {
>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>> index 4fcd959dcc4d..74e34ea90656 100644
>> --- a/mm/pgtable-generic.c
>> +++ b/mm/pgtable-generic.c
>> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>>  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>  		     pmd_t *pmdp)
>>  {
>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>  	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>>  	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>  	return old;
>> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>>  			 pmd_t *pmdp)
>>  {
>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>  	return pmdp_invalidate(vma, address, pmdp);
>>  }
>>  #endif
>> --
>> 2.25.1
> 
> 
> --
> Best Regards,
> Yan, Zi
Ryan Roberts April 26, 2024, 7:43 a.m. UTC | #4
On 26/04/2024 05:19, Anshuman Khandual wrote:
> On 4/25/24 22:37, Ryan Roberts wrote:
>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>> (non-present) migration entry. It calls pmdp_invalidate()
>> unconditionally on the pmdp and only determines if it is present or not
>> based on the returned old pmd. This is a problem for the migration entry
>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>> called for a present pmd.
> 
> pmdp_invalidate() must be called only for a present PMD - is this expected
> by core MM? Does this cause any problem otherwise?

I'm saying that calling pmdp_invalidate() only on a pte_present()==true pte is
the only semantic that makes sense. And, yes, it causes a problem if called on a
pte_present()==false pte - that's exactly what I'm describing in this commit log.

To labour the point, this is the logical type hierarchy of PTEs (and block-mapped
PMDs) as I see it:

---8<----

pte
 |- present
 |   |- valid
 |   |- invalid
 |
 |- not_present
     |- none
     |- swap_pte

present: All fields must be interpreted the way the HW sees them. e.g.
         pte_pfn(), pte_write(), pte_dirty(), pte_young(), pte_mkwrite(),
         pte_mkold() can all be legitimately used to query and modify the pte.

  valid: The HW may access the pte, interpret the fields and create a TLB entry,
         etc.

  invalid: The HW will never access the pte or create a TLB entry for it.

not_present: The fields are SW-defined. HW never accesses the PTE.

  none: Unused; represents a hole

  swap_pte: Contains a swap entry and swap pte bits. The contained swap entry
            may 1 of a few different types e.g. actual swap entry, migration
            entry, hw poison, etc.

---8<----

We test present vs not_present with pte_present()

We test none vs swap_pte with pte_none()

valid vs invalid is slightly more vague. The core-mm can move a PMD from valid
-> invalid by calling pmd_mkinvalid(). But it can't query the state. And it
can't do this generically for a PTE.
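
(As a minimal sketch, a walker classifies an entry using only the generic
predicates like this:

	pte_t pte = ptep_get(ptep);

	if (pte_present(pte)) {
		/* HW-format fields are meaningful: pte_pfn(), pte_write(), ... */
	} else if (pte_none(pte)) {
		/* hole; nothing to do */
	} else {
		/* swap pte; only pte_to_swp_entry() and friends apply */
	}
)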


Based on that lot, it makes no sense to me that we should permit calling
pmd_mkinvalid() on a non-present pte. Indeed, we don't permit calling
pte_mkwrite() etc on a non-present pte. And those functions are not defensive;
they don't check that the pte is present before making the change. They just
trust that the core-mm will not call them for non-present ptes.

The alternative approach would be to make pmdp_invalidate() defensive so that it
checks the pmd is present before making any changes. But it doesn't semantically
make sense to invalidate a non-present pmd in the first place so why call it
under these circumstances? There is also a practical problem in that some arches
implement their own pmdp_invalidate() so you would want to make all those
defensive too, which would grow the size of the change.
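
(For concreteness, the defensive generic version would look something like
the sketch below, and every arch override would need the equivalent guard:

	pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
			      pmd_t *pmdp)
	{
		pmd_t old = pmdp_get(pmdp);

		/* Nothing to invalidate for a non-present entry. */
		if (!pmd_present(old))
			return old;

		old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(old));
		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
		return old;
	}
)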


> 
>>
>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>> future call to pmd_present() will return true. And therefore any
> 
> IIRC the following semantics needs to be followed as expected by core MM.
> 
> -------------------------------------------------------------------------
> |	PMD states	|	pmd_present	|	pmd_trans_huge	|
> -------------------------------------------------------------------------
> |	Mapped		|	Yes		|	Yes		|
> -------------------------------------------------------------------------
> |	Splitting	|	Yes		|	Yes		|
> -------------------------------------------------------------------------
> |	Migration/Swap	|	No		|	No		|
> -------------------------------------------------------------------------

Indeed, the problem, as I see it, is that if pmd_mkinvalid() is called on a
"Migration/Swap" pmd, then a future call to pmd_present() will return Yes, which
is clearly wrong. pmd_trans_huge() will also return Yes due to:

static inline int pmd_trans_huge(pmd_t pmd)
{
	return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
}

At least this happens for arm64. Although Zi suggests other arches look like
they will do this too in the other email.

The reason is that arm64's pmd_mkinvalid() unconditionally sets
PMD_PRESENT_INVALID (bit 59) and clears PMD_SECT_VALID (bit 0) in the pte. So
next time pmd_present() is called it will see PMD_PRESENT_INVALID is set and
return true.
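
Sketched out (paraphrasing from memory, not the verbatim arm64 source):

	static inline pmd_t pmd_mkinvalid(pmd_t pmd)
	{
		/* Applied even if the input is a swap/migration entry. */
		pmd = set_pmd_bit(pmd, __pgprot(PMD_PRESENT_INVALID));
		pmd = clear_pmd_bit(pmd, __pgprot(PMD_SECT_VALID));
		return pmd;
	}

	static inline int pmd_present(pmd_t pmd)
	{
		/* True for valid entries *and* invalidated ones (bit 59 set). */
		return pte_present(pmd_pte(pmd)) || pmd_present_invalid(pmd);
	}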

> 
> 
>> lockless pgtable walker could see the migration entry pmd in this state
>> and start interpreting the fields as if it were present, leading to
>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
> 
>> Could you please explain how bad things might happen?

See 2 places where pmdp_get_lockless() is called in gup.c, without the PTL.
These could both return the swap pte for which pmd_mkinvalid() has been called.
In both cases, this would lead to the pmd_present() check erroneously returning
true, eventually causing incorrect interpretation of the pte fields. e.g.:

gup_pmd_range()
  pmd_t pmd = pmdp_get_lockless(pmdp);
  gup_huge_pmd(pmd, ...)
    page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);

page is guff.

Let me know what you think!

Thanks,
Ryan


>  
>> I suspect the same is possible on other architectures.
>>
>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>> good measure let's add a warning to the generic implementation of
>> pmdp_invalidate(). I've manually reviewed all other
>> pmdp_invalidate[_ad]() call sites and believe all others to be
>> conformant.
>>
>> This is a theoretical bug found during code review. I don't have any
>> test case to trigger it in practice.
>>
>> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>>
>> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>>
>> Thanks,
>> Ryan
>>
>>
>>  mm/huge_memory.c     | 5 +++--
>>  mm/pgtable-generic.c | 2 ++
>>  2 files changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 89f58c7603b2..80939ad00718 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>  	 * for this pmd), then we flush the SMP TLB and finally we write the
>>  	 * non-huge version of the pmd entry with pmd_populate.
>>  	 */
>> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>
>> -	pmd_migration = is_pmd_migration_entry(old_pmd);
>> +	pmd_migration = is_pmd_migration_entry(*pmd);
>>  	if (unlikely(pmd_migration)) {
>>  		swp_entry_t entry;
>>
>> +		old_pmd = *pmd;
>>  		entry = pmd_to_swp_entry(old_pmd);
>>  		page = pfn_swap_entry_to_page(entry);
>>  		write = is_writable_migration_entry(entry);
>> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>  	} else {
>> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>  		page = pmd_page(old_pmd);
>>  		folio = page_folio(page);
>>  		if (pmd_dirty(old_pmd)) {
>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>> index 4fcd959dcc4d..74e34ea90656 100644
>> --- a/mm/pgtable-generic.c
>> +++ b/mm/pgtable-generic.c
>> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>>  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>  		     pmd_t *pmdp)
>>  {
>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>  	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>>  	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>  	return old;
>> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>>  			 pmd_t *pmdp)
>>  {
>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>  	return pmdp_invalidate(vma, address, pmdp);
>>  }
>>  #endif
>> --
>> 2.25.1
>>
>>
Ryan Roberts April 26, 2024, 7:48 a.m. UTC | #5
On 25/04/2024 19:58, Zi Yan wrote:
> +Anshuman, who changed pmd_present() semantics. See:
> https://lore.kernel.org/all/1599627183-14453-2-git-send-email-anshuman.khandual@arm.com/ and commit b65399f6111b ("arm64/mm: Change
> THP helpers to comply with generic MM semantics")
> 
> On 25 Apr 2024, at 13:07, Ryan Roberts wrote:
> 
>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>> (non-present) migration entry. It calls pmdp_invalidate()
>> unconditionally on the pmdp and only determines if it is present or not
>> based on the returned old pmd. This is a problem for the migration entry
>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>> called for a present pmd.
>>
>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>> future call to pmd_present() will return true. And therefore any
> 
> But pmd_mkinvalid() on x86 does not behave this way. Maybe we should fix
> pmd_mkinvalid() on arm64 by not setting PMD_PRESENT_INVALID when the
> entry is already invalid. And add a test in mm/debug_vm_pgtable.c.

Yes, we *could* make pmd_mkinvalid() defensive. But we don't do that for the
other getters/setters (e.g. pte_mkwrite()). So not sure why we would want that
here. Ultimately it makes no semantic sense to invalidate a non-present pmd.

See my other mail for excessive detail.

Thanks,
Ryan

> 
> I notice that x86, risc-v, mips behave the same. loongarch also
> has the _PAGE_PRESENT_INVALID bit set during pmd_mkinvalid(), but its
> pmd_present() makes sure _PAGE_HUGE is set before checking _PAGE_PRESENT_INVALID.
> So it is not a problem for loongarch. Adding Huacai to confirm this.
> 
> Maybe pmd_present() on arm64 can do that too?
>
>> lockless pgtable walker could see the migration entry pmd in this state
>> and start interpreting the fields as if it were present, leading to
>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>> I suspect the same is possible on other architectures.
>>
>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>> good measure let's add a warning to the generic implementation of
>> pmdp_invalidate(). I've manually reviewed all other
>> pmdp_invalidate[_ad]() call sites and believe all others to be
>> conformant.
>>
>> This is a theoretical bug found during code review. I don't have any
>> test case to trigger it in practice.
>>
>> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>>
>> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>>
>> Thanks,
>> Ryan
>>
>>
>>  mm/huge_memory.c     | 5 +++--
>>  mm/pgtable-generic.c | 2 ++
>>  2 files changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 89f58c7603b2..80939ad00718 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>  	 * for this pmd), then we flush the SMP TLB and finally we write the
>>  	 * non-huge version of the pmd entry with pmd_populate.
>>  	 */
>> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>
>> -	pmd_migration = is_pmd_migration_entry(old_pmd);
>> +	pmd_migration = is_pmd_migration_entry(*pmd);
>>  	if (unlikely(pmd_migration)) {
>>  		swp_entry_t entry;
>>
>> +		old_pmd = *pmd;
>>  		entry = pmd_to_swp_entry(old_pmd);
>>  		page = pfn_swap_entry_to_page(entry);
>>  		write = is_writable_migration_entry(entry);
>> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>  	} else {
>> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>  		page = pmd_page(old_pmd);
>>  		folio = page_folio(page);
>>  		if (pmd_dirty(old_pmd)) {
>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>> index 4fcd959dcc4d..74e34ea90656 100644
>> --- a/mm/pgtable-generic.c
>> +++ b/mm/pgtable-generic.c
>> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>>  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>  		     pmd_t *pmdp)
>>  {
>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>  	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>>  	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>  	return old;
>> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>>  			 pmd_t *pmdp)
>>  {
>> +	VM_WARN_ON(!pmd_present(*pmdp));  	return pmdp_invalidate(vma, address, pmdp);
>>  }
>>  #endif
>> --
>> 2.25.1
> 
> 
> --
> Best Regards,
> Yan, Zi
Zi Yan April 26, 2024, 2:33 p.m. UTC | #6

On 26 Apr 2024, at 0:50, Anshuman Khandual wrote:

> On 4/26/24 00:28, Zi Yan wrote:
>> +Anshuman, who changed pmd_present() semantics. See:
>> https://lore.kernel.org/all/1599627183-14453-2-git-send-email-anshuman.khandual@arm.com/ and commit b65399f6111b ("arm64/mm: Change
>> THP helpers to comply with generic MM semantics")
>>
>> On 25 Apr 2024, at 13:07, Ryan Roberts wrote:
>>
>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>> (non-present) migration entry. It calls pmdp_invalidate()
>>> unconditionally on the pmdp and only determines if it is present or not
>>> based on the returned old pmd. This is a problem for the migration entry
>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>> called for a present pmd.
>>>
>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>> future call to pmd_present() will return true. And therefore any
>>
>> But pmd_mkinvalid() on x86 does not behave this way. Maybe we should fix
>> pmd_mkinvalid() on arm64 by not setting PMD_PRESENT_INVALID when the
>> entry is already invalid. And add a test in mm/debug_vm_pgtable.c.
>>
>> I notice that x86, risc-v, mips behave the same. loongarch also
>> has the _PAGE_PRESENT_INVALID bit set during pmd_mkinvalid(), but its
>> pmd_present() makes sure _PAGE_HUGE is set before checking _PAGE_PRESENT_INVALID.
>> So it is not a problem for loongarch. Adding Huacai to confirm this.
>>
>> Maybe pmd_present() on arm64 can do that too?
>
> pmd_present() should return true even for a splitting PMD which is not
> currently mapped. IIRC in all other architectures, there is a distinct
> identification bit for a huge page which remains set even when the entry
> becomes unmapped. That bit helps pmd_present() return true during the
> PMD splitting process.
>
> But on arm64 platform
>
> #define PTE_VALID                (_AT(pteval_t, 1) << 0)
> #define PMD_SECT_VALID           (_AT(pmdval_t, 1) << 0)
> #define PMD_TABLE_BIT            (_AT(pmdval_t, 1) << 1)
>
> When the PMD entry becomes unmapped, PTE_VALID gets cleared, and so does
> PMD_SECT_VALID, thus erasing its identity as a huge mapping. A software
> bit PMD_PRESENT_INVALID was added which helps preserve that huge page
> mapping identity once the entry becomes unmapped.

OK. PMD_SECT_VALID is just a different name for PTE_VALID. I wonder
if ~PMD_TABLE_BIT can be used as _PAGE_HUGE to indicate a huge page
PMD, since PMD_TABLE_BIT is already unset for a PMD huge page; for a swap
entry, since PMD_SECT_VALID is unset, PMD_TABLE_BIT is ignored. But it
would require PTE and PMD to have different swap entry encodings on arm64.
It might not be worth the effort.

>>
>>> lockless pgtable walker could see the migration entry pmd in this state
>>> and start interpreting the fields as if it were present, leading to
>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>> I suspect the same is possible on other architectures.
>>>
>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>> good measure let's add a warning to the generic implementation of
>>> pmdp_invalidate(). I've manually reviewed all other
>>> pmdp_invalidate[_ad]() call sites and believe all others to be
>>> conformant.
>>>
>>> This is a theoretical bug found during code review. I don't have any
>>> test case to trigger it in practice.
>>>
>>> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>> ---
>>>
>>> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>>>
>>> Thanks,
>>> Ryan
>>>
>>>
>>>  mm/huge_memory.c     | 5 +++--
>>>  mm/pgtable-generic.c | 2 ++
>>>  2 files changed, 5 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 89f58c7603b2..80939ad00718 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>  	 * for this pmd), then we flush the SMP TLB and finally we write the
>>>  	 * non-huge version of the pmd entry with pmd_populate.
>>>  	 */
>>> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>
>>> -	pmd_migration = is_pmd_migration_entry(old_pmd);
>>> +	pmd_migration = is_pmd_migration_entry(*pmd);
>>>  	if (unlikely(pmd_migration)) {
>>>  		swp_entry_t entry;
>>>
>>> +		old_pmd = *pmd;
>>>  		entry = pmd_to_swp_entry(old_pmd);
>>>  		page = pfn_swap_entry_to_page(entry);
>>>  		write = is_writable_migration_entry(entry);
>>> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>  	} else {
>>> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>  		page = pmd_page(old_pmd);
>>>  		folio = page_folio(page);
>>>  		if (pmd_dirty(old_pmd)) {
>>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>>> index 4fcd959dcc4d..74e34ea90656 100644
>>> --- a/mm/pgtable-generic.c
>>> +++ b/mm/pgtable-generic.c
>>> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>>>  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>  		     pmd_t *pmdp)
>>>  {
>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>  	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>>>  	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>>  	return old;
>>> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>>>  			 pmd_t *pmdp)
>>>  {
>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>  	return pmdp_invalidate(vma, address, pmdp);
>>>  }
>>>  #endif
>>> --
>>> 2.25.1
>>
>>
>> --
>> Best Regards,
>> Yan, Zi
Zi Yan April 26, 2024, 2:49 p.m. UTC | #7
On 26 Apr 2024, at 3:43, Ryan Roberts wrote:

> On 26/04/2024 05:19, Anshuman Khandual wrote:
>> On 4/25/24 22:37, Ryan Roberts wrote:
>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>> (non-present) migration entry. It calls pmdp_invalidate()
>>> unconditionally on the pmdp and only determines if it is present or not
>>> based on the returned old pmd. This is a problem for the migration entry
>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>> called for a present pmd.
>>
>> pmdp_invalidate() must be called only for a present PMD - is this expected
>> by core MM? Does this cause any problem otherwise?
>
> I'm saying that calling pmdp_invalidate() only on a pte_present()==true pte is
> the only semantic that makes sense. And, yes, it causes a problem if called on a
> pte_present()==false pte - that's exactly what I'm describing in this commit log.
>
> To labour the point, this is the logical type hierarchy of PTEs (and block-mapped
> PMDs) as I see it:
>
> ---8<----
>
> pte
>  |- present
>  |   |- valid
>  |   |- invalid
>  |
>  |- not_present
>      |- none
>      |- swap_pte
>
> present: All fields must be interpreted the way the HW sees them. e.g.
>          pte_pfn(), pte_write(), pte_dirty(), pte_young(), pte_mkwrite(),
>          pte_mkold() can all be legitimately used to query and modify the pte.
>
>   valid: The HW may access the pte, interpret the fields and create a TLB entry,
>          etc.
>
>   invalid: The HW will never access the pte or create a TLB entry for it.
>
> not_present: The fields are SW-defined. HW never accesses the PTE.
>
>   none: Unused; represents a hole
>
>   swap_pte: Contains a swap entry and swap pte bits. The contained swap entry
>             may 1 of a few different types e.g. actual swap entry, migration
>             entry, hw poison, etc.
>
> ---8<----
>
> We test present vs not_present with pte_present()
>
> We test none vs swap_pte with pte_none()
>
> valid vs invalid is slightly more vague. The core-mm can move a PMD from valid
> -> invalid by calling pmd_mkinvalid(). But it can't query the state. And it
> can't do this generically for a PTE.
>
>
> Based on that lot, it makes no sense to me that we should permit calling
> pmd_mkinvalid() on a non-present pte. Indeed, we don't permit calling
> pte_mkwrite() etc on a non-present pte. And those functions are not defensive;
> they don't check that the pte is present before making the change. They just
> trust that the core-mm will not call them for non-present ptes.

I am OK with disallowing calls to pmd_mkinvalid() on a non-present entry, but
would like to know how to enforce it or document it. Because x86, risc-v, mips,
and loongarch can call pmd_mkinvalid() on a non-present entry without causing
any issue, any developer who works on these arches but not arm64 can use
pmd_mkinvalid() improperly until someone else tests it on arm64. You will need
to add VM_WARN_ON() to all arch versions of pmd_mkinvalid().
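
i.e. something along these lines in each arch's header (a sketch only;
whether VM_WARN_ON() is usable from every one of those headers would need
checking):

	static inline pmd_t pmd_mkinvalid(pmd_t pmd)
	{
		VM_WARN_ON(!pmd_present(pmd));
		/* ... existing per-arch invalidation unchanged ... */
	}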

>
> The alternative approach would be to make pmdp_invalidate() defensive so that it
> checks the pmd is present before making any changes. But it doesn't semantically
> make sense to invalidate a non-present pmd in the first place so why call it
> under these circumstances? There is also a practical problem in that some arches
> implement their own pmdp_invalidate() so you would want to make all those
> defensive too, which would grow the size of the change.

Like I said above, if you do not do this, other arches' developers can break arm64
without knowing it, since their pmd_mkinvalid() does not change a non-present
PMD to a present one.

>>
>>>
>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>> future call to pmd_present() will return true. And therefore any
>>
>> IIRC the following semantics needs to be followed as expected by core MM.
>>
>> -------------------------------------------------------------------------
>> |	PMD states	|	pmd_present	|	pmd_trans_huge	|
>> -------------------------------------------------------------------------
>> |	Mapped		|	Yes		|	Yes		|
>> -------------------------------------------------------------------------
>> |	Splitting	|	Yes		|	Yes		|
>> -------------------------------------------------------------------------
>> |	Migration/Swap	|	No		|	No		|
>> -------------------------------------------------------------------------
>
> Indeed, the problem, as I see it, is that if pmd_mkinvalid() is called on a
> "Migration/Swap" pmd, then a future call to pmd_present() will return Yes, which
> is clearly wrong. pmd_trans_huge() will also return Yes due to:
>
> static inline int pmd_trans_huge(pmd_t pmd)
> {
> 	return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
> }
>
> At least this happens for arm64. Although Zi suggests other arches look like
> they will do this too in the other email.
>
> The reason is that arm64's pmd_mkinvalid() unconditionally sets
> PMD_PRESENT_INVALID (bit 59) and clears PMD_SECT_VALID (bit 0) in the pte. So
> next time pmd_present() is called it will see PMD_PRESENT_INVALID is set and
> return true.
>
>>
>>
>>> lockless pgtable walker could see the migration entry pmd in this state
>>> and start interpreting the fields as if it were present, leading to
>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>
>> Could you please explain how bad things might happen?
>
> See 2 places where pmdp_get_lockless() is called in gup.c, without the PTL.
> These could both return the swap pte for which pmd_mkinvalid() has been called.
> In both cases, this would lead to the pmd_present() check erroneously returning
> true, eventually causing incorrect interpretation of the pte fields. e.g.:
>
> gup_pmd_range()
>   pmd_t pmd = pmdp_get_lockless(pmdp);
>   gup_huge_pmd(pmd, ...)
>     page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
>
> page is guff.
>
> Let me know what you think!
>
> Thanks,
> Ryan
>
>
>>
>>> I suspect the same is possible on other architectures.
>>>
>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>> good measure let's add a warning to the generic implementation of
>>> pmdp_invalidate(). I've manually reviewed all other
>>> pmdp_invalidate[_ad]() call sites and believe all others to be
>>> conformant.
>>>
>>> This is a theoretical bug found during code review. I don't have any
>>> test case to trigger it in practice.
>>>
>>> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>> ---
>>>
>>> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>>>
>>> Thanks,
>>> Ryan
>>>
>>>
>>>  mm/huge_memory.c     | 5 +++--
>>>  mm/pgtable-generic.c | 2 ++
>>>  2 files changed, 5 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 89f58c7603b2..80939ad00718 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>  	 * for this pmd), then we flush the SMP TLB and finally we write the
>>>  	 * non-huge version of the pmd entry with pmd_populate.
>>>  	 */
>>> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>
>>> -	pmd_migration = is_pmd_migration_entry(old_pmd);
>>> +	pmd_migration = is_pmd_migration_entry(*pmd);
>>>  	if (unlikely(pmd_migration)) {
>>>  		swp_entry_t entry;
>>>
>>> +		old_pmd = *pmd;
>>>  		entry = pmd_to_swp_entry(old_pmd);
>>>  		page = pfn_swap_entry_to_page(entry);
>>>  		write = is_writable_migration_entry(entry);
>>> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>  	} else {
>>> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>  		page = pmd_page(old_pmd);
>>>  		folio = page_folio(page);
>>>  		if (pmd_dirty(old_pmd)) {
>>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>>> index 4fcd959dcc4d..74e34ea90656 100644
>>> --- a/mm/pgtable-generic.c
>>> +++ b/mm/pgtable-generic.c
>>> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>>>  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>  		     pmd_t *pmdp)
>>>  {
>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>  	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>>>  	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>>  	return old;
>>> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>>>  			 pmd_t *pmdp)
>>>  {
>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>  	return pmdp_invalidate(vma, address, pmdp);
>>>  }
>>>  #endif
>>> --
>>> 2.25.1
>>>
>>>


--
Best Regards,
Yan, Zi
Zi Yan April 26, 2024, 2:53 p.m. UTC | #8
On 26 Apr 2024, at 10:49, Zi Yan wrote:

> On 26 Apr 2024, at 3:43, Ryan Roberts wrote:
>
>> On 26/04/2024 05:19, Anshuman Khandual wrote:
>>> On 4/25/24 22:37, Ryan Roberts wrote:
>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>> unconditionally on the pmdp and only determines if it is present or not
>>>> based on the returned old pmd. This is a problem for the migration entry
>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>> called for a present pmd.
>>>
>>> pmdp_invalidate() must be called only for a present PMD - is this expected
>>> by core MM? Does this cause any problem otherwise?
>>
>> I'm saying that calling pmdp_invalidate() only on a pte_present()==true pte is
>> the only semantic that makes sense. And, yes, it causes a problem if called on a
>> pte_present()==false pte - that's exactly what I'm describing in this commit log.
>>
>> To labour the point, this is the logical type hierarchy of PTEs (and block-mapped
>> PMDs) as I see it:
>>
>> ---8<----
>>
>> pte
>>  |- present
>>  |   |- valid
>>  |   |- invalid
>>  |
>>  |- not_present
>>      |- none
>>      |- swap_pte
>>
>> present: All fields must be interpreted the way the HW sees them. e.g.
>>          pte_pfn(), pte_write(), pte_dirty(), pte_young(), pte_mkwrite(),
>>          pte_mkold() can all be legitimately used to query and modify the pte.
>>
>>   valid: The HW may access the pte, interpret the fields and create a TLB entry,
>>          etc.
>>
>>   invalid: The HW will never access the pte or create a TLB entry for it.
>>
>> not_present: The fields are SW-defined. HW never accesses the PTE.
>>
>>   none: Unused; represents a hole
>>
>>   swap_pte: Contains a swap entry and swap pte bits. The contained swap entry
>>             may be 1 of a few different types, e.g. actual swap entry, migration
>>             entry, hw poison, etc.
>>
>> ---8<----
>>
>> We test present vs not_present with pte_present()
>>
>> We test none vs swap_pte with pte_none()
>>
>> valid vs invalid is slightly more vague. The core-mm can move a PMD from valid
>> -> invalid by calling pmd_mkinvalid(). But it can't query the state. And it
>> can't do this generically for a PTE.
>>
>>
>> Based on that lot, it makes no sense to me that we should permit calling
>> pmd_mkinvalid() on a non-present pte. Indeed, we don't permit calling
>> pte_mkwrite() etc on a non-present pte. And those functions are not defensive;
>> they don't check that the pte is present before making the change. They just
>> trust that the core-mm will not call them for non-present ptes.
>
> I am OK with disallowing calls to pmd_mkinvalid() on a non-present entry, but
> would like to know how to enforce it or document it. Because x86, risc-v, mips,
> and loongarch can call pmd_mkinvalid() on a non-present entry without causing
> any issue, any developer who works on these arches but not arm64 can use
> pmd_mkinvalid() improperly until someone else tests it on arm64. You will need
> to add VM_WARN_ON() to all arch versions of pmd_mkinvalid().
>
>>
>> The alternative approach would be to make pmdp_invalidate() defensive so that it
>> checks the pmd is present before making any changes. But it doesn't semantically
>> make sense to invalidate a non-present pmd in the first place so why call it
>> under these circumstances? There is also a practical problem in that some arches
>> implement their own pmdp_invalidate() so you would want to make all those
>> defensive too, which would grow the size of the change.
>
> Like I said above, if you do not do this, other arches' developers can break arm64
> without knowing it, since their pmd_mkinvalid() does not change a non-present
> PMD to a present one.
>
>>>
>>>>
>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>> future call to pmd_present() will return true. And therefore any
>>>
>>> IIRC the following semantics needs to be followed as expected by core MM.
>>>
>>> -------------------------------------------------------------------------
>>> |	PMD states	|	pmd_present	|	pmd_trans_huge	|
>>> -------------------------------------------------------------------------
>>> |	Mapped		|	Yes		|	Yes		|
>>> -------------------------------------------------------------------------
>>> |	Splitting	|	Yes		|	Yes		|
>>> -------------------------------------------------------------------------
>>> |	Migration/Swap	|	No		|	No		|
>>> -------------------------------------------------------------------------
>>
>> Indeed, the problem, as I see it, is that if pmd_mkinvalid() is called on a
>> "Migration/Swap" pmd, then a future call to pmd_present() will return Yes, which
>> is clearly wrong. pmd_trans_huge() will also return Yes due to:
>>
>> static inline int pmd_trans_huge(pmd_t pmd)
>> {
>> 	return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
>> }
>>
>> At least this happens for arm64. Although Zi suggests other arches look like
>> they will do this too in the other email.
>>
>> The reason is that arm64's pmd_mkinvalid() unconditionally sets
>> PMD_PRESENT_INVALID (bit 59) and clears PMD_SECT_VALID (bit 0) in the pte. So
>> next time pmd_present() is called it will see PMD_PRESENT_INVALID is set and
>> return true.
>>
>>>
>>>
>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>> and start interpreting the fields as if it were present, leading to
>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>
>>> Could you please explain how bad things might happen?
>>
>> See 2 places where pmdp_get_lockless() is called in gup.c, without the PTL.
>> These could both return the swap pte for which pmd_mkinvalid() has been called.
>> In both cases, this would lead to the pmd_present() check erroneously returning
>> true, eventually causing incorrect interpretation of the pte fields. e.g.:
>>
>> gup_pmd_range()
>>   pmd_t pmd = pmdp_get_lockless(pmdp);
>>   gup_huge_pmd(pmd, ...)
>>     page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
>>
>> page is guff.
>>
>> Let me know what you think!

Adding JohnH to check the GUP code.

>>
>> Thanks,
>> Ryan
>>
>>
>>>
>>>> I suspect the same is possible on other architectures.
>>>>
>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>> good measure let's add a warning to the generic implementation of
>>>> pmdp_invalidate(). I've manually reviewed all other
>>>> pmdp_invalidate[_ad]() call sites and believe all others to be
>>>> conformant.
>>>>
>>>> This is a theoretical bug found during code review. I don't have any
>>>> test case to trigger it in practice.
>>>>
>>>> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>> ---
>>>>
>>>> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>>>>
>>>> Thanks,
>>>> Ryan
>>>>
>>>>
>>>>  mm/huge_memory.c     | 5 +++--
>>>>  mm/pgtable-generic.c | 2 ++
>>>>  2 files changed, 5 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 89f58c7603b2..80939ad00718 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  	 * for this pmd), then we flush the SMP TLB and finally we write the
>>>>  	 * non-huge version of the pmd entry with pmd_populate.
>>>>  	 */
>>>> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>>
>>>> -	pmd_migration = is_pmd_migration_entry(old_pmd);
>>>> +	pmd_migration = is_pmd_migration_entry(*pmd);
>>>>  	if (unlikely(pmd_migration)) {
>>>>  		swp_entry_t entry;
>>>>
>>>> +		old_pmd = *pmd;
>>>>  		entry = pmd_to_swp_entry(old_pmd);
>>>>  		page = pfn_swap_entry_to_page(entry);
>>>>  		write = is_writable_migration_entry(entry);
>>>> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>>  	} else {
>>>> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>>  		page = pmd_page(old_pmd);
>>>>  		folio = page_folio(page);
>>>>  		if (pmd_dirty(old_pmd)) {
>>>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>>>> index 4fcd959dcc4d..74e34ea90656 100644
>>>> --- a/mm/pgtable-generic.c
>>>> +++ b/mm/pgtable-generic.c
>>>> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>>>>  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>>  		     pmd_t *pmdp)
>>>>  {
>>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>>  	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>>>>  	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>>>  	return old;
>>>> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>>  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>>>>  			 pmd_t *pmdp)
>>>>  {
>>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>>  	return pmdp_invalidate(vma, address, pmdp);
>>>>  }
>>>>  #endif
>>>> --
>>>> 2.25.1
>>>>
>>>>
>
>
> --
> Best Regards,
> Yan, Zi


--
Best Regards,
Yan, Zi
John Hubbard April 27, 2024, 4:25 a.m. UTC | #9
On 4/26/24 7:53 AM, Zi Yan wrote:

Hi Zi (and Ryan)!

>>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>>> and start interpreting the fields as if it were present, leading to
>>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>>
>>>> Could you please explain how bad things might happen?
>>>
>>> See 2 places where pmdp_get_lockless() is called in gup.c, without the PTL.
>>> These could both return the swap pte for which pmd_mkinvalid() has been called.
>>> In both cases, this would lead to the pmd_present() check erroneously returning
>>> true, eventually causing incorrect interpretation of the pte fields. e.g.:
>>>
>>> gup_pmd_range()
>>>    pmd_t pmd = pmdp_get_lockless(pmdp);
>>>    gup_huge_pmd(pmd, ...)
>>>      page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
>>>
>>> page is guff.
>>>
>>> Let me know what you think!
> 
> Adding JohnH to check the GUP code.

Ryan is correct about this behavior.

By the way, remember that gup is not the only lockless page table
walker: there is also the CPU hardware itself, which inconveniently
refuses to bother with taking page table locks. :)

So if we have code that can make a non-present PTE appear to be present
to any of these page walkers, whether software or hardware, it's
definitely Not Good and will lead directly to bugs.

Since I had to study this patch and discussion a bit in order to
respond, I'll go ahead and also reply to the original patch with review
comments.


thanks,
John Hubbard April 27, 2024, 4:41 a.m. UTC | #10
On 4/25/24 10:07 AM, Ryan Roberts wrote:
> __split_huge_pmd_locked() can be called for a present THP, devmap or
> (non-present) migration entry. It calls pmdp_invalidate()
> unconditionally on the pmdp and only determines if it is present or not
> based on the returned old pmd. This is a problem for the migration entry
> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
> called for a present pmd.
> 
> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
> future call to pmd_present() will return true. And therefore any
> lockless pgtable walker could see the migration entry pmd in this state
> and start interpreting the fields as if it were present, leading to
> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
> I suspect the same is possible on other architectures.
> 
> Fix this by only calling pmdp_invalidate() for a present pmd. And for

Yes, this seems like a good design decision (after reading through the
discussion that you all had in the other threads).

> good measure let's add a warning to the generic implementation of
> pmdp_invalidate(). I've manually reviewed all other
> pmdp_invalidate[_ad]() call sites and believe all others to be
> conformant.
> 
> This is a theoretical bug found during code review. I don't have any
> test case to trigger it in practice.
> 
> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
> 
> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
> 
> Thanks,
> Ryan
> 
> 
>   mm/huge_memory.c     | 5 +++--
>   mm/pgtable-generic.c | 2 ++
>   2 files changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 89f58c7603b2..80939ad00718 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>   	 * for this pmd), then we flush the SMP TLB and finally we write the
>   	 * non-huge version of the pmd entry with pmd_populate.
>   	 */
> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
> 
> -	pmd_migration = is_pmd_migration_entry(old_pmd);
> +	pmd_migration = is_pmd_migration_entry(*pmd);
>   	if (unlikely(pmd_migration)) {
>   		swp_entry_t entry;
> 
> +		old_pmd = *pmd;
>   		entry = pmd_to_swp_entry(old_pmd);
>   		page = pfn_swap_entry_to_page(entry);
>   		write = is_writable_migration_entry(entry);
> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>   		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>   		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>   	} else {
> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);

This looks good, except that now I am deeply confused about the pre-existing
logic. I thought that migration entries were a subset of swap entries,
but this code seems to be treating is_pmd_migration_entry() as a
synonym for "is a swap entry". Can you shed any light on this for me?


>   		page = pmd_page(old_pmd);
>   		folio = page_folio(page);
>   		if (pmd_dirty(old_pmd)) {
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 4fcd959dcc4d..74e34ea90656 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>   pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>   		     pmd_t *pmdp)
>   {
> +	VM_WARN_ON(!pmd_present(*pmdp));
>   	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>   	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>   	return old;
> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>   pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>   			 pmd_t *pmdp)
>   {
> +	VM_WARN_ON(!pmd_present(*pmdp));

Should these be VM_WARN_ON_ONCE(), instead?

Also, this seems like a good place to put a little comment in, to mark the
new design constraint. Something like "Only present entries are allowed
to be invalidated", perhaps.


>   	return pmdp_invalidate(vma, address, pmdp);
>   }
>   #endif
> --
> 2.25.1
> 
> 

thanks,
Zi Yan April 27, 2024, 3:07 p.m. UTC | #11
On 27 Apr 2024, at 0:25, John Hubbard wrote:

> On 4/26/24 7:53 AM, Zi Yan wrote:
>
> Hi Zi (and Ryan)!
>
>>>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>>>> and start interpretting the fields as if it were present, leading to
>>>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>>>
>>>>> Could you please explain how bad things might happen ?
>>>>
>>>> See 2 places where pmdp_get_lockless() is called in gup.c, without the PTL.
>>>> These could both return the swap pte for which pmd_mkinvalid() has been called.
>>>> In both cases, this would lead to the pmd_present() check eroneously returning
>>>> true, eventually causing incorrect interpretation of the pte fields. e.g.:
>>>>
>>>> gup_pmd_range()
>>>>    pmd_t pmd = pmdp_get_lockless(pmdp);
>>>>    gup_huge_pmd(pmd, ...)
>>>>      page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
>>>>
>>>> page is guff.
>>>>
>>>> Let me know what you think!
>>
>> Add JohnH to check GUP code.
>
> Ryan is correct about this behavior.
>
> By the way, remember that gup is not the only lockless page table
> walker: there is also the CPU hardware itself, which inconveniently
> refuses to bother with taking page table locks. :)
>
> So if we have code that can make a non-present PTE appear to be present
> to any of these page walkers, whether software or hardware, it's a
> definitely Not Good and will lead directly to bugs.

This issue does not bother hardware, because PTE_VALID/PMD_SECT_VALID
is always unset and hardware always sees this PMD as invalid. It is a pure
software issue: during THP splitting we do not want hardware to access
the page, but we still want the kernel to be able to use pmd_page() to get
the pfn, so pmd_present() returns true even when PTE_VALID/PMD_SECT_VALID
is unset, by setting and checking the PMD_PRESENT_INVALID bit.
pmd_mkinvalid() sets PMD_PRESENT_INVALID, turning a migration entry from
!pmd_present() into pmd_present(), even though it remains an invalid PMD
to hardware.
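
For reference, the arm64 helpers look roughly like this (paraphrased and
simplified from arch/arm64/include/asm/pgtable.h around v6.9; check the
source for the exact details):

---8<----
/* SW bit (bit 59), only meaningful while PMD_SECT_VALID (bit 0) is clear. */
static inline int pmd_present_invalid(pmd_t pmd)
{
	return !!(pmd_val(pmd) & PMD_PRESENT_INVALID);
}

static inline int pmd_present(pmd_t pmd)
{
	/* Valid to HW, or temporarily invalidated by SW for splitting. */
	return pte_present(pmd_pte(pmd)) || pmd_present_invalid(pmd);
}

static inline pmd_t pmd_mkinvalid(pmd_t pmd)
{
	/*
	 * Unconditional: applied to a swap/migration entry this flips the
	 * entry from !pmd_present() to pmd_present(), which is the bug
	 * under discussion.
	 */
	pmd = set_pmd_bit(pmd, __pgprot(PMD_PRESENT_INVALID));
	pmd = clear_pmd_bit(pmd, __pgprot(PMD_SECT_VALID));
	return pmd;
}
---8<----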

>
> Since I had to study this patch and discussion a bit in order to
> respond, I'll go ahead and also reply to the original patch with review
> comments.
>
>
> thanks,
> -- 
> John Hubbard
> NVIDIA


--
Best Regards,
Yan, Zi
Zi Yan April 27, 2024, 3:14 p.m. UTC | #12
On 27 Apr 2024, at 0:41, John Hubbard wrote:

> On 4/25/24 10:07 AM, Ryan Roberts wrote:
>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>> (non-present) migration entry. It calls pmdp_invalidate()
>> unconditionally on the pmdp and only determines if it is present or not
>> based on the returned old pmd. This is a problem for the migration entry
>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>> called for a present pmd.
>>
>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>> future call to pmd_present() will return true. And therefore any
>> lockless pgtable walker could see the migration entry pmd in this state
>> and start interpretting the fields as if it were present, leading to
>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>> I suspect the same is possible on other architectures.
>>
>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>
> Yes, this seems like a good design decision (after reading through the
> discussion that you all had in the other threads).

This will only be good for arm64 and does not prevent developers on other
arches from writing code that breaks arm64, since only arm64's pmd_mkinvalid()
can turn a swap entry into a pmd_present() entry.

>
>> good measure let's add a warning to the generic implementation of
>> pmdp_invalidate(). I've manually reviewed all other
>> pmdp_invalidate[_ad]() call sites and believe all others to be
>> conformant.
>>
>> This is a theoretical bug found during code review. I don't have any
>> test case to trigger it in practice.
>>
>> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>>
>> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>>
>> Thanks,
>> Ryan
>>
>>
>>   mm/huge_memory.c     | 5 +++--
>>   mm/pgtable-generic.c | 2 ++
>>   2 files changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 89f58c7603b2..80939ad00718 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>   	 * for this pmd), then we flush the SMP TLB and finally we write the
>>   	 * non-huge version of the pmd entry with pmd_populate.
>>   	 */
>> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>
>> -	pmd_migration = is_pmd_migration_entry(old_pmd);
>> +	pmd_migration = is_pmd_migration_entry(*pmd);
>>   	if (unlikely(pmd_migration)) {
>>   		swp_entry_t entry;
>>
>> +		old_pmd = *pmd;
>>   		entry = pmd_to_swp_entry(old_pmd);
>>   		page = pfn_swap_entry_to_page(entry);
>>   		write = is_writable_migration_entry(entry);
>> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>   		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>   		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>   	} else {
>> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>
> This looks good, except that now I am deeply confused about the pre-existing
> logic. I thought that migration entries were a subset of swap entries,
> but this code seems to be treating is_pmd_migration_entry() as a
> synonym for "is a swap entry". Can you shed any light on this for me?

It is likely because the kernel only splits present pmds and migration pmds,
but I could be wrong, since the code has changed a lot since splitting of
migration pmds was added. We either need to check all call sites, or check
pmd_present() instead of is_pmd_migration_entry() and handle all possible
situations.
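
For anyone else wondering about the same thing, the relevant helpers (lightly
trimmed from include/linux/huge_mm.h and include/linux/swapops.h as of v6.9)
show that is_pmd_migration_entry() really does test for a migration entry,
not just "is a swap entry":

---8<----
/* Any non-none, non-present PMD is a swap PMD... */
static inline int is_swap_pmd(pmd_t pmd)
{
	return !pmd_none(pmd) && !pmd_present(pmd);
}

/*
 * ...and a migration entry is the only kind of swap PMD the kernel
 * currently creates (under CONFIG_ARCH_ENABLE_THP_MIGRATION).
 */
static inline int is_pmd_migration_entry(pmd_t pmd)
{
	return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
}
---8<----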

>
>
>>   		page = pmd_page(old_pmd);
>>   		folio = page_folio(page);
>>   		if (pmd_dirty(old_pmd)) {
>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>> index 4fcd959dcc4d..74e34ea90656 100644
>> --- a/mm/pgtable-generic.c
>> +++ b/mm/pgtable-generic.c
>> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>>   pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>   		     pmd_t *pmdp)
>>   {
>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>   	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>>   	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>   	return old;
>> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>   pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>>   			 pmd_t *pmdp)
>>   {
>> +	VM_WARN_ON(!pmd_present(*pmdp));
>
> Should these be VM_WARN_ON_ONCE(), instead?
>
> Also, this seems like a good place to put a little comment in, to mark the
> new design constraint. Something like "Only present entries are allowed
> to be invalidated", perhaps.
>
>
>>   	return pmdp_invalidate(vma, address, pmdp);
>>   }
>>   #endif
>> --
>> 2.25.1
>>
>>
>
> thanks,
> -- 
> John Hubbard
> NVIDIA


--
Best Regards,
Yan, Zi
John Hubbard April 27, 2024, 7:11 p.m. UTC | #13
On 4/27/24 8:14 AM, Zi Yan wrote:
> On 27 Apr 2024, at 0:41, John Hubbard wrote:
>> On 4/25/24 10:07 AM, Ryan Roberts wrote:
>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>> (non-present) migration entry. It calls pmdp_invalidate()
>>> unconditionally on the pmdp and only determines if it is present or not
>>> based on the returned old pmd. This is a problem for the migration entry
>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>> called for a present pmd.
>>>
>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>> future call to pmd_present() will return true. And therefore any
>>> lockless pgtable walker could see the migration entry pmd in this state
>>> and start interpretting the fields as if it were present, leading to
>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>> I suspect the same is possible on other architectures.
>>>
>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>
>> Yes, this seems like a good design decision (after reading through the
>> discussion that you all had in the other threads).
> 
> This will only be good for arm64 and does not prevent other arch developers
> to write code breaking arm64, since only arm64's pmd_mkinvalid() can turn
> a swap entry to a pmd_present() entry.

Well, let's characterize it in a bit more detail, then:

1) This patch will make things better for arm64. That's important!

2) Equally important, this patch does not make anything worse for
    other CPU arches.

3) This patch represents a new design constraint on the CPU arch
    layer, and thus requires documentation and whatever enforcement
    we can provide, in order to keep future code out of trouble.

3.a) See the VM_WARN_ON() hunks below.

3.b) I like the new design constraint, because it is reasonable and
      clearly understandable: don't invalidate a non-present page
      table entry.

I do wonder if there is somewhere else that this should be documented?


thanks,
Zi Yan April 27, 2024, 8:45 p.m. UTC | #14
On 27 Apr 2024, at 15:11, John Hubbard wrote:

> On 4/27/24 8:14 AM, Zi Yan wrote:
>> On 27 Apr 2024, at 0:41, John Hubbard wrote:
>>> On 4/25/24 10:07 AM, Ryan Roberts wrote:
>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>> unconditionally on the pmdp and only determines if it is present or not
>>>> based on the returned old pmd. This is a problem for the migration entry
>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>> called for a present pmd.
>>>>
>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>> future call to pmd_present() will return true. And therefore any
>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>> and start interpretting the fields as if it were present, leading to
>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>> I suspect the same is possible on other architectures.
>>>>
>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>
>>> Yes, this seems like a good design decision (after reading through the
>>> discussion that you all had in the other threads).
>>
>> This will only be good for arm64 and does not prevent other arch developers
>> to write code breaking arm64, since only arm64's pmd_mkinvalid() can turn
>> a swap entry to a pmd_present() entry.
>
> Well, let's characterize it in a bit more detail, then:
>
> 1) This patch will make things better for arm64. That's important!
>
> 2) Equally important, this patch does not make anything worse for
>    other CPU arches.
>
> 3) This patch represents a new design constraint on the CPU arch
>    layer, and thus requires documentation and whatever enforcement
>    we can provide, in order to keep future code out of trouble.
>
> 3.a) See the VM_WARN_ON() hunks below.
>
> 3.b) I like the new design constraint, because it is reasonable and
>      clearly understandable: don't invalidate a non-present page
>      table entry.
>
> I do wonder if there is somewhere else that this should be documented?

The issue is pmd_mkinvalid(), since it turns a swap entry into a pmd_present()
entry on arm64. This patch only adds a warning in pmdp_invalidate(), although
pmdp_invalidate() is currently the only caller of pmd_mkinvalid(). This means
any future user of pmd_mkinvalid() could cause the same issue on arm64 without
any warning.

I am not against changing the logic in __split_huge_pmd_locked() to fix arm64;
I just want to prevent future errors that might only manifest on arm64.

BTW, in terms of the patch itself, moving "pmdp_invalidate(vma, haddr, pmd)"
without moving the big comment above it is not OK, since later no one will be
able to figure out why that comment is there.
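
Concretely, something like this (a sketch of the requested fixup, not a
tested diff) would keep the comment attached to the call it documents:

---8<----
	} else {
		/*
		 * ...the existing block comment explaining the
		 * invalidate + TLB flush + repopulate dance moves here,
		 * next to the pmdp_invalidate() call it describes...
		 */
		old_pmd = pmdp_invalidate(vma, haddr, pmd);
		page = pmd_page(old_pmd);
---8<----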


>
>
> thanks,
> -- 
> John Hubbard
> NVIDIA
>
>
>>>
>>>> good measure let's add a warning to the generic implementation of
>>>> pmdp_invalidate(). I've manually reviewed all other
>>>> pmdp_invalidate[_ad]() call sites and believe all others to be
>>>> conformant.
>>>>
>>>> This is a theoretical bug found during code review. I don't have any
>>>> test case to trigger it in practice.
>>>>
>>>> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>> ---
>>>>
>>>> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>>>>
>>>> Thanks,
>>>> Ryan
>>>>
>>>>
>>>>    mm/huge_memory.c     | 5 +++--
>>>>    mm/pgtable-generic.c | 2 ++
>>>>    2 files changed, 5 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 89f58c7603b2..80939ad00718 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>    	 * for this pmd), then we flush the SMP TLB and finally we write the
>>>>    	 * non-huge version of the pmd entry with pmd_populate.
>>>>    	 */
>>>> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>>
>>>> -	pmd_migration = is_pmd_migration_entry(old_pmd);
>>>> +	pmd_migration = is_pmd_migration_entry(*pmd);
>>>>    	if (unlikely(pmd_migration)) {
>>>>    		swp_entry_t entry;
>>>>
>>>> +		old_pmd = *pmd;
>>>>    		entry = pmd_to_swp_entry(old_pmd);
>>>>    		page = pfn_swap_entry_to_page(entry);
>>>>    		write = is_writable_migration_entry(entry);
>>>> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>    		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>>    		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>>    	} else {
>>>> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>
>>> This looks good, except that now I am deeply confused about the pre-existing
>>> logic. I thought that migration entries were a subset of swap entries,
>>> but this code seems to be treating is_pmd_migration_entry() as a
>>> synonym for "is a swap entry". Can you shed any light on this for me?
>>
>> It is likely because kernel only split present pmd and migration pmd, but I
>> could be wrong since the code is changed a lot since splitting migration
>> pmd was added. We either need to check all call sites or check pmd_present()
>> instead of is_pmd_migration_entry() and handle all possible situations.
>>
>>>
>>>
>>>>    		page = pmd_page(old_pmd);
>>>>    		folio = page_folio(page);
>>>>    		if (pmd_dirty(old_pmd)) {
>>>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>>>> index 4fcd959dcc4d..74e34ea90656 100644
>>>> --- a/mm/pgtable-generic.c
>>>> +++ b/mm/pgtable-generic.c
>>>> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>>>>    pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>>    		     pmd_t *pmdp)
>>>>    {
>>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>>    	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>>>>    	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>>>    	return old;
>>>> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>>    pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>>>>    			 pmd_t *pmdp)
>>>>    {
>>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>
>>> Should these be VM_WARN_ON_ONCE(), instead?
>>>
>>> Also, this seems like a good place to put a little comment in, to mark the
>>> new design constraint. Something like "Only present entries are allowed
>>> to be invalidated", perhaps.
>>>
>>>
>>>>    	return pmdp_invalidate(vma, address, pmdp);
>>>>    }
>>>>    #endif
>>>> --
>>>> 2.25.1
>>>>
>>>>
>>>
>>> thanks,
>>> -- 
>>> John Hubbard
>>> NVIDIA
>>
>>
>> --
>> Best Regards,
>> Yan, Zi


--
Best Regards,
Yan, Zi
Zi Yan April 27, 2024, 8:48 p.m. UTC | #15
On 27 Apr 2024, at 16:45, Zi Yan wrote:

> On 27 Apr 2024, at 15:11, John Hubbard wrote:
>
>> On 4/27/24 8:14 AM, Zi Yan wrote:
>>> On 27 Apr 2024, at 0:41, John Hubbard wrote:
>>>> On 4/25/24 10:07 AM, Ryan Roberts wrote:
>>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>>> unconditionally on the pmdp and only determines if it is present or not
>>>>> based on the returned old pmd. This is a problem for the migration entry
>>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>>> called for a present pmd.
>>>>>
>>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>>> future call to pmd_present() will return true. And therefore any
>>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>>> and start interpretting the fields as if it were present, leading to
>>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>>> I suspect the same is possible on other architectures.
>>>>>
>>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>>
>>>> Yes, this seems like a good design decision (after reading through the
>>>> discussion that you all had in the other threads).
>>>
>>> This will only be good for arm64 and does not prevent other arch developers
>>> to write code breaking arm64, since only arm64's pmd_mkinvalid() can turn
>>> a swap entry to a pmd_present() entry.
>>
>> Well, let's characterize it in a bit more detail, then:
>>
>> 1) This patch will make things better for arm64. That's important!
>>
>> 2) Equally important, this patch does not make anything worse for
>>    other CPU arches.
>>
>> 3) This patch represents a new design constraint on the CPU arch
>>    layer, and thus requires documentation and whatever enforcement
>>    we can provide, in order to keep future code out of trouble.
>>
>> 3.a) See the VM_WARN_ON() hunks below.
>>
>> 3.b) I like the new design constraint, because it is reasonable and
>>      clearly understandable: don't invalidate a non-present page
>>      table entry.
>>
>> I do wonder if there is somewhere else that this should be documented?

In terms of documentation, at least in Documentation/mm/arch_pgtable_helpers.rst,
pmd_mkinvalid() entry needs to add "do not call on an invalid entry as
it breaks arm64".

>
> The issue is pmd_mkinvalid(), since it turns a swap entry into a pmd_present()
> entry on arm64. This patch only adds a warning on pmd_invalidate(), although
> pmd_invalidate() is the only caller of pmd_mkinvalid(). This means any
> future user of pmd_mkinvalid() can cause the same issue on arm64 without any
> warning.
>
> I am not against changing the logic in __split_huge_pmd_lock() to fix arm64,
> but just want to prevent future errors, that might only be possible on arm64.
>
> BTW, in terms of the patch itself, moving "pmdp_invalidate(vma, haddr, pmd)"
> without moving the big comment above it is not OK, since later no one can
> figure out why that comment is there.
>
>
>>
>>
>> thanks,
>> -- 
>> John Hubbard
>> NVIDIA
>>
>>
>>>>
>>>>> good measure let's add a warning to the generic implementation of
>>>>> pmdp_invalidate(). I've manually reviewed all other
>>>>> pmdp_invalidate[_ad]() call sites and believe all others to be
>>>>> conformant.
>>>>>
>>>>> This is a theoretical bug found during code review. I don't have any
>>>>> test case to trigger it in practice.
>>>>>
>>>>> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
>>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>>> ---
>>>>>
>>>>> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>>>>>
>>>>> Thanks,
>>>>> Ryan
>>>>>
>>>>>
>>>>>    mm/huge_memory.c     | 5 +++--
>>>>>    mm/pgtable-generic.c | 2 ++
>>>>>    2 files changed, 5 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index 89f58c7603b2..80939ad00718 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>>    	 * for this pmd), then we flush the SMP TLB and finally we write the
>>>>>    	 * non-huge version of the pmd entry with pmd_populate.
>>>>>    	 */
>>>>> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>>>
>>>>> -	pmd_migration = is_pmd_migration_entry(old_pmd);
>>>>> +	pmd_migration = is_pmd_migration_entry(*pmd);
>>>>>    	if (unlikely(pmd_migration)) {
>>>>>    		swp_entry_t entry;
>>>>>
>>>>> +		old_pmd = *pmd;
>>>>>    		entry = pmd_to_swp_entry(old_pmd);
>>>>>    		page = pfn_swap_entry_to_page(entry);
>>>>>    		write = is_writable_migration_entry(entry);
>>>>> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>>    		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>>>    		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>>>    	} else {
>>>>> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>>
>>>> This looks good, except that now I am deeply confused about the pre-existing
>>>> logic. I thought that migration entries were a subset of swap entries,
>>>> but this code seems to be treating is_pmd_migration_entry() as a
>>>> synonym for "is a swap entry". Can you shed any light on this for me?
>>>
>>> It is likely because kernel only split present pmd and migration pmd, but I
>>> could be wrong since the code is changed a lot since splitting migration
>>> pmd was added. We either need to check all call sites or check pmd_present()
>>> instead of is_pmd_migration_entry() and handle all possible situations.
>>>
>>>>
>>>>
>>>>>    		page = pmd_page(old_pmd);
>>>>>    		folio = page_folio(page);
>>>>>    		if (pmd_dirty(old_pmd)) {
>>>>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>>>>> index 4fcd959dcc4d..74e34ea90656 100644
>>>>> --- a/mm/pgtable-generic.c
>>>>> +++ b/mm/pgtable-generic.c
>>>>> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>>>>>    pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>>>    		     pmd_t *pmdp)
>>>>>    {
>>>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>>>    	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>>>>>    	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>>>>    	return old;
>>>>> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>>>    pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>>>>>    			 pmd_t *pmdp)
>>>>>    {
>>>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>>
>>>> Should these be VM_WARN_ON_ONCE(), instead?
>>>>
>>>> Also, this seems like a good place to put a little comment in, to mark the
>>>> new design constraint. Something like "Only present entries are allowed
>>>> to be invalidated", perhaps.
>>>>
>>>>
>>>>>    	return pmdp_invalidate(vma, address, pmdp);
>>>>>    }
>>>>>    #endif
>>>>> --
>>>>> 2.25.1
>>>>>
>>>>>
>>>>
>>>> thanks,
>>>> -- 
>>>> John Hubbard
>>>> NVIDIA
>>>
>>>
>>> --
>>> Best Regards,
>>> Yan, Zi
>
>
> --
> Best Regards,
> Yan, Zi


--
Best Regards,
Yan, Zi
Anshuman Khandual April 29, 2024, 3:36 a.m. UTC | #16
On 4/26/24 20:03, Zi Yan wrote:
> 
> 
> --
> Best Regards,
> Yan, Zi
> 
> On 26 Apr 2024, at 0:50, Anshuman Khandual wrote:
> 
>> On 4/26/24 00:28, Zi Yan wrote:
>>> +Anshuman, who changed pmd_present() semantics. See:
>>> https://lore.kernel.org/all/1599627183-14453-2-git-send-email-anshuman.khandual@arm.com/ and commit b65399f6111b ("arm64/mm: Change
>>> THP helpers to comply with generic MM semantics")
>>>
>>> On 25 Apr 2024, at 13:07, Ryan Roberts wrote:
>>>
>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>> unconditionally on the pmdp and only determines if it is present or not
>>>> based on the returned old pmd. This is a problem for the migration entry
>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>> called for a present pmd.
>>>>
>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>> future call to pmd_present() will return true. And therefore any
>>>
>>> But pmd_mkinvalid() on x86 does not behave so. Maybe we should fix
>>> pmd_mkinvalid() on arm64 by not setting PMD_PRESENT_INVALID when the
>>> entry is invalid already. And add a test in mm/debug_vm_pgtable.c.
>>>
>>> I notice that x86, risc-v, mips behave the same. loongarch also
>>> has _PAGE_PRESENT_INVALID bit set during pmd_mkinvalid(), but its
>>> pmd_present() makes sure _PAGE_HUEG is set before checks _PAGE_PRESENT_INVALID.
>>> So it is not a problem for loongarch. Add Huacai to confirm this.
>>>
>>> Maybe pmd_present() on arm64 can do that too?
>>
>> pmd_present() should return true even for a splitting PMD which is not
>> currently mapped. IIRC in all other architectures, there is a distinct
>> identification bit for huge page which stays back, even when the entry
>> becomes unmapped. That bit helps pmd_present() return true, during PMD
>> splitting process.
>>
>> But on arm64 platform
>>
>> #define PTE_VALID                (_AT(pteval_t, 1) << 0)
>> #define PMD_SECT_VALID           (_AT(pmdval_t, 1) << 0)
>> #define PMD_TABLE_BIT            (_AT(pmdval_t, 1) << 1)
>>
>> When the PMD entry becomes unmapped, PTE_VALID gets cleared, so does the
>> PMD_SECT_VALID, thus erasing its identity as a huge mapping. A software
>> bit PMD_PRESENT_INVALID was added which helps in preserving that cleared
>> huge page mapping identity once it becomes unmapped.
> 
> OK. PMD_SECT_VALID is just a different name of PTE_VALID. I wonder

Non PTE Level descriptor

	BIT[1:0] = X0 Invalid entry
		 = 01 Block entries (Huge Pages)
		 = 11 Table entries

PTE Level descriptor

	BIT[1:0] = X0 Invalid entry
		 = 01 Reserved invalid entry
		 = 11 Page entries

Although PTE_VALID and PMD_SECT_VALID share the same bit position, a huge
page mapping also requires the table bit to be cleared.

> if ~PMD_TABLE_BIT can be used as _PAGE_HUGE to indicate it is a huge page
> PMD, since PMD_TABLE_BIT is unset for PMD huge page already, for swap
> entry, since PMD_SECT_VALID is unset, PMD_TABLE_BIT is ignored. But it
> will require PTE and PMD have different swap entry encoding on arm64.
> It might not be worth the effort.

Right, and also, relying on just the cleared table bit to identify a huge page
mapping might be problematic in subtle ways.

> 
>>>
>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>> and start interpretting the fields as if it were present, leading to
>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>> I suspect the same is possible on other architectures.
>>>>
>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>> good measure let's add a warning to the generic implementation of
>>>> pmdp_invalidate(). I've manually reviewed all other
>>>> pmdp_invalidate[_ad]() call sites and believe all others to be
>>>> conformant.
>>>>
>>>> This is a theoretical bug found during code review. I don't have any
>>>> test case to trigger it in practice.
>>>>
>>>> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>> ---
>>>>
>>>> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>>>>
>>>> Thanks,
>>>> Ryan
>>>>
>>>>
>>>>  mm/huge_memory.c     | 5 +++--
>>>>  mm/pgtable-generic.c | 2 ++
>>>>  2 files changed, 5 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 89f58c7603b2..80939ad00718 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  	 * for this pmd), then we flush the SMP TLB and finally we write the
>>>>  	 * non-huge version of the pmd entry with pmd_populate.
>>>>  	 */
>>>> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>>
>>>> -	pmd_migration = is_pmd_migration_entry(old_pmd);
>>>> +	pmd_migration = is_pmd_migration_entry(*pmd);
>>>>  	if (unlikely(pmd_migration)) {
>>>>  		swp_entry_t entry;
>>>>
>>>> +		old_pmd = *pmd;
>>>>  		entry = pmd_to_swp_entry(old_pmd);
>>>>  		page = pfn_swap_entry_to_page(entry);
>>>>  		write = is_writable_migration_entry(entry);
>>>> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>>  	} else {
>>>> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>>  		page = pmd_page(old_pmd);
>>>>  		folio = page_folio(page);
>>>>  		if (pmd_dirty(old_pmd)) {
>>>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>>>> index 4fcd959dcc4d..74e34ea90656 100644
>>>> --- a/mm/pgtable-generic.c
>>>> +++ b/mm/pgtable-generic.c
>>>> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>>>>  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>>  		     pmd_t *pmdp)
>>>>  {
>>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>>  	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>>>>  	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>>>  	return old;
>>>> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>>  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>>>>  			 pmd_t *pmdp)
>>>>  {
>>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>>  	return pmdp_invalidate(vma, address, pmdp);
>>>>  }
>>>>  #endif
>>>> --
>>>> 2.25.1
>>>
>>>
>>> --
>>> Best Regards,
>>> Yan, Zi
Anshuman Khandual April 29, 2024, 5:07 a.m. UTC | #17
On 4/26/24 13:13, Ryan Roberts wrote:
> On 26/04/2024 05:19, Anshuman Khandual wrote:
>> On 4/25/24 22:37, Ryan Roberts wrote:
>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>> (non-present) migration entry. It calls pmdp_invalidate()
>>> unconditionally on the pmdp and only determines if it is present or not
>>> based on the returned old pmd. This is a problem for the migration entry
>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>> called for a present pmd.
>>
>> pmdp_invalidate() must be called only for present PMD - is this expected
>> by core MM ? Does this cause any problem otherwise ?
> 
> I'm saying that only calling pmdp_invalidate() on a pte_present()==true pte is
> the only semantic that makes sense. And, yes, it causes a problem if called on a
> pte_present()==false pte - that's exactly what I'm describing in this commit log.
> 
> To labour the point, this is the logical type hierachy of PTEs (and block-mapped
> PMDs) as I see it:
> 
> ---8<----
> 
> pte
>  |- present
>  |   |- valid
>  |   |- invalid
>  |
>  |- not_present
>      |- none
>      |- swap_pte
> 
> present: All fields must be interpretted the way the HW sees them. e.g.
>          pte_pfn(), pte_write(), pte_dirty(), pte_young(), pte_mkwrite(),
>          pte_mkold() can all be legitimately used to query and modify the pte.
> 
>   valid: The HW may access the pte, interpret the fields and create a TLB entry,
>          etc.
> 
>   invalid: The HW will never access the pte or create a TLB entry for it.
> 
> not_present: The fields are SW-defined. HW never accesses the PTE.
> 
>   none: Unused; represents a hole
> 
>   swap_pte: Contains a swap entry and swap pte bits. The contained swap entry
>             may 1 of a few different types e.g. actual swap entry, migration
>             entry, hw poison, etc.

Sure, makes sense.

> 
> ---8<----
> 
> We test present vs not_present with pte_present()
> 
> We test none vs swap_pte with pte_none()

Agreed.

> 
> valid vs invalid is slightly more vague. The core-mm can move a PMD from valid
> -> invalid by calling pmd_mkinvalid(). But it can't query the state. And it
> can't do this generically for a PTE.
> 
> 
> Based on that lot, it makes no sense to me that we should permit calling
> pmd_mkinvalid() on a non-present pte. Indeed, we don't permit calling
> pte_mkwrite() etc on a non-present pte. And those functions are not defensive;
> they don't check that the pte is present before making the change. They just
> trust that the core-mm will not call them for non-present ptes.
> 
> The alternative approach would be to make pmdp_invalidate() defensive so that it
> checks the pmd is present before making any changes. But it doesn't semantically
> make sense to invalidate a non-present pmd in the first place so why call it
> under these circumstances? There is also a practical problem in that some arches
> implement their own pmdp_invalidate() so you would want to make all those
> defensive too, which would grow the size of the change.

I would suggest adding warnings in such arch-specific pmdp_invalidate() helpers
to catch further unexpected calls on non-present PMDs?
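
For instance, x86's override could gain the same check as the generic
version (a rough sketch based on arch/x86/mm/pgtable.c as of v6.9, not a
tested diff):

---8<----
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
			 pmd_t *pmdp)
{
	VM_WARN_ON_ONCE(!pmd_present(*pmdp));

	/*
	 * No TLB flush is needed here: once the entry is invalid, the HW
	 * can no longer set the access/dirty bits behind our back.
	 */
	return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
}
---8<----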

> 
> 
>>
>>>
>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>> future call to pmd_present() will return true. And therefore any
>>
>> IIRC the following semantics needs to be followed as expected by core MM.
>>
>> -------------------------------------------------------------------------
>> |	PMD states	|	pmd_present	|	pmd_trans_huge	|
>> -------------------------------------------------------------------------
>> |	Mapped		|	Yes		|	Yes		|
>> -------------------------------------------------------------------------
>> |	Splitting	|	Yes		|	Yes		|
>> -------------------------------------------------------------------------
>> |	Migration/Swap	|	No		|	No		|
>> -------------------------------------------------------------------------
> 
> Indeed, the problem, as I see it, is if pmd_mkinvalid() is called on a
> "Migration/Swap" pmd, then a future call to pmd_present() will return Yes, which
> is clearly wrong. pmd_trans_huge() will also return Yes due to:
> 
> static inline int pmd_trans_huge(pmd_t pmd)
> {
> 	return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
> }

Agreed, pmd_mkinvalid() on "Migration/Swap" entries making any subsequent
pmd_present() and pmd_trans_huge() return true is problematic. As you said,
the solution is to prevent pmd_mkinvalid() from being called on
"Migration/Swap" entries.

> 
> At least this happens for arm64. Although Zi suggests other arches look like
> they will do this too in the other email.
> 
> The reason is that arm64's pmd_mkinvalid() unconditionally sets
> PMD_PRESENT_INVALID (bit 59) and clears PMD_SECT_VALID (bit 0) in the pte. So
> next time pmd_present() is called it will see PMD_PRESENT_INVALID is set and
> return true.

> 
>>
>>
>>> lockless pgtable walker could see the migration entry pmd in this state
>>> and start interpretting the fields as if it were present, leading to
>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>
>> Could you please explain how bad things might happen ?
> 
> See 2 places where pmdp_get_lockless() is called in gup.c, without the PTL.
> These could both return the swap pte for which pmd_mkinvalid() has been called.
> In both cases, this would lead to the pmd_present() check eroneously returning
> true, eventually causing incorrect interpretation of the pte fields. e.g.:
> 
> gup_pmd_range()
>   pmd_t pmd = pmdp_get_lockless(pmdp);
>   gup_huge_pmd(pmd, ...)
>     page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
> 
> page is guff.
> 
> Let me know what you think!

Agreed, the page here might no longer be valid for GUP, as the migration
would have changed the pfn in the meantime.

> 
> Thanks,
> Ryan
> 
> 
>>  
>>> I suspect the same is possible on other architectures.
>>>
>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>> good measure let's add a warning to the generic implementation of
>>> pmdp_invalidate(). I've manually reviewed all other
>>> pmdp_invalidate[_ad]() call sites and believe all others to be
>>> conformant.
>>>
>>> This is a theoretical bug found during code review. I don't have any
>>> test case to trigger it in practice.
>>>
>>> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>> ---
>>>
>>> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>>>
>>> Thanks,
>>> Ryan
>>>
>>>
>>>  mm/huge_memory.c     | 5 +++--
>>>  mm/pgtable-generic.c | 2 ++
>>>  2 files changed, 5 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 89f58c7603b2..80939ad00718 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>  	 * for this pmd), then we flush the SMP TLB and finally we write the
>>>  	 * non-huge version of the pmd entry with pmd_populate.
>>>  	 */
>>> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>
>>> -	pmd_migration = is_pmd_migration_entry(old_pmd);
>>> +	pmd_migration = is_pmd_migration_entry(*pmd);
>>>  	if (unlikely(pmd_migration)) {
>>>  		swp_entry_t entry;
>>>
>>> +		old_pmd = *pmd;
>>>  		entry = pmd_to_swp_entry(old_pmd);
>>>  		page = pfn_swap_entry_to_page(entry);
>>>  		write = is_writable_migration_entry(entry);
>>> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>  	} else {
>>> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>  		page = pmd_page(old_pmd);
>>>  		folio = page_folio(page);
>>>  		if (pmd_dirty(old_pmd)) {
>>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>>> index 4fcd959dcc4d..74e34ea90656 100644
>>> --- a/mm/pgtable-generic.c
>>> +++ b/mm/pgtable-generic.c
>>> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>>>  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>  		     pmd_t *pmdp)
>>>  {
>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>  	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>>>  	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>>  	return old;
>>> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>>>  			 pmd_t *pmdp)
>>>  {
>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>  	return pmdp_invalidate(vma, address, pmdp);
>>>  }
>>>  #endif
>>> --
>>> 2.25.1
>>>
>>>
>
Anshuman Khandual April 29, 2024, 5:25 a.m. UTC | #18
On 4/26/24 20:19, Zi Yan wrote:
> On 26 Apr 2024, at 3:43, Ryan Roberts wrote:
> 
>> On 26/04/2024 05:19, Anshuman Khandual wrote:
>>> On 4/25/24 22:37, Ryan Roberts wrote:
>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>> unconditionally on the pmdp and only determines if it is present or not
>>>> based on the returned old pmd. This is a problem for the migration entry
>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>> called for a present pmd.
>>>
>>> pmdp_invalidate() must be called only for present PMD - is this expected
>>> by core MM ? Does this cause any problem otherwise ?
>>
>> I'm saying that only calling pmdp_invalidate() on a pte_present()==true pte is
>> the only semantic that makes sense. And, yes, it causes a problem if called on a
>> pte_present()==false pte - that's exactly what I'm describing in this commit log.
>>
>> To labour the point, this is the logical type hierachy of PTEs (and block-mapped
>> PMDs) as I see it:
>>
>> ---8<----
>>
>> pte
>>  |- present
>>  |   |- valid
>>  |   |- invalid
>>  |
>>  |- not_present
>>      |- none
>>      |- swap_pte
>>
>> present: All fields must be interpretted the way the HW sees them. e.g.
>>          pte_pfn(), pte_write(), pte_dirty(), pte_young(), pte_mkwrite(),
>>          pte_mkold() can all be legitimately used to query and modify the pte.
>>
>>   valid: The HW may access the pte, interpret the fields and create a TLB entry,
>>          etc.
>>
>>   invalid: The HW will never access the pte or create a TLB entry for it.
>>
>> not_present: The fields are SW-defined. HW never accesses the PTE.
>>
>>   none: Unused; represents a hole
>>
>>   swap_pte: Contains a swap entry and swap pte bits. The contained swap entry
>>             may 1 of a few different types e.g. actual swap entry, migration
>>             entry, hw poison, etc.
>>
>> ---8<----
>>
>> We test present vs not_present with pte_present()
>>
>> We test none vs swap_pte with pte_none()
>>
>> valid vs invalid is slightly more vague. The core-mm can move a PMD from valid
>> -> invalid by calling pmd_mkinvalid(). But it can't query the state. And it
>> can't do this generically for a PTE.
>>
>>
>> Based on that lot, it makes no sense to me that we should permit calling
>> pmd_mkinvalid() on a non-present pte. Indeed, we don't permit calling
>> pte_mkwrite() etc on a non-present pte. And those functions are not defensive;
>> they don't check that the pte is present before making the change. They just
>> trust that the core-mm will not call them for non-present ptes.
> 
> I am OK with disallowing to call pmd_mkinvalid() on a non-present entry, but
> would like to know how to enforce it or document it. Because x86, risc-v, mips,
> and loongarch can call pmd_mkinvalid() on a non-present entry without causing
> any issue, any developer who work on these arches but arm64 can use pmd_mkinvalid()
> improperly until someone else tests it on arm64. You will need to add VM_WARM_ON()
> to all arch versions of pmd_mkinvalid().

Adding VM_WARN_ON() to all arch versions of pmd_mkinvalid(), or returning the
old pmd unchanged when the entry is not present, would both be viable solutions.

> 
>>
>> The alternative approach would be to make pmdp_invalidate() defensive so that it
>> checks the pmd is present before making any changes. But it doesn't semantically
>> make sense to invalidate a non-present pmd in the first place so why call it
>> under these circumstances? There is also a practical problem in that some arches
>> implement their own pmdp_invalidate() so you would want to make all those
>> defensive too, which would grow the size of the change.
> 
> Like I said above, if you do not do this, other arches developers can break arm64
> without knowing it, since their pmd_mkinvalid() do not change a non-present
> PMD to a present one.

Unlike other PMD helpers, pmdp_invalidate() is a bit non-trivial and hence can
be made a bit more defensive.
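
Something like this, perhaps (a sketch which assumes that returning the
unmodified entry is an acceptable fallback - that choice would still need
to be agreed on):

---8<----
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		      pmd_t *pmdp)
{
	pmd_t old = *pmdp;

	/* Refuse to "invalidate" an entry that was never present. */
	if (!pmd_present(old)) {
		VM_WARN_ON_ONCE(1);
		return old;
	}

	old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(old));
	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return old;
}
---8<----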

> 
>>>
>>>>
>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>> future call to pmd_present() will return true. And therefore any
>>>
>>> IIRC the following semantics needs to be followed as expected by core MM.
>>>
>>> -------------------------------------------------------------------------
>>> |	PMD states	|	pmd_present	|	pmd_trans_huge	|
>>> -------------------------------------------------------------------------
>>> |	Mapped		|	Yes		|	Yes		|
>>> -------------------------------------------------------------------------
>>> |	Splitting	|	Yes		|	Yes		|
>>> -------------------------------------------------------------------------
>>> |	Migration/Swap	|	No		|	No		|
>>> -------------------------------------------------------------------------
>>
>> Indeed, the problem, as I see it, is if pmd_mkinvalid() is called on a
>> "Migration/Swap" pmd, then a future call to pmd_present() will return Yes, which
>> is clearly wrong. pmd_trans_huge() will also return Yes due to:
>>
>> static inline int pmd_trans_huge(pmd_t pmd)
>> {
>> 	return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
>> }
>>
>> At least this happens for arm64. Although Zi suggests other arches look like
>> they will do this too in the other email.
>>
>> The reason is that arm64's pmd_mkinvalid() unconditionally sets
>> PMD_PRESENT_INVALID (bit 59) and clears PMD_SECT_VALID (bit 0) in the pte. So
>> next time pmd_present() is called it will see PMD_PRESENT_INVALID is set and
>> return true.
>>
>>>
>>>
>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>> and start interpretting the fields as if it were present, leading to
>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>
>>> Could you please explain how bad things might happen ?
>>
>> See 2 places where pmdp_get_lockless() is called in gup.c, without the PTL.
>> These could both return the swap pte for which pmd_mkinvalid() has been called.
>> In both cases, this would lead to the pmd_present() check eroneously returning
>> true, eventually causing incorrect interpretation of the pte fields. e.g.:
>>
>> gup_pmd_range()
>>   pmd_t pmd = pmdp_get_lockless(pmdp);
>>   gup_huge_pmd(pmd, ...)
>>     page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
>>
>> page is guff.
>>
>> Let me know what you think!
>>
>> Thanks,
>> Ryan
>>
>>
>>>
>>>> I suspect the same is possible on other architectures.
>>>>
>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>> good measure let's add a warning to the generic implementation of
>>>> pmdp_invalidate(). I've manually reviewed all other
>>>> pmdp_invalidate[_ad]() call sites and believe all others to be
>>>> conformant.
>>>>
>>>> This is a theoretical bug found during code review. I don't have any
>>>> test case to trigger it in practice.
>>>>
>>>> Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>> ---
>>>>
>>>> Applies on top of v6.9-rc5. Passes all the mm selftests on arm64.
>>>>
>>>> Thanks,
>>>> Ryan
>>>>
>>>>
>>>>  mm/huge_memory.c     | 5 +++--
>>>>  mm/pgtable-generic.c | 2 ++
>>>>  2 files changed, 5 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 89f58c7603b2..80939ad00718 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -2513,12 +2513,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  	 * for this pmd), then we flush the SMP TLB and finally we write the
>>>>  	 * non-huge version of the pmd entry with pmd_populate.
>>>>  	 */
>>>> -	old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>>
>>>> -	pmd_migration = is_pmd_migration_entry(old_pmd);
>>>> +	pmd_migration = is_pmd_migration_entry(*pmd);
>>>>  	if (unlikely(pmd_migration)) {
>>>>  		swp_entry_t entry;
>>>>
>>>> +		old_pmd = *pmd;
>>>>  		entry = pmd_to_swp_entry(old_pmd);
>>>>  		page = pfn_swap_entry_to_page(entry);
>>>>  		write = is_writable_migration_entry(entry);
>>>> @@ -2529,6 +2529,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>>  	} else {
>>>> +		old_pmd = pmdp_invalidate(vma, haddr, pmd);
>>>>  		page = pmd_page(old_pmd);
>>>>  		folio = page_folio(page);
>>>>  		if (pmd_dirty(old_pmd)) {
>>>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>>>> index 4fcd959dcc4d..74e34ea90656 100644
>>>> --- a/mm/pgtable-generic.c
>>>> +++ b/mm/pgtable-generic.c
>>>> @@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>>>>  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>>  		     pmd_t *pmdp)
>>>>  {
>>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>>  	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
>>>>  	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>>>>  	return old;
>>>> @@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>>>>  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
>>>>  			 pmd_t *pmdp)
>>>>  {
>>>> +	VM_WARN_ON(!pmd_present(*pmdp));
>>>>  	return pmdp_invalidate(vma, address, pmdp);
>>>>  }
>>>>  #endif
>>>> --
>>>> 2.25.1
>>>>
>>>>
> 
> 
> --
> Best Regards,
> Yan, Zi
Anshuman Khandual April 29, 2024, 5:31 a.m. UTC | #19
On 4/27/24 20:37, Zi Yan wrote:
> On 27 Apr 2024, at 0:25, John Hubbard wrote:
> 
>> On 4/26/24 7:53 AM, Zi Yan wrote:
>>
>> Hi Zi (and Ryan)!
>>
>>>>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>>>>> and start interpretting the fields as if it were present, leading to
>>>>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>>>> Could you please explain how bad things might happen ?
>>>>> See 2 places where pmdp_get_lockless() is called in gup.c, without the PTL.
>>>>> These could both return the swap pte for which pmd_mkinvalid() has been called.
>>>>> In both cases, this would lead to the pmd_present() check eroneously returning
>>>>> true, eventually causing incorrect interpretation of the pte fields. e.g.:
>>>>>
>>>>> gup_pmd_range()
>>>>>    pmd_t pmd = pmdp_get_lockless(pmdp);
>>>>>    gup_huge_pmd(pmd, ...)
>>>>>      page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
>>>>>
>>>>> page is guff.
>>>>>
>>>>> Let me know what you think!
>>> Add JohnH to check GUP code.
>> Ryan is correct about this behavior.
>>
>> By the way, remember that gup is not the only lockless page table
>> walker: there is also the CPU hardware itself, which inconveniently
>> refuses to bother with taking page table locks. 
Anshuman Khandual April 29, 2024, 6:17 a.m. UTC | #20
On 4/28/24 02:18, Zi Yan wrote:
> On 27 Apr 2024, at 16:45, Zi Yan wrote:
> 
>> On 27 Apr 2024, at 15:11, John Hubbard wrote:
>>
>>> On 4/27/24 8:14 AM, Zi Yan wrote:
>>>> On 27 Apr 2024, at 0:41, John Hubbard wrote:
>>>>> On 4/25/24 10:07 AM, Ryan Roberts wrote:
>>>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>>>> unconditionally on the pmdp and only determines if it is present or not
>>>>>> based on the returned old pmd. This is a problem for the migration entry
>>>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>>>> called for a present pmd.
>>>>>>
>>>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>>>> future call to pmd_present() will return true. And therefore any
>>>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>>>> and start interpretting the fields as if it were present, leading to
>>>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>>>> I suspect the same is possible on other architectures.
>>>>>>
>>>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>>> Yes, this seems like a good design decision (after reading through the
>>>>> discussion that you all had in the other threads).
>>>> This will only be good for arm64 and does not prevent other arch developers
>>>> to write code breaking arm64, since only arm64's pmd_mkinvalid() can turn
>>>> a swap entry to a pmd_present() entry.
>>> Well, let's characterize it in a bit more detail, then:
>>>
>>> 1) This patch will make things better for arm64. That's important!
>>>
>>> 2) Equally important, this patch does not make anything worse for
>>>    other CPU arches.
>>>
>>> 3) This patch represents a new design constraint on the CPU arch
>>>    layer, and thus requires documentation and whatever enforcement
>>>    we can provide, in order to keep future code out of trouble.
>>>
>>> 3.a) See the VM_WARN_ON() hunks below.
>>>
>>> 3.b) I like the new design constraint, because it is reasonable and
>>>      clearly understandable: don't invalidate a non-present page
>>>      table entry.
>>>
>>> I do wonder if there is somewhere else that this should be documented?
> In terms of documentation, at least in Documentation/mm/arch_pgtable_helpers.rst,
> pmd_mkinvalid() entry needs to add "do not call on an invalid entry as
> it breaks arm64"

s/invalid/non-present ?

But validation via mm/debug_vm_pgtable.c would require a predictable return
value from pmd_mkinvalid(), e.g. returning the old pmd when the entry is not
present:

	ASSERT(pmd == pmd_mkinvalid(pmd)) /* when pmd is not present */

Otherwise, I wonder how the semantics could be validated in the test.
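
A minimal sketch of such a test (hypothetical helper name; it assumes the
"return the entry unchanged when not present" semantic above, and that a
migration entry can be cooked up the way mm/debug_vm_pgtable.c already does
elsewhere):

---8<----
static void __init pmd_mkinvalid_nonpresent_test(swp_entry_t entry)
{
	pmd_t pmd = swp_entry_to_pmd(entry);	/* non-present by construction */

	WARN_ON(pmd_present(pmd));
	/* Must be a no-op on a non-present entry... */
	WARN_ON(!pmd_same(pmd, pmd_mkinvalid(pmd)));
	/* ...and in particular must not make it look present (arm64 today). */
	WARN_ON(pmd_present(pmd_mkinvalid(pmd)));
}
---8<----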
Ryan Roberts April 29, 2024, 9:29 a.m. UTC | #21
On 27/04/2024 20:11, John Hubbard wrote:
> On 4/27/24 8:14 AM, Zi Yan wrote:
>> On 27 Apr 2024, at 0:41, John Hubbard wrote:
>>> On 4/25/24 10:07 AM, Ryan Roberts wrote:
>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>> unconditionally on the pmdp and only determines if it is present or not
>>>> based on the returned old pmd. This is a problem for the migration entry
>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>> called for a present pmd.
>>>>
>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>> future call to pmd_present() will return true. And therefore any
>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>> and start interpretting the fields as if it were present, leading to
>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>> I suspect the same is possible on other architectures.
>>>>
>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>
>>> Yes, this seems like a good design decision (after reading through the
>>> discussion that you all had in the other threads).
>>
>> This will only be good for arm64 and does not prevent other arch developers
>> to write code breaking arm64, since only arm64's pmd_mkinvalid() can turn
>> a swap entry to a pmd_present() entry.
> 
> Well, let's characterize it in a bit more detail, then:

Hi All,

Thanks for all the feedback! I had thought that this patch would be entirely
uncontroversial - obviously I was wrong :)

I've read all the emails, and am trying to summarize a way forward here...

> 
> 1) This patch will make things better for arm64. That's important!
> 
> 2) Equally important, this patch does not make anything worse for
>    other CPU arches.
> 
> 3) This patch represents a new design constraint on the CPU arch
>    layer, and thus requires documentation and whatever enforcement
>    we can provide, in order to keep future code out of trouble.

I know it's only semantics, but I don't view this as a new design constraint. I
see it as an existing constraint that was previously being violated, and this
patch aims to fix that. The generic version of pmdp_invalidate() unconditionally
does a TLB invalidation on the address range covered by the pmd. That makes no
sense unless the pmd was previously present. So my conclusion is that the
function only expects to be called for present pmds.

Additionally, Documentation/mm/arch_pgtable_helpers.rst already says this:

"
| pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
"

I read "mapped" to be a synonym for "present". So I think it's already
documented. Happy to explicitly change "mapped" to "present" though, if it helps?

Finally, [1] which is linked from Documentation/mm/arch_pgtable_helpers.rst,
also implies this constraint, although it doesn't explicitly say it.

[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/

> 
> 3.a) See the VM_WARN_ON() hunks below.

It sounds like everybody would be happy if I sprinkle these into the arches that
override pmdp_invalidate[_ad]()? There are 3 arches that have their own version
of pmdp_invalidate(); powerpc, s390 and sparc. And 1 that has its own version of
pmdp_invalidate_ad(); x86. I'll add them in all of those.

I'll use VM_WARN_ON_ONCE() as suggested by John.
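
To make that concrete, the check is a single line at the top of each
function; for the generic fallback of pmdp_invalidate_ad() it would look
like this (a sketch - note the v1 diff below still uses plain VM_WARN_ON(),
and the arch overrides would gain the same line):

pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
			 pmd_t *pmdp)
{
	/* Invalidation only makes sense for a present pmd. */
	VM_WARN_ON_ONCE(!pmd_present(*pmdp));
	return pmdp_invalidate(vma, address, pmdp);
}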

I'd rather not put it directly into pmd_mkinvalid() since that would set a
precedent for adding them absolutely everywhere. (e.g. pte_mkdirty(), ...).

> 
> 3.b) I like the new design constraint, because it is reasonable and
>      clearly understandable: don't invalidate a non-present page
>      table entry.
> 
> I do wonder if there is somewhere else that this should be documented?

If I change:

"
| pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
"

To:

"
| pmd_mkinvalid             | Invalidates a present PMD; do not call for       |
|                             non-present pmd [1]                              |
"

Is that sufficient? (I'll do the same for pud_mkinvalid() too.)

> 
> 
> thanks,
Zi Yan April 29, 2024, 2:41 p.m. UTC | #22
On 29 Apr 2024, at 2:17, Anshuman Khandual wrote:

> On 4/28/24 02:18, Zi Yan wrote:
>> On 27 Apr 2024, at 16:45, Zi Yan wrote:
>>
>>> On 27 Apr 2024, at 15:11, John Hubbard wrote:
>>>
>>>> On 4/27/24 8:14 AM, Zi Yan wrote:
>>>>> On 27 Apr 2024, at 0:41, John Hubbard wrote:
>>>>>> On 4/25/24 10:07 AM, Ryan Roberts wrote:
>>>>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>>>>> unconditionally on the pmdp and only determines if it is present or not
>>>>>>> based on the returned old pmd. This is a problem for the migration entry
>>>>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>>>>> called for a present pmd.
>>>>>>>
>>>>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>>>>> future call to pmd_present() will return true. And therefore any
>>>>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>>>>> and start interpreting the fields as if it were present, leading to
>>>>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>>>>> I suspect the same is possible on other architectures.
>>>>>>>
>>>>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>>>> Yes, this seems like a good design decision (after reading through the
>>>>>> discussion that you all had in the other threads).
>>>>> This will only be good for arm64 and does not prevent other arch developers
>>>>> from writing code that breaks arm64, since only arm64's pmd_mkinvalid() can turn
>>>>> a swap entry into a pmd_present() entry.
>>>> Well, let's characterize it in a bit more detail, then:
>>>>
>>>> 1) This patch will make things better for arm64. That's important!
>>>>
>>>> 2) Equally important, this patch does not make anything worse for
>>>>    other CPU arches.
>>>>
>>>> 3) This patch represents a new design constraint on the CPU arch
>>>>    layer, and thus requires documentation and whatever enforcement
>>>>    we can provide, in order to keep future code out of trouble.
>>>>
>>>> 3.a) See the VM_WARN_ON() hunks below.
>>>>
>>>> 3.b) I like the new design constraint, because it is reasonable and
>>>>      clearly understandable: don't invalidate a non-present page
>>>>      table entry.
>>>>
>>>> I do wonder if there is somewhere else that this should be documented?
>> In terms of documentation, at least in Documentation/mm/arch_pgtable_helpers.rst,
>> pmd_mkinvalid() entry needs to add "do not call on an invalid entry as
>> it breaks arm64"
>
> s/invalid/non-present ?
>
> But validation via mm/debug_vm_pgtable.c would require a predictable return
> value from pmd_mkinvalid(), e.g. return the old pmd when the entry is not present.
>
> 	ASSERT(pmd == pmd_mkinvalid(pmd)) - when pmd is not present
>
> Otherwise, I wonder how the semantics could be validated in the test.

I thought about checking this in mm/debug_vm_pgtable.c but concluded it is
impossible. We want to make sure no one uses pmd_mkinvalid() on
!pmd_present() entries, but that requires checking pmd_mkinvalid()'s input
entries at code-writing time. A runtime test like mm/debug_vm_pgtable.c does
not help.

I even thought about changing pmd_mkinvalid()'s input parameter type to
a new pmd_invalid_t, so the type system could enforce it, but when we read
a PMD entry, there is no way of determining statically whether it is valid
or not without inspecting the bits.
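
For the record, the idea was roughly the following (entirely hypothetical;
pmd_checked_t and pmd_check() are made-up names), and it fails for exactly
the reason above - the presence check still has to happen at runtime:

/* Hypothetical wrapper type that can only be produced via a runtime check. */
typedef struct { pmd_t pmd; } pmd_checked_t;

static inline pmd_checked_t pmd_check(pmd_t pmd)
{
	VM_WARN_ON_ONCE(!pmd_present(pmd));	/* still a runtime check */
	return (pmd_checked_t){ pmd };
}

/* pmd_mkinvalid() would then reject a raw pmd_t at compile time. */
pmd_t pmd_mkinvalid(pmd_checked_t pmd);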

To me, the most future-proof method is to make arm64's pmd_mkinvalid() return
its input unchanged if the entry is !pmd_present(). This aligns arm64's
pmd_mkinvalid() with the pmd_mkinvalid() semantics of the other arches, so that
if someone writes code using pmd_mkinvalid() that runs on arches other than
arm64, the code would also work on arm64. But I am not going to insist on this
and will let Ryan make the call.
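
Concretely, something like this on arm64 (just a sketch; __pmd_mkinvalid()
stands in for the existing arm64 PMD_PRESENT_INVALID manipulation and is not
a real helper):

static inline pmd_t pmd_mkinvalid(pmd_t pmd)
{
	/* Leave swap/migration entries untouched. */
	if (!pmd_present(pmd))
		return pmd;

	return __pmd_mkinvalid(pmd);
}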


--
Best Regards,
Yan, Zi
Zi Yan April 29, 2024, 2:45 p.m. UTC | #23
On 29 Apr 2024, at 5:29, Ryan Roberts wrote:

> On 27/04/2024 20:11, John Hubbard wrote:
>> On 4/27/24 8:14 AM, Zi Yan wrote:
>>> On 27 Apr 2024, at 0:41, John Hubbard wrote:
>>>> On 4/25/24 10:07 AM, Ryan Roberts wrote:
>>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>>> unconditionally on the pmdp and only determines if it is present or not
>>>>> based on the returned old pmd. This is a problem for the migration entry
>>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>>> called for a present pmd.
>>>>>
>>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>>> future call to pmd_present() will return true. And therefore any
>>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>>> and start interpreting the fields as if it were present, leading to
>>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>>> I suspect the same is possible on other architectures.
>>>>>
>>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>>
>>>> Yes, this seems like a good design decision (after reading through the
>>>> discussion that you all had in the other threads).
>>>
>>> This will only be good for arm64 and does not prevent other arch developers
>>> from writing code that breaks arm64, since only arm64's pmd_mkinvalid() can turn
>>> a swap entry into a pmd_present() entry.
>>
>> Well, let's characterize it in a bit more detail, then:
>
> Hi All,
>
> Thanks for all the feedback! I had thought that this patch would be entirely
> uncontroversial - obviously I was wrong :)
>
> I've read all the emails, and am trying to summarize a way forward here...
>
>>
>> 1) This patch will make things better for arm64. That's important!
>>
>> 2) Equally important, this patch does not make anything worse for
>>    other CPU arches.
>>
>> 3) This patch represents a new design constraint on the CPU arch
>>    layer, and thus requires documentation and whatever enforcement
>>    we can provide, in order to keep future code out of trouble.
>
> I know it's only semantics, but I don't view this as a new design constraint. I
> see it as an existing constraint that was previously being violated, and this
> patch aims to fix that. The generic version of pmdp_invalidate() unconditionally
> does a TLB invalidation on the address range covered by the pmd. That makes no
> sense unless the pmd was previously present. So my conclusion is that the
> function only expects to be called for present pmds.
>
> Additionally Documentation/mm/arch_pgtable_helpers.rst already says this:
>
> "
> | pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
> "
>
> I read "mapped" to be a synonym for "present". So I think it's already
> documented. Happy to explicitly change "mapped" to "present" though, if it helps?
>
> Finally, [1] which is linked from Documentation/mm/arch_pgtable_helpers.rst,
> also implies this constraint, although it doesn't explicitly say it.
>
> [1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
>
>>
>> 3.a) See the VM_WARN_ON() hunks below.
>
> It sounds like everybody would be happy if I sprinkle these into the arches that
> override pmdp_invalidate[_ad]()? There are 3 arches that have their own version
> of pmdp_invalidate(); powerpc, s390 and sparc. And 1 that has its own version of
> pmdp_invalidate_ad(); x86. I'll add them in all of those.
>
> I'll use VM_WARN_ON_ONCE() as suggested by John.
>
> I'd rather not put it directly into pmd_mkinvalid() since that would set a
> precedent for adding them absolutely everywhere. (e.g. pte_mkdirty(), ...).

I understand your concern here. I assume you also understand the potential issue
with this, namely it does not prevent one from using pmd_mkinvalid() improperly
and causing a bug that might only appear on arm64.

>
>>
>> 3.b) I like the new design constraint, because it is reasonable and
>>      clearly understandable: don't invalidate a non-present page
>>      table entry.
>>
>> I do wonder if there is somewhere else that this should be documented?
>
> If I change:
>
> "
> | pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
> "
>
> To:
>
> "
> | pmd_mkinvalid             | Invalidates a present PMD; do not call for       |
> |                             non-present pmd [1]                              |
> "
>
> Is that sufficient? (I'll do the same for pud_mkinvalid() too.)

Sounds good to me.

Also, if you move pmdp_invalidate(), please move the big comment with it to
avoid confusion. Thanks.

--
Best Regards,
Yan, Zi
Zi Yan April 29, 2024, 3:29 p.m. UTC | #24
On 29 Apr 2024, at 10:45, Zi Yan wrote:

> On 29 Apr 2024, at 5:29, Ryan Roberts wrote:
>
>> On 27/04/2024 20:11, John Hubbard wrote:
>>> On 4/27/24 8:14 AM, Zi Yan wrote:
>>>> On 27 Apr 2024, at 0:41, John Hubbard wrote:
>>>>> On 4/25/24 10:07 AM, Ryan Roberts wrote:
>>>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>>>> unconditionally on the pmdp and only determines if it is present or not
>>>>>> based on the returned old pmd. This is a problem for the migration entry
>>>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>>>> called for a present pmd.
>>>>>>
>>>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>>>> future call to pmd_present() will return true. And therefore any
>>>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>>>> and start interpreting the fields as if it were present, leading to
>>>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>>>> I suspect the same is possible on other architectures.
>>>>>>
>>>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>>>
>>>>> Yes, this seems like a good design decision (after reading through the
>>>>> discussion that you all had in the other threads).
>>>>
>>>> This will only be good for arm64 and does not prevent other arch developers
>>>> from writing code that breaks arm64, since only arm64's pmd_mkinvalid() can turn
>>>> a swap entry into a pmd_present() entry.
>>>
>>> Well, let's characterize it in a bit more detail, then:
>>
>> Hi All,
>>
>> Thanks for all the feedback! I had thought that this patch would be entirely
>> uncontroversial - obviously I was wrong :)
>>
>> I've read all the emails, and am trying to summarize a way forward here...
>>
>>>
>>> 1) This patch will make things better for arm64. That's important!
>>>
>>> 2) Equally important, this patch does not make anything worse for
>>>    other CPU arches.
>>>
>>> 3) This patch represents a new design constraint on the CPU arch
>>>    layer, and thus requires documentation and whatever enforcement
>>>    we can provide, in order to keep future code out of trouble.
>>
>> I know it's only semantics, but I don't view this as a new design constraint. I
>> see it as an existing constraint that was previously being violated, and this
>> patch aims to fix that. The generic version of pmdp_invalidate() unconditionally
>> does a TLB invalidation on the address range covered by the pmd. That makes no
>> sense unless the pmd was previously present. So my conclusion is that the
>> function only expects to be called for present pmds.
>>
>> Additionally Documentation/mm/arch_pgtable_helpers.rst already says this:
>>
>> "
>> | pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
>> "
>>
>> I read "mapped" to be a synonym for "present". So I think it's already
>> documented. Happy to explicitly change "mapped" to "present" though, if it helps?
>>
>> Finally, [1] which is linked from Documentation/mm/arch_pgtable_helpers.rst,
>> also implies this constraint, although it doesn't explicitly say it.
>>
>> [1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
>>
>>>
>>> 3.a) See the VM_WARN_ON() hunks below.
>>
>> It sounds like everybody would be happy if I sprinkle these into the arches that
>> override pmdp_invalidate[_ad]()? There are 3 arches that have their own version
>> of pmdp_invalidate(); powerpc, s390 and sparc. And 1 that has its own version of
>> pmdp_invalidate_ad(); x86. I'll add them in all of those.
>>
>> I'll use VM_WARN_ON_ONCE() as suggested by John.
>>
>> I'd rather not put it directly into pmd_mkinvalid() since that would set a
>> precedent for adding them absolutely everywhere. (e.g. pte_mkdirty(), ...).
>
> I understand your concern here. I assume you also understand the potential issue
> with this, namely it does not prevent one from using pmd_mkinvalid() improperly
> and causing a bug that might only appear on arm64.
>
>>
>>>
>>> 3.b) I like the new design constraint, because it is reasonable and
>>>      clearly understandable: don't invalidate a non-present page
>>>      table entry.
>>>
>>> I do wonder if there is somewhere else that this should be documented?
>>
>> If I change:
>>
>> "
>> | pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
>> "
>>
>> To:
>>
>> "
>> | pmd_mkinvalid             | Invalidates a present PMD; do not call for       |
>> |                             non-present pmd [1]                              |
>> "
>>
>> Is that sufficient? (I'll do the same for pud_mkinvalid() too.)
>
> Sounds good to me.
>
> Also, if you move pmdp_invalidate(), please move the big comment with it to
> avoid confusion. Thanks.

And the Fixes tag does not need to go back that far, since this only affects arm64,
which enabled THP migration in commit 53fa117bb33c ("arm64/mm: Enable THP migration").

--
Best Regards,
Yan, Zi
Ryan Roberts April 29, 2024, 3:34 p.m. UTC | #25
On 29/04/2024 15:45, Zi Yan wrote:
> On 29 Apr 2024, at 5:29, Ryan Roberts wrote:
> 
>> On 27/04/2024 20:11, John Hubbard wrote:
>>> On 4/27/24 8:14 AM, Zi Yan wrote:
>>>> On 27 Apr 2024, at 0:41, John Hubbard wrote:
>>>>> On 4/25/24 10:07 AM, Ryan Roberts wrote:
>>>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>>>> unconditionally on the pmdp and only determines if it is present or not
>>>>>> based on the returned old pmd. This is a problem for the migration entry
>>>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>>>> called for a present pmd.
>>>>>>
>>>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>>>> future call to pmd_present() will return true. And therefore any
>>>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>>>> and start interpreting the fields as if it were present, leading to
>>>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>>>> I suspect the same is possible on other architectures.
>>>>>>
>>>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>>>
>>>>> Yes, this seems like a good design decision (after reading through the
>>>>> discussion that you all had in the other threads).
>>>>
>>>> This will only be good for arm64 and does not prevent other arch developers
>>>> from writing code that breaks arm64, since only arm64's pmd_mkinvalid() can turn
>>>> a swap entry into a pmd_present() entry.
>>>
>>> Well, let's characterize it in a bit more detail, then:
>>
>> Hi All,
>>
>> Thanks for all the feedback! I had thought that this patch would be entirely
>> uncontroversial - obviously I was wrong :)
>>
>> I've read all the emails, and am trying to summarize a way forward here...
>>
>>>
>>> 1) This patch will make things better for arm64. That's important!
>>>
>>> 2) Equally important, this patch does not make anything worse for
>>>    other CPU arches.
>>>
>>> 3) This patch represents a new design constraint on the CPU arch
>>>    layer, and thus requires documentation and whatever enforcement
>>>    we can provide, in order to keep future code out of trouble.
>>
>> I know it's only semantics, but I don't view this as a new design constraint. I
>> see it as an existing constraint that was previously being violated, and this
>> patch aims to fix that. The generic version of pmdp_invalidate() unconditionally
>> does a TLB invalidation on the address range covered by the pmd. That makes no
>> sense unless the pmd was previously present. So my conclusion is that the
>> function only expects to be called for present pmds.
>>
>> Additionally Documentation/mm/arch_pgtable_helpers.rst already says this:
>>
>> "
>> | pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
>> "
>>
>> I read "mapped" to be a synonym for "present". So I think it's already
>> documented. Happy to explicitly change "mapped" to "present" though, if it helps?
>>
>> Finally, [1] which is linked from Documentation/mm/arch_pgtable_helpers.rst,
>> also implies this constraint, although it doesn't explicitly say it.
>>
>> [1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
>>
>>>
>>> 3.a) See the VM_WARN_ON() hunks below.
>>
>> It sounds like everybody would be happy if I sprinkle these into the arches that
>> override pmdp_invalidate[_ad]()? There are 3 arches that have their own version
>> of pmdp_invalidate(); powerpc, s390 and sparc. And 1 that has its own version of
>> pmdp_invalidate_ad(); x86. I'll add them in all of those.
>>
>> I'll use VM_WARN_ON_ONCE() as suggested by John.
>>
>> I'd rather not put it directly into pmd_mkinvalid() since that would set a
>> precedent for adding them absolutely everywhere. (e.g. pte_mkdirty(), ...).
> 
> I understand your concern here. I assume you also understand the potential issue
> with this, namely it does not prevent one from using pmd_mkinvalid() improperly
> and causing a bug that might only appear on arm64.

Are you saying that arm64 is the *only* arch where pmd_mkinvalid() can turn a
swap pte into a present pte? I hadn't appreciated that; in your first reply to
this patch you said "I notice that x86, risc-v, mips behave the same" - I
thought you were saying they behaved the same as arm64, but on re-reading, I
think I've taken that out of context.

But in spite of that, it still remains my view that making arm64's
pmd_mkinvalid() robust to non-present ptes is not the right fix - or at least
not sufficient on its own. That change on its own would still result in issuing
a TLBI for the non-present pte from pmdp_invalidate(). That's not a correctness
issue, but certainly could be a performance concern.

I think it's much better to have the design constraint that pmd_mkinvalid(),
pmdp_invalidate() and pmdp_invalidate_ad() can only be called for present ptes.
And I think the combination of WARNs and docs that we've discussed should be
enough to allay your concerns about introduction of arm64-specific bugs.

> 
>>
>>>
>>> 3.b) I like the new design constraint, because it is reasonable and
>>>      clearly understandable: don't invalidate a non-present page
>>>      table entry.
>>>
>>> I do wonder if there is somewhere else that this should be documented?
>>
>> If I change:
>>
>> "
>> | pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
>> "
>>
>> To:
>>
>> "
>> | pmd_mkinvalid             | Invalidates a present PMD; do not call for       |
>> |                             non-present pmd [1]                              |
>> "
>>
>>> Is that sufficient? (I'll do the same for pud_mkinvalid() too.)
> 
> Sounds good to me.
> 
> Also, if you move pmdp_invalidate(), please move the big comment with it to
> avoid confusion. Thanks.

Yes, good spot - I'll move it.

> 
> --
> Best Regards,
> Yan, Zi
Ryan Roberts April 29, 2024, 3:35 p.m. UTC | #26
On 29/04/2024 16:29, Zi Yan wrote:
> On 29 Apr 2024, at 10:45, Zi Yan wrote:
> 
>> On 29 Apr 2024, at 5:29, Ryan Roberts wrote:
>>
>>> On 27/04/2024 20:11, John Hubbard wrote:
>>>> On 4/27/24 8:14 AM, Zi Yan wrote:
>>>>> On 27 Apr 2024, at 0:41, John Hubbard wrote:
>>>>>> On 4/25/24 10:07 AM, Ryan Roberts wrote:
>>>>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>>>>> unconditionally on the pmdp and only determines if it is present or not
>>>>>>> based on the returned old pmd. This is a problem for the migration entry
>>>>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>>>>> called for a present pmd.
>>>>>>>
>>>>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>>>>> future call to pmd_present() will return true. And therefore any
>>>>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>>>>> and start interpreting the fields as if it were present, leading to
>>>>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>>>>> I suspect the same is possible on other architectures.
>>>>>>>
>>>>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>>>>
>>>>>> Yes, this seems like a good design decision (after reading through the
>>>>>> discussion that you all had in the other threads).
>>>>>
>>>>> This will only be good for arm64 and does not prevent other arch developers
>>>>> from writing code that breaks arm64, since only arm64's pmd_mkinvalid() can turn
>>>>> a swap entry into a pmd_present() entry.
>>>>
>>>> Well, let's characterize it in a bit more detail, then:
>>>
>>> Hi All,
>>>
>>> Thanks for all the feedback! I had thought that this patch would be entirely
>>> uncontroversial - obviously I was wrong :)
>>>
>>> I've read all the emails, and am trying to summarize a way forward here...
>>>
>>>>
>>>> 1) This patch will make things better for arm64. That's important!
>>>>
>>>> 2) Equally important, this patch does not make anything worse for
>>>>    other CPU arches.
>>>>
>>>> 3) This patch represents a new design constraint on the CPU arch
>>>>    layer, and thus requires documentation and whatever enforcement
>>>>    we can provide, in order to keep future code out of trouble.
>>>
>>> I know it's only semantics, but I don't view this as a new design constraint. I
>>> see it as an existing constraint that was previously being violated, and this
>>> patch aims to fix that. The generic version of pmdp_invalidate() unconditionally
>>> does a TLB invalidation on the address range covered by the pmd. That makes no
>>> sense unless the pmd was previously present. So my conclusion is that the
>>> function only expects to be called for present pmds.
>>>
>>> Additionally Documentation/mm/arch_pgtable_helpers.rst already says this:
>>>
>>> "
>>> | pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
>>> "
>>>
>>> I read "mapped" to be a synonym for "present". So I think it's already
>>> documented. Happy to explicitly change "mapped" to "present" though, if it helps?
>>>
>>> Finally, [1] which is linked from Documentation/mm/arch_pgtable_helpers.rst,
>>> also implies this constraint, although it doesn't explicitly say it.
>>>
>>> [1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
>>>
>>>>
>>>> 3.a) See the VM_WARN_ON() hunks below.
>>>
>>> It sounds like everybody would be happy if I sprinkle these into the arches that
>>> override pmdp_invalidate[_ad]()? There are 3 arches that have their own version
>>> of pmdp_invalidate(); powerpc, s390 and sparc. And 1 that has its own version of
>>> pmdp_invalidate_ad(); x86. I'll add them in all of those.
>>>
>>> I'll use VM_WARN_ON_ONCE() as suggested by John.
>>>
>>> I'd rather not put it directly into pmd_mkinvalid() since that would set a
>>> precedent for adding them absolutely everywhere. (e.g. pte_mkdirty(), ...).
>>
>> I understand your concern here. I assume you also understand the potential issue
>> with this, namely it does not prevent one from using pmd_mkinvalid() improperly
>> and causing a bug that might only appear on arm64.
>>
>>>
>>>>
>>>> 3.b) I like the new design constraint, because it is reasonable and
>>>>      clearly understandable: don't invalidate a non-present page
>>>>      table entry.
>>>>
>>>> I do wonder if there is somewhere else that this should be documented?
>>>
>>> If I change:
>>>
>>> "
>>> | pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
>>> "
>>>
>>> To:
>>>
>>> "
>>> | pmd_mkinvalid             | Invalidates a present PMD; do not call for       |
>>> |                             non-present pmd [1]                              |
>>> "
>>>
>>> Is that sufficient? (I'll do the same for pud_mkinvalid() too.)
>>
>> Sounds good to me.
>>
>> Also, if you move pmdp_invalidate(), please move the big comment with it to
>> avoid confusion. Thanks.
> 
> And the Fixes tag does not need to go back that far, since this only affects arm64,
> which enabled THP migration in commit 53fa117bb33c ("arm64/mm: Enable THP migration").

Yes, will do - good point.

> 
> --
> Best Regards,
> Yan, Zi
Zi Yan April 29, 2024, 4:02 p.m. UTC | #27
On 29 Apr 2024, at 11:34, Ryan Roberts wrote:

> On 29/04/2024 15:45, Zi Yan wrote:
>> On 29 Apr 2024, at 5:29, Ryan Roberts wrote:
>>
>>> On 27/04/2024 20:11, John Hubbard wrote:
>>>> On 4/27/24 8:14 AM, Zi Yan wrote:
>>>>> On 27 Apr 2024, at 0:41, John Hubbard wrote:
>>>>>> On 4/25/24 10:07 AM, Ryan Roberts wrote:
>>>>>>> __split_huge_pmd_locked() can be called for a present THP, devmap or
>>>>>>> (non-present) migration entry. It calls pmdp_invalidate()
>>>>>>> unconditionally on the pmdp and only determines if it is present or not
>>>>>>> based on the returned old pmd. This is a problem for the migration entry
>>>>>>> case because pmd_mkinvalid(), called by pmdp_invalidate() must only be
>>>>>>> called for a present pmd.
>>>>>>>
>>>>>>> On arm64 at least, pmd_mkinvalid() will mark the pmd such that any
>>>>>>> future call to pmd_present() will return true. And therefore any
>>>>>>> lockless pgtable walker could see the migration entry pmd in this state
>>>>>>> and start interpreting the fields as if it were present, leading to
>>>>>>> BadThings (TM). GUP-fast appears to be one such lockless pgtable walker.
>>>>>>> I suspect the same is possible on other architectures.
>>>>>>>
>>>>>>> Fix this by only calling pmdp_invalidate() for a present pmd. And for
>>>>>>
>>>>>> Yes, this seems like a good design decision (after reading through the
>>>>>> discussion that you all had in the other threads).
>>>>>
>>>>> This will only be good for arm64 and does not prevent other arch developers
>>>>> from writing code that breaks arm64, since only arm64's pmd_mkinvalid() can turn
>>>>> a swap entry into a pmd_present() entry.
>>>>
>>>> Well, let's characterize it in a bit more detail, then:
>>>
>>> Hi All,
>>>
>>> Thanks for all the feedback! I had thought that this patch would be entirely
>>> uncontroversial - obviously I was wrong :)
>>>
>>> I've read all the emails, and am trying to summarize a way forward here...
>>>
>>>>
>>>> 1) This patch will make things better for arm64. That's important!
>>>>
>>>> 2) Equally important, this patch does not make anything worse for
>>>>    other CPU arches.
>>>>
>>>> 3) This patch represents a new design constraint on the CPU arch
>>>>    layer, and thus requires documentation and whatever enforcement
>>>>    we can provide, in order to keep future code out of trouble.
>>>
>>> I know it's only semantics, but I don't view this as a new design constraint. I
>>> see it as an existing constraint that was previously being violated, and this
>>> patch aims to fix that. The generic version of pmdp_invalidate() unconditionally
>>> does a TLB invalidation on the address range covered by the pmd. That makes no
>>> sense unless the pmd was previously present. So my conclusion is that the
>>> function only expects to be called for present pmds.
>>>
>>> Additionally Documentation/mm/arch_pgtable_helpers.rst already says this:
>>>
>>> "
>>> | pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
>>> "
>>>
>>> I read "mapped" to be a synonym for "present". So I think it's already
>>> documented. Happy to explicitly change "mapped" to "present" though, if it helps?
>>>
>>> Finally, [1] which is linked from Documentation/mm/arch_pgtable_helpers.rst,
>>> also implies this constraint, although it doesn't explicitly say it.
>>>
>>> [1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
>>>
>>>>
>>>> 3.a) See the VM_WARN_ON() hunks below.
>>>
>>> It sounds like everybody would be happy if I sprinkle these into the arches that
>>> override pmdp_invalidate[_ad]()? There are 3 arches that have their own version
>>> of pmdp_invalidate(); powerpc, s390 and sparc. And 1 that has its own version of
>>> pmdp_invalidate_ad(); x86. I'll add them in all of those.
>>>
>>> I'll use VM_WARN_ON_ONCE() as suggested by John.
>>>
>>> I'd rather not put it directly into pmd_mkinvalid() since that would set a
>>> precedent for adding them absolutely everywhere. (e.g. pte_mkdirty(), ...).
>>
>> I understand your concern here. I assume you also understand the potential issue
>> with this, namely it does not prevent one from using pmd_mkinvalid() improperly
>> and causing a bug that might only appear on arm64.
>
> Are you saying that arm64 is the *only* arch where pmd_mkinvalid() can turn a
> swap pte into a present pte? I hadn't appreciated that; in your first reply to

Yes.

> this patch you said "I notice that x86, risc-v, mips behave the same" - I
> thought you were saying they behaved the same as arm64, but on re-reading, I
> think I've taken that out of context.
>
> But in spite of that, it still remains my view that making arm64's
> pmd_mkinvalid() robust to non-present ptes is not the right fix - or at least
> not sufficient on its own. That change on its own would still result in issuing
> a TLBI for the non-present pte from pmdp_invalidate(). That's not a correctness
> issue, but certainly could be a performance concern.

I agree with you that using pmd_mkinvalid() on non-present entries does not make
sense, but there is no easy way to enforce that and prevent anyone from doing it.
And if people do it while not working or testing on arm64, they can break arm64
without noticing. It becomes arm64's burden to watch out for this potential
breakage all the time.

Yes, the TLB invalidation for a non-present entry should be avoided in
pmdp_invalidate() to recover the performance loss. It is a separate issue from
the pmd_mkinvalid() correctness issue. Thank you for pointing this out explicitly.
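
If the generic helper were hardened along those lines, it could look like
the sketch below - though to be clear, the patch here instead avoids calling
pmdp_invalidate() for non-present entries altogether:

pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		     pmd_t *pmdp)
{
	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));

	/* Only a present entry can have a TLB entry to shoot down. */
	if (pmd_present(old))
		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return old;
}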

>
> I think it's much better to have the design constraint that pmd_mkinvalid(),
> pmdp_invalidate() and pmdp_invalidate_ad() can only be called for present ptes.
> And I think the combination of WARNs and docs that we've discussed should be
> enough to allay your concerns about introduction of arm64-specific bugs.

Yes. I also understand that putting a WARN in pmd_mkinvalid() might not be desirable.

>
>>
>>>
>>>>
>>>> 3.b) I like the new design constraint, because it is reasonable and
>>>>      clearly understandable: don't invalidate a non-present page
>>>>      table entry.
>>>>
>>>> I do wonder if there is somewhere else that this should be documented?
>>>
>>> If I change:
>>>
>>> "
>>> | pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
>>> "
>>>
>>> To:
>>>
>>> "
>>> | pmd_mkinvalid             | Invalidates a present PMD; do not call for       |
>>> |                             non-present pmd [1]                              |
>>> "
>>>
>>> Is that sufficient? (I'll do the same for pud_mkinvalid() too.)
>>
>> Sounds good to me.
>>
>> Also, if you move pmdp_invalidate(), please move the big comment with it to
>> avoid confusion. Thanks.
>
> Yes, good spot - I'll move it.
>
>>
>> --
>> Best Regards,
>> Yan, Zi


--
Best Regards,
Yan, Zi
diff mbox series

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 89f58c7603b2..80939ad00718 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2513,12 +2513,12 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	 * for this pmd), then we flush the SMP TLB and finally we write the
 	 * non-huge version of the pmd entry with pmd_populate.
 	 */
-	old_pmd = pmdp_invalidate(vma, haddr, pmd);

-	pmd_migration = is_pmd_migration_entry(old_pmd);
+	pmd_migration = is_pmd_migration_entry(*pmd);
 	if (unlikely(pmd_migration)) {
 		swp_entry_t entry;

+		old_pmd = *pmd;
 		entry = pmd_to_swp_entry(old_pmd);
 		page = pfn_swap_entry_to_page(entry);
 		write = is_writable_migration_entry(entry);
@@ -2529,6 +2529,7 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
 		uffd_wp = pmd_swp_uffd_wp(old_pmd);
 	} else {
+		old_pmd = pmdp_invalidate(vma, haddr, pmd);
 		page = pmd_page(old_pmd);
 		folio = page_folio(page);
 		if (pmd_dirty(old_pmd)) {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4fcd959dcc4d..74e34ea90656 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -198,6 +198,7 @@  pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
+	VM_WARN_ON(!pmd_present(*pmdp));
 	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
 	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 	return old;
@@ -208,6 +209,7 @@  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
 			 pmd_t *pmdp)
 {
+	VM_WARN_ON(!pmd_present(*pmdp));
 	return pmdp_invalidate(vma, address, pmdp);
 }
 #endif