
[1/4] fs/proc/task_mmu: use folio API in pte_is_pinned()

Message ID 20240604114822.2089819-2-wangkefeng.wang@huawei.com (mailing list archive)
State New
Headers show
Series mm: remove page_maybe_dma_pinned() and page_mkclean()

Commit Message

Kefeng Wang June 4, 2024, 11:48 a.m. UTC
Convert to use vm_normal_folio() and folio_maybe_dma_pinned() API,
which helps to remove page_maybe_dma_pinned() in the subsequent change.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 fs/proc/task_mmu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
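
For reference, page_maybe_dma_pinned() is only a thin wrapper around the
folio helper, which is why converting the remaining callers lets the wrapper
be dropped. A rough sketch of how the two helpers relate (approximate, from
memory of include/linux/mm.h around this series, not taken from this patch):

static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
	if (folio_test_large(folio))
		return atomic_read(&folio->_pincount) > 0;

	/*
	 * For small folios, GUP pins are accounted in the refcount in
	 * units of GUP_PIN_COUNTING_BIAS, so a large refcount "maybe"
	 * means pinned (false positives are possible by design).
	 */
	return (unsigned int)folio_ref_count(folio) >= GUP_PIN_COUNTING_BIAS;
}

static inline bool page_maybe_dma_pinned(struct page *page)
{
	return folio_maybe_dma_pinned(page_folio(page));
}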

Comments

David Hildenbrand June 4, 2024, 11:51 a.m. UTC | #1
On 04.06.24 13:48, Kefeng Wang wrote:
> Convert to use vm_normal_folio() and folio_maybe_dma_pinned() API,
> which helps to remove page_maybe_dma_pinned() in the subsequent change.
> 
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> ---
>   fs/proc/task_mmu.c | 8 ++++----
>   1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index f8d35f993fe5..5aceb3db7565 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -1088,7 +1088,7 @@ struct clear_refs_private {
>   
>   static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
>   {
> -	struct page *page;
> +	struct folio *folio;
>   
>   	if (!pte_write(pte))
>   		return false;
> @@ -1096,10 +1096,10 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
>   		return false;
>   	if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
>   		return false;
> -	page = vm_normal_page(vma, addr, pte);
> -	if (!page)
> +	folio = vm_normal_folio(vma, addr, pte);
> +	if (!folio)
>   		return false;
> -	return page_maybe_dma_pinned(page);
> +	return folio_maybe_dma_pinned(folio);
>   }
>   
>   static inline void clear_soft_dirty(struct vm_area_struct *vma,

Likely we should just get rid of the pte_is_pinned() check completely 
now. We don't perform the same for PMDs, we don't sync against GUP-fast, 
and the original COW vs. GUP issue was resolved.
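
For context, the check in question sits in the soft-dirty clearing path. A
rough sketch of the caller (reconstructed for illustration from the
surrounding context in fs/proc/task_mmu.c; not part of this patch):

static inline void clear_soft_dirty(struct vm_area_struct *vma,
		unsigned long addr, pte_t *pte)
{
	/*
	 * Clearing soft-dirty goes hand in hand with write-protecting the
	 * PTE, so that the next CPU write faults and re-marks it dirty.
	 */
	pte_t ptent = ptep_get(pte);

	if (pte_present(ptent)) {
		pte_t old_pte;

		/* The check under discussion: leave maybe-pinned pages alone. */
		if (pte_is_pinned(vma, addr, ptent))
			return;
		old_pte = ptep_modify_prot_start(vma, addr, pte);
		ptent = pte_wrprotect(old_pte);
		ptent = pte_clear_soft_dirty(ptent);
		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
	} else if (is_swap_pte(ptent)) {
		ptent = pte_swp_clear_soft_dirty(ptent);
		set_pte_at(vma->vm_mm, addr, pte, ptent);
	}
}

Removing the check would mean dropping pte_is_pinned() entirely and always
write-protecting present PTEs here, as the other write-protection based
mechanisms (mprotect, uffd-wp) already do.
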
Kefeng Wang June 4, 2024, 2:26 p.m. UTC | #2
On 2024/6/4 19:51, David Hildenbrand wrote:
> On 04.06.24 13:48, Kefeng Wang wrote:
>> Convert to use vm_normal_folio() and folio_maybe_dma_pinned() API,
>> which helps to remove page_maybe_dma_pinned() in the subsequent change.
>>
>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>> ---
>>   fs/proc/task_mmu.c | 8 ++++----
>>   1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>> index f8d35f993fe5..5aceb3db7565 100644
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -1088,7 +1088,7 @@ struct clear_refs_private {
>>   static inline bool pte_is_pinned(struct vm_area_struct *vma, 
>> unsigned long addr, pte_t pte)
>>   {
>> -    struct page *page;
>> +    struct folio *folio;
>>       if (!pte_write(pte))
>>           return false;
>> @@ -1096,10 +1096,10 @@ static inline bool pte_is_pinned(struct 
>> vm_area_struct *vma, unsigned long addr,
>>           return false;
>>       if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
>>           return false;
>> -    page = vm_normal_page(vma, addr, pte);
>> -    if (!page)
>> +    folio = vm_normal_folio(vma, addr, pte);
>> +    if (!folio)
>>           return false;
>> -    return page_maybe_dma_pinned(page);
>> +    return folio_maybe_dma_pinned(folio);
>>   }
>>   static inline void clear_soft_dirty(struct vm_area_struct *vma,
> 
> Likely we should just get rid of the pte_is_pinned() check completely 
> now. We don't perform the same for PMDs, we don't sync against GUP-fast, 

Yes, there is no equivalent handling when clearing PMDs.

> and the original COW vs. GUP issue was resolved.

Agreed, but I'm not sure about removing it; from commit 9348b73c2e1bf:

   "The whole "track page soft dirty" state doesn't work with pinned pages
   anyway, since the page might be dirtied by the pinning entity without
   ever being noticed in the page tables."

The issue is between the pin mechanism and "track page soft dirty": if
the page is pinned, the pinning entity (DMA?) could change the page but
the pte dirty bit won't be set, so maybe we still need the check, and
maybe even need something similar for PMDs? Correct me if I'm wrong, thanks.
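
To make the concern concrete: userspace drives soft-dirty tracking by
writing "4" to /proc/pid/clear_refs (which clears the soft-dirty bits and
write-protects the PTEs) and later reading bit 55 of the corresponding
/proc/pid/pagemap entries. A write performed through a long-term pin (e.g.
a device doing DMA) bypasses the write-protected PTE, so the bit never gets
set. A minimal userspace sketch of that interface (illustrative only,
assumes 4 KiB pages, error handling trimmed):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define PAGE_SIZE	4096UL
#define PM_SOFT_DIRTY	(1ULL << 55)	/* bit 55 of a pagemap entry */

static int soft_dirty(int pagemap_fd, unsigned long vaddr)
{
	uint64_t entry;

	if (pread(pagemap_fd, &entry, sizeof(entry),
		  (vaddr / PAGE_SIZE) * sizeof(entry)) != sizeof(entry))
		return -1;
	return !!(entry & PM_SOFT_DIRTY);
}

int main(void)
{
	static char buf[PAGE_SIZE];
	int clear_fd = open("/proc/self/clear_refs", O_WRONLY);
	int pagemap_fd = open("/proc/self/pagemap", O_RDONLY);

	buf[0] = 0;			/* populate the page */
	write(clear_fd, "4", 1);	/* clear soft-dirty, write-protect PTEs */
	printf("after clear: %d\n", soft_dirty(pagemap_fd, (unsigned long)buf));

	buf[0] = 1;			/* CPU write faults, PTE re-marked soft-dirty */
	printf("after write: %d\n", soft_dirty(pagemap_fd, (unsigned long)buf));

	/* A device writing via a DMA pin would leave the bit clear instead. */
	return 0;
}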
Kefeng Wang June 5, 2024, 1:30 a.m. UTC | #3
On 2024/6/4 23:47, David Hildenbrand wrote:
> On 04.06.24 16:26, Kefeng Wang wrote:
>>
>>
>> On 2024/6/4 19:51, David Hildenbrand wrote:
>>> On 04.06.24 13:48, Kefeng Wang wrote:
>>>> Convert to use vm_normal_folio() and folio_maybe_dma_pinned() API,
>>>> which helps to remove page_maybe_dma_pinned() in the subsequent change.
>>>>
>>>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>>>> ---
>>>>    fs/proc/task_mmu.c | 8 ++++----
>>>>    1 file changed, 4 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>> index f8d35f993fe5..5aceb3db7565 100644
>>>> --- a/fs/proc/task_mmu.c
>>>> +++ b/fs/proc/task_mmu.c
>>>> @@ -1088,7 +1088,7 @@ struct clear_refs_private {
>>>>    static inline bool pte_is_pinned(struct vm_area_struct *vma,
>>>> unsigned long addr, pte_t pte)
>>>>    {
>>>> -    struct page *page;
>>>> +    struct folio *folio;
>>>>        if (!pte_write(pte))
>>>>            return false;
>>>> @@ -1096,10 +1096,10 @@ static inline bool pte_is_pinned(struct
>>>> vm_area_struct *vma, unsigned long addr,
>>>>            return false;
>>>>        if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
>>>>            return false;
>>>> -    page = vm_normal_page(vma, addr, pte);
>>>> -    if (!page)
>>>> +    folio = vm_normal_folio(vma, addr, pte);
>>>> +    if (!folio)
>>>>            return false;
>>>> -    return page_maybe_dma_pinned(page);
>>>> +    return folio_maybe_dma_pinned(folio);
>>>>    }
>>>>    static inline void clear_soft_dirty(struct vm_area_struct *vma,
>>>
>>> Likely we should just get rid of the pte_is_pinned() check completely
>>> now. We don't perform the same for PMDs, we don't sync against GUP-fast,
>>
>> Yes, no handle for clear PMDs.
>>
>>> and the original COW vs. GUP issue was resolved.
>>
>> Agreed, but I'm not sure about removing it; from commit 9348b73c2e1bf:
>>
>>     "The whole "track page soft dirty" state doesn't work with pinned pages
>>     anyway, since the page might be dirtied by the pinning entity without
>>     ever being noticed in the page tables."
>>
>> The issue is between the pin mechanism and "track page soft dirty": if
>> the page is pinned, the pinning entity (DMA?) could change the page but
>> the pte dirty bit won't be set, so maybe we still need the check, and
>> maybe even need something similar for PMDs? Correct me if I'm wrong, thanks.
> 
> Yes, but it doesn't work with any mechanism that write-protects PTEs,
> including mprotect() and uffd-wp.
> 
> Then, we never synced against concurrent GUP-fast or concurrent O_DIRECT
> that might still use !FOLL_PIN, and we never handled PMDs ... so it's all
> neither consistent nor really helpful nowadays.
> 
> ... and if you have read-only pinned pages (which we cannot distinguish)
> 

OK, many cases need to be addressed.

> I have a proper patch lying around here for quite a while:
> 
> commit 9ef578b7aba8bba626b904fe90e5be0690842fd3
> Author: David Hildenbrand <david@redhat.com>
> Date:   Wed Feb 16 20:39:43 2022 +0100
> 
>     fs/proc/task_mmu: allow setting pinned pages R/O for softdirty tracking
> 
>     Before we had PG_anon_exclusive, our new COW logic would end up
>     replacing a pinned page in the page tables, detecting it as possibly
>     shared. Consequently, we'd lost synchronicity between the page mapped
>     into the page table and the page pin.
> 
>     We tried preventing mapping pinned pages R/O, however, history told us
>     that that is impossible -- and we added PG_anon_exclusive to have it
>     all working reliably again.
> 
>     Now that we have PG_anon_exclusive in place, let's get rid of the
>     check for pinned pages and revert it to the old state.
> 
>     Yes, we won't be able to detect the following scenario correctly:
>     (1) R/W-pin a page.
>     (2) Clear softdirty.
>     (3) Modify the page via the R/W-pin.
> 
>     However, that isn't working reliably right now either way, because
>     * The current check is racy, because we can race with GUP-fast taking
>       a R/W pin while we're clearing the softdirty marker and mapping the
>       page R/O.
>     * The current check cannot handle FOLL_GET R/W references, as used for
>       O_DIRECT. So if there is concurrent I/O the PTE will get marked as
>       !softdirty, yet, direct I/O will modify page content afterwards.
>     * We check for pins only when handling PTEs, but not for PMDs etc.
> 
>     Also, this handling is in no way different to other mechanisms
>     (mprotect, uffd-wp) that map pages R/O to catch successive write
>     access via the page table -- because accesses via the page pin no
>     longer go logically via the page table, the page table is bypassed.
> 
>     With this change, the interface now works again as expected when we
>     have R/O pins, and behaves just like any other mechanism that uses
>     write protection to catch successive writes (mprotect, uffd-wp) -- and
>     they all face the same issue regarding R/W access via GUP (FOLL_PIN
>     and FOLL_GET).
> 
>     User space better be aware that using write protection to catch writes
>     to a page can miss writes via GUP. Softdirty tracking cannot reliably
>     catch modifications via GUP after clearing softdirty and returning to
>     user space.
> 
> But I understand if you want to be careful :) So I might send that patch
> out at some point myself ...
> 

Thanks for your detailed explanation, let's wait it out.
Andrew Morton June 25, 2024, 10:38 p.m. UTC | #4
On Wed, 5 Jun 2024 09:30:59 +0800 Kefeng Wang <wangkefeng.wang@huawei.com> wrote:

> >     protection to catch successive writes (mprotect, uffd-wp) -- and
> >     they all face the same issue regarding R/W access via GUP (FOLL_PIN
> >     and FOLL_GET).
> > 
> >     User space better be aware that using write protection to catch
> >     writes to a page can miss writes via GUP. Softdirty tracking cannot
> >     reliably catch modifications via GUP after clearing softdirty and
> >     returning to user space.
> > 
> > But I understand if you want to be careful :) So I might send that patch
> > out at some point myself ...
> > 
> 
> Thanks for your detailed explanation, let's wait it out.

Did we wait long enough?  Should we move ahead with this change
or should we drop it and have another run at it?
David Hildenbrand June 26, 2024, 4:57 a.m. UTC | #5
On 26.06.24 00:38, Andrew Morton wrote:
> On Wed, 5 Jun 2024 09:30:59 +0800 Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
> 
>>>      protection to catch successive writes (mprotect, uffd-wp) -- and
>>>      they all face the same issue regarding R/W access via GUP (FOLL_PIN
>>>      and FOLL_GET).
>>>
>>>      User space better be aware that using write protection to catch
>>>      writes to a page can miss writes via GUP. Softdirty tracking cannot
>>>      reliably catch modifications via GUP after clearing softdirty and
>>>      returning to user space.
>>>
>>> But I understand if you want to be careful :) So I might send that patch
>>> out at some point myself ...
>>>
>>
>> Thanks for your detailed explanation, let's wait it out.
> 
> Did we wait long enough?  Should we move ahead with this change
> or should we drop it and have another run at it?

The change is fine, we will revisit removing this code (instead of 
cleaning it up) separately.

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f8d35f993fe5..5aceb3db7565 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1088,7 +1088,7 @@  struct clear_refs_private {
 
 static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
-	struct page *page;
+	struct folio *folio;
 
 	if (!pte_write(pte))
 		return false;
@@ -1096,10 +1096,10 @@  static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
 		return false;
 	if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
 		return false;
-	page = vm_normal_page(vma, addr, pte);
-	if (!page)
+	folio = vm_normal_folio(vma, addr, pte);
+	if (!folio)
 		return false;
-	return page_maybe_dma_pinned(page);
+	return folio_maybe_dma_pinned(folio);
 }
 
 static inline void clear_soft_dirty(struct vm_area_struct *vma,