diff mbox series

[v3,4/9] mm: introduce skip_none_ptes()

Message ID 574bc9b646c87d878a5048edb63698a1f8483e10.1731566457.git.zhengqi.arch@bytedance.com (mailing list archive)
State New
Headers show
Series synchronously scan and reclaim empty user PTE pages | expand

Commit Message

Qi Zheng Nov. 14, 2024, 6:59 a.m. UTC
This commit introduces skip_none_ptes() to skip over all consecutive none
ptes in zap_pte_range(), which helps optimize away need_resched() +
force_break + incremental pte/addr increments etc.

Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 mm/memory.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

Comments

David Hildenbrand Nov. 14, 2024, 8:04 a.m. UTC | #1
>   static unsigned long zap_pte_range(struct mmu_gather *tlb,
>   				struct vm_area_struct *vma, pmd_t *pmd,
>   				unsigned long addr, unsigned long end,
> @@ -1682,13 +1704,17 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>   		pte_t ptent = ptep_get(pte);
>   		int max_nr;
>   
> -		nr = 1;
> -		if (pte_none(ptent))
> -			continue;
> -
>   		if (need_resched())
>   			break;
>   
> +		nr = skip_none_ptes(pte, addr, end);
> +		if (nr) {
> +			addr += PAGE_SIZE * nr;
> +			if (addr == end)
> +				break;
> +			pte += nr;
> +		}
> +
>   		max_nr = (end - addr) / PAGE_SIZE;

I dislike calculating max_nr twice, once here and once in skip_non_ptes.

Further, you're missing to update ptent here. If you inline it you can 
avoid another ptep_get().

>   		if (pte_present(ptent)) {
>   			nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
Qi Zheng Nov. 14, 2024, 9:20 a.m. UTC | #2
On 2024/11/14 16:04, David Hildenbrand wrote:
> 
>>   static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>                   struct vm_area_struct *vma, pmd_t *pmd,
>>                   unsigned long addr, unsigned long end,
>> @@ -1682,13 +1704,17 @@ static unsigned long zap_pte_range(struct 
>> mmu_gather *tlb,
>>           pte_t ptent = ptep_get(pte);
>>           int max_nr;
>> -        nr = 1;
>> -        if (pte_none(ptent))
>> -            continue;
>> -
>>           if (need_resched())
>>               break;
>> +        nr = skip_none_ptes(pte, addr, end);
>> +        if (nr) {
>> +            addr += PAGE_SIZE * nr;
>> +            if (addr == end)
>> +                break;
>> +            pte += nr;
>> +        }
>> +
>>           max_nr = (end - addr) / PAGE_SIZE;
> 
> I dislike calculating max_nr twice, once here and once in skip_non_ptes.
> 
> Further, you're missing to update ptent here. 

Oh, my bad. However, with [PATCH v3 5/9], there will be no problem, but
there are still two ptep_get() and max_nr calculation.

If you inline it you can
> avoid another ptep_get().

Do you mean to inline the skip_none_ptes() into do_zap_pte_range()? This
will cause these none ptes to be checked again in count_pte_none() when
PTE_MARKER_UFFD_WP is enabled. Of course, maybe we can count none ptes
in do_zap_pte_range() and pass it through function parameters like 
force_break.

> 
>>           if (pte_present(ptent)) {
>>               nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
> 
>
David Hildenbrand Nov. 14, 2024, 12:32 p.m. UTC | #3
On 14.11.24 10:20, Qi Zheng wrote:
> 
> 
> On 2024/11/14 16:04, David Hildenbrand wrote:
>>
>>>    static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>>                    struct vm_area_struct *vma, pmd_t *pmd,
>>>                    unsigned long addr, unsigned long end,
>>> @@ -1682,13 +1704,17 @@ static unsigned long zap_pte_range(struct
>>> mmu_gather *tlb,
>>>            pte_t ptent = ptep_get(pte);
>>>            int max_nr;
>>> -        nr = 1;
>>> -        if (pte_none(ptent))
>>> -            continue;
>>> -
>>>            if (need_resched())
>>>                break;
>>> +        nr = skip_none_ptes(pte, addr, end);
>>> +        if (nr) {
>>> +            addr += PAGE_SIZE * nr;
>>> +            if (addr == end)
>>> +                break;
>>> +            pte += nr;
>>> +        }
>>> +
>>>            max_nr = (end - addr) / PAGE_SIZE;
>>
>> I dislike calculating max_nr twice, once here and once in skip_non_ptes.
>>
>> Further, you're missing to update ptent here.
> 
> Oh, my bad. However, with [PATCH v3 5/9], there will be no problem, but
> there are still two ptep_get() and max_nr calculation.
> 
> If you inline it you can
>> avoid another ptep_get().
> 
> Do you mean to inline the skip_none_ptes() into do_zap_pte_range()?

Effectively moving this patch after #5, and have it be something like:

diff --git a/mm/memory.c b/mm/memory.c
index 1949f5e0fece5..4f5d1e4c6688e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1667,8 +1667,21 @@ static inline int do_zap_pte_range(struct mmu_gather *tlb,
         pte_t ptent = ptep_get(pte);
         int max_nr = (end - addr) / PAGE_SIZE;
  
-       if (pte_none(ptent))
-               return 1;
+       /* Skip all consecutive pte_none(). */
+       if (pte_none(ptent)) {
+               int nr;
+
+               for (nr = 1; nr < max_nr; nr++) {
+                       ptent = ptep_get(pte + nr);
+                       if (!pte_none(ptent))
+                               break;
+               }
+               max_nr -= nr;
+               if (!max_nr)
+                       return nr;
+               pte += nr;
+               addr += nr * PAGE_SIZE;
+       }
  
         if (pte_present(ptent))
                 return zap_present_ptes(tlb, vma, pte, ptent, max_nr,


In the context of this patch this makes most sense.

Regarding "count_pte_none" comment, I assume you talk about patch #7.

Can't you simply return the number of pte_none that you skipped here using another
input variable, if really required?
Qi Zheng Nov. 14, 2024, 12:51 p.m. UTC | #4
On 2024/11/14 20:32, David Hildenbrand wrote:
> On 14.11.24 10:20, Qi Zheng wrote:
>>
>>
>> On 2024/11/14 16:04, David Hildenbrand wrote:
>>>
>>>>    static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>>>                    struct vm_area_struct *vma, pmd_t *pmd,
>>>>                    unsigned long addr, unsigned long end,
>>>> @@ -1682,13 +1704,17 @@ static unsigned long zap_pte_range(struct
>>>> mmu_gather *tlb,
>>>>            pte_t ptent = ptep_get(pte);
>>>>            int max_nr;
>>>> -        nr = 1;
>>>> -        if (pte_none(ptent))
>>>> -            continue;
>>>> -
>>>>            if (need_resched())
>>>>                break;
>>>> +        nr = skip_none_ptes(pte, addr, end);
>>>> +        if (nr) {
>>>> +            addr += PAGE_SIZE * nr;
>>>> +            if (addr == end)
>>>> +                break;
>>>> +            pte += nr;
>>>> +        }
>>>> +
>>>>            max_nr = (end - addr) / PAGE_SIZE;
>>>
>>> I dislike calculating max_nr twice, once here and once in skip_non_ptes.
>>>
>>> Further, you're missing to update ptent here.
>>
>> Oh, my bad. However, with [PATCH v3 5/9], there will be no problem, but
>> there are still two ptep_get() and max_nr calculation.
>>
>> If you inline it you can
>>> avoid another ptep_get().
>>
>> Do you mean to inline the skip_none_ptes() into do_zap_pte_range()?
> 
> Effectively moving this patch after #5, and have it be something like:
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 1949f5e0fece5..4f5d1e4c6688e 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1667,8 +1667,21 @@ static inline int do_zap_pte_range(struct 
> mmu_gather *tlb,
>          pte_t ptent = ptep_get(pte);
>          int max_nr = (end - addr) / PAGE_SIZE;
> 
> -       if (pte_none(ptent))
> -               return 1;
> +       /* Skip all consecutive pte_none(). */
> +       if (pte_none(ptent)) {
> +               int nr;
> +
> +               for (nr = 1; nr < max_nr; nr++) {
> +                       ptent = ptep_get(pte + nr);
> +                       if (!pte_none(ptent))
> +                               break;
> +               }
> +               max_nr -= nr;
> +               if (!max_nr)
> +                       return nr;
> +               pte += nr;
> +               addr += nr * PAGE_SIZE;
> +       }
> 
>          if (pte_present(ptent))
>                  return zap_present_ptes(tlb, vma, pte, ptent, max_nr,
> 
> 
> In the context of this patch this makes most sense.
> 
> Regarding "count_pte_none" comment, I assume you talk about patch #7.

Yes.

> 
> Can't you simply return the number of pte_none that you skipped here 
> using another
> input variable, if really required?

Suppose we add an input variable nr_skip to do_zap_pte_range(), you mean
to return the above nr to zap_pte_range() through:

*nr_skip = nr;

and then:

zap_pte_range
--> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, &skip_nr,
  				      rss, &force_flush, &force_break);
     if (can_reclaim_pt) {
         none_nr += count_pte_none(pte, nr);
         none_nr += nr_skip;
     }

Right?

>
David Hildenbrand Nov. 14, 2024, 9:19 p.m. UTC | #5
On 14.11.24 13:51, Qi Zheng wrote:
> 
> 
> On 2024/11/14 20:32, David Hildenbrand wrote:
>> On 14.11.24 10:20, Qi Zheng wrote:
>>>
>>>
>>> On 2024/11/14 16:04, David Hildenbrand wrote:
>>>>
>>>>>     static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>>>>                     struct vm_area_struct *vma, pmd_t *pmd,
>>>>>                     unsigned long addr, unsigned long end,
>>>>> @@ -1682,13 +1704,17 @@ static unsigned long zap_pte_range(struct
>>>>> mmu_gather *tlb,
>>>>>             pte_t ptent = ptep_get(pte);
>>>>>             int max_nr;
>>>>> -        nr = 1;
>>>>> -        if (pte_none(ptent))
>>>>> -            continue;
>>>>> -
>>>>>             if (need_resched())
>>>>>                 break;
>>>>> +        nr = skip_none_ptes(pte, addr, end);
>>>>> +        if (nr) {
>>>>> +            addr += PAGE_SIZE * nr;
>>>>> +            if (addr == end)
>>>>> +                break;
>>>>> +            pte += nr;
>>>>> +        }
>>>>> +
>>>>>             max_nr = (end - addr) / PAGE_SIZE;
>>>>
>>>> I dislike calculating max_nr twice, once here and once in skip_non_ptes.
>>>>
>>>> Further, you're missing to update ptent here.
>>>
>>> Oh, my bad. However, with [PATCH v3 5/9], there will be no problem, but
>>> there are still two ptep_get() and max_nr calculation.
>>>
>>> If you inline it you can
>>>> avoid another ptep_get().
>>>
>>> Do you mean to inline the skip_none_ptes() into do_zap_pte_range()?
>>
>> Effectively moving this patch after #5, and have it be something like:
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 1949f5e0fece5..4f5d1e4c6688e 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -1667,8 +1667,21 @@ static inline int do_zap_pte_range(struct
>> mmu_gather *tlb,
>>           pte_t ptent = ptep_get(pte);
>>           int max_nr = (end - addr) / PAGE_SIZE;
>>
>> -       if (pte_none(ptent))
>> -               return 1;
>> +       /* Skip all consecutive pte_none(). */
>> +       if (pte_none(ptent)) {
>> +               int nr;
>> +
>> +               for (nr = 1; nr < max_nr; nr++) {
>> +                       ptent = ptep_get(pte + nr);
>> +                       if (!pte_none(ptent))
>> +                               break;
>> +               }
>> +               max_nr -= nr;
>> +               if (!max_nr)
>> +                       return nr;
>> +               pte += nr;
>> +               addr += nr * PAGE_SIZE;
>> +       }
>>
>>           if (pte_present(ptent))
>>                   return zap_present_ptes(tlb, vma, pte, ptent, max_nr,
>>
>>
>> In the context of this patch this makes most sense.
>>
>> Regarding "count_pte_none" comment, I assume you talk about patch #7.
> 
> Yes.
> 
>>
>> Can't you simply return the number of pte_none that you skipped here
>> using another
>> input variable, if really required?
> 
> Suppose we add an input variable nr_skip to do_zap_pte_range(), you mean
> to return the above nr to zap_pte_range() through:

Maybe "cur_none_nr" or something similar.

> 
> *nr_skip = nr;
> 
> and then:
> 
> zap_pte_range
> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, &skip_nr,
>    				      rss, &force_flush, &force_break);
>       if (can_reclaim_pt) {
>           none_nr += count_pte_none(pte, nr);
>           none_nr += nr_skip;
>       }
> 
> Right?

Yes. I did not look closely at the patch that adds the counting of 
pte_none though (to digest why it is required :) ).
Qi Zheng Nov. 15, 2024, 3:03 a.m. UTC | #6
On 2024/11/15 05:19, David Hildenbrand wrote:
> On 14.11.24 13:51, Qi Zheng wrote:
>>
>>
>> On 2024/11/14 20:32, David Hildenbrand wrote:
>>> On 14.11.24 10:20, Qi Zheng wrote:
>>>>
>>>>
>>>> On 2024/11/14 16:04, David Hildenbrand wrote:
>>>>>
>>>>>>     static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>>>>>                     struct vm_area_struct *vma, pmd_t *pmd,
>>>>>>                     unsigned long addr, unsigned long end,
>>>>>> @@ -1682,13 +1704,17 @@ static unsigned long zap_pte_range(struct
>>>>>> mmu_gather *tlb,
>>>>>>             pte_t ptent = ptep_get(pte);
>>>>>>             int max_nr;
>>>>>> -        nr = 1;
>>>>>> -        if (pte_none(ptent))
>>>>>> -            continue;
>>>>>> -
>>>>>>             if (need_resched())
>>>>>>                 break;
>>>>>> +        nr = skip_none_ptes(pte, addr, end);
>>>>>> +        if (nr) {
>>>>>> +            addr += PAGE_SIZE * nr;
>>>>>> +            if (addr == end)
>>>>>> +                break;
>>>>>> +            pte += nr;
>>>>>> +        }
>>>>>> +
>>>>>>             max_nr = (end - addr) / PAGE_SIZE;
>>>>>
>>>>> I dislike calculating max_nr twice, once here and once in 
>>>>> skip_non_ptes.
>>>>>
>>>>> Further, you're missing to update ptent here.
>>>>
>>>> Oh, my bad. However, with [PATCH v3 5/9], there will be no problem, but
>>>> there are still two ptep_get() and max_nr calculation.
>>>>
>>>> If you inline it you can
>>>>> avoid another ptep_get().
>>>>
>>>> Do you mean to inline the skip_none_ptes() into do_zap_pte_range()?
>>>
>>> Effectively moving this patch after #5, and have it be something like:
>>>
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 1949f5e0fece5..4f5d1e4c6688e 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -1667,8 +1667,21 @@ static inline int do_zap_pte_range(struct
>>> mmu_gather *tlb,
>>>           pte_t ptent = ptep_get(pte);
>>>           int max_nr = (end - addr) / PAGE_SIZE;
>>>
>>> -       if (pte_none(ptent))
>>> -               return 1;
>>> +       /* Skip all consecutive pte_none(). */
>>> +       if (pte_none(ptent)) {
>>> +               int nr;
>>> +
>>> +               for (nr = 1; nr < max_nr; nr++) {
>>> +                       ptent = ptep_get(pte + nr);
>>> +                       if (!pte_none(ptent))
>>> +                               break;
>>> +               }
>>> +               max_nr -= nr;
>>> +               if (!max_nr)
>>> +                       return nr;
>>> +               pte += nr;
>>> +               addr += nr * PAGE_SIZE;
>>> +       }
>>>
>>>           if (pte_present(ptent))
>>>                   return zap_present_ptes(tlb, vma, pte, ptent, max_nr,
>>>
>>>
>>> In the context of this patch this makes most sense.
>>>
>>> Regarding "count_pte_none" comment, I assume you talk about patch #7.
>>
>> Yes.
>>
>>>
>>> Can't you simply return the number of pte_none that you skipped here
>>> using another
>>> input variable, if really required?
>>
>> Suppose we add an input variable nr_skip to do_zap_pte_range(), you mean
>> to return the above nr to zap_pte_range() through:
> 
> Maybe "cur_none_nr" or something similar.

OK.

> 
>>
>> *nr_skip = nr;
>>
>> and then:
>>
>> zap_pte_range
>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, &skip_nr,
>>                          rss, &force_flush, &force_break);
>>       if (can_reclaim_pt) {
>>           none_nr += count_pte_none(pte, nr);
>>           none_nr += nr_skip;
>>       }
>>
>> Right?
> 
> Yes. I did not look closely at the patch that adds the counting of 

Got it.

> pte_none though (to digest why it is required :) ).

Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
empty PTE page.

Looking forward to your more review feedback on this series.

Thanks!

>
David Hildenbrand Nov. 15, 2024, 10:22 a.m. UTC | #7
>>> *nr_skip = nr;
>>>
>>> and then:
>>>
>>> zap_pte_range
>>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, &skip_nr,
>>>                           rss, &force_flush, &force_break);
>>>        if (can_reclaim_pt) {
>>>            none_nr += count_pte_none(pte, nr);
>>>            none_nr += nr_skip;
>>>        }
>>>
>>> Right?
>>
>> Yes. I did not look closely at the patch that adds the counting of
> 
> Got it.
> 
>> pte_none though (to digest why it is required :) ).
> 
> Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
> empty PTE page.

Okay, so the problem is that "nr" would be "all processed entries" but 
there are cases where we "process an entry but not zap it".

What you really only want to know is "was any entry not zapped", which 
could be a simple input boolean variable passed into do_zap_pte_range?

Because as soon as any entry was processed but  no zapped, you can 
immediately give up on reclaiming that table.

> 
> Looking forward to your more review feedback on this series.

Thanks for all your hard work on this, I'm only able to make slow 
progress because I keep getting distracted by all different kinds of 
things :(
Qi Zheng Nov. 15, 2024, 2:41 p.m. UTC | #8
On 2024/11/15 18:22, David Hildenbrand wrote:
>>>> *nr_skip = nr;
>>>>
>>>> and then:
>>>>
>>>> zap_pte_range
>>>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, &skip_nr,
>>>>                           rss, &force_flush, &force_break);
>>>>        if (can_reclaim_pt) {
>>>>            none_nr += count_pte_none(pte, nr);
>>>>            none_nr += nr_skip;
>>>>        }
>>>>
>>>> Right?
>>>
>>> Yes. I did not look closely at the patch that adds the counting of
>>
>> Got it.
>>
>>> pte_none though (to digest why it is required :) ).
>>
>> Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
>> empty PTE page.
> 
> Okay, so the problem is that "nr" would be "all processed entries" but 
> there are cases where we "process an entry but not zap it".
> 
> What you really only want to know is "was any entry not zapped", which 
> could be a simple input boolean variable passed into do_zap_pte_range?
> 
> Because as soon as any entry was processed but  no zapped, you can 
> immediately give up on reclaiming that table.

Yes, we can set can_reclaim_pt to false when a !pte_none() entry is
found in count_pte_none().

> 
>>
>> Looking forward to your more review feedback on this series.
> 
> Thanks for all your hard work on this, I'm only able to make slow 
> progress because I keep getting distracted by all different kinds of 
> things :(

It's OK, just take your time. :)

Thanks!

>
David Hildenbrand Nov. 15, 2024, 2:59 p.m. UTC | #9
On 15.11.24 15:41, Qi Zheng wrote:
> 
> 
> On 2024/11/15 18:22, David Hildenbrand wrote:
>>>>> *nr_skip = nr;
>>>>>
>>>>> and then:
>>>>>
>>>>> zap_pte_range
>>>>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, &skip_nr,
>>>>>                            rss, &force_flush, &force_break);
>>>>>         if (can_reclaim_pt) {
>>>>>             none_nr += count_pte_none(pte, nr);
>>>>>             none_nr += nr_skip;
>>>>>         }
>>>>>
>>>>> Right?
>>>>
>>>> Yes. I did not look closely at the patch that adds the counting of
>>>
>>> Got it.
>>>
>>>> pte_none though (to digest why it is required :) ).
>>>
>>> Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
>>> empty PTE page.
>>
>> Okay, so the problem is that "nr" would be "all processed entries" but
>> there are cases where we "process an entry but not zap it".
>>
>> What you really only want to know is "was any entry not zapped", which
>> could be a simple input boolean variable passed into do_zap_pte_range?
>>
>> Because as soon as any entry was processed but  no zapped, you can
>> immediately give up on reclaiming that table.
> 
> Yes, we can set can_reclaim_pt to false when a !pte_none() entry is
> found in count_pte_none().

I'm not sure if well need cont_pte_none(), but I'll have to take a look 
at your new patch to see how this fits together with doing the pte_none 
detection+skipping in do_zap_pte_range().

I was wondering if you cannot simply avoid the additional scanning and 
simply set "can_reclaim_pt" if you skip a zap.
Qi Zheng Nov. 18, 2024, 3:35 a.m. UTC | #10
On 2024/11/15 22:59, David Hildenbrand wrote:
> On 15.11.24 15:41, Qi Zheng wrote:
>>
>>
>> On 2024/11/15 18:22, David Hildenbrand wrote:
>>>>>> *nr_skip = nr;
>>>>>>
>>>>>> and then:
>>>>>>
>>>>>> zap_pte_range
>>>>>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, 
>>>>>> &skip_nr,
>>>>>>                            rss, &force_flush, &force_break);
>>>>>>         if (can_reclaim_pt) {
>>>>>>             none_nr += count_pte_none(pte, nr);
>>>>>>             none_nr += nr_skip;
>>>>>>         }
>>>>>>
>>>>>> Right?
>>>>>
>>>>> Yes. I did not look closely at the patch that adds the counting of
>>>>
>>>> Got it.
>>>>
>>>>> pte_none though (to digest why it is required :) ).
>>>>
>>>> Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
>>>> empty PTE page.
>>>
>>> Okay, so the problem is that "nr" would be "all processed entries" but
>>> there are cases where we "process an entry but not zap it".
>>>
>>> What you really only want to know is "was any entry not zapped", which
>>> could be a simple input boolean variable passed into do_zap_pte_range?
>>>
>>> Because as soon as any entry was processed but  no zapped, you can
>>> immediately give up on reclaiming that table.
>>
>> Yes, we can set can_reclaim_pt to false when a !pte_none() entry is
>> found in count_pte_none().
> 
> I'm not sure if well need cont_pte_none(), but I'll have to take a look 
> at your new patch to see how this fits together with doing the pte_none 
> detection+skipping in do_zap_pte_range().
> 
> I was wondering if you cannot simply avoid the additional scanning and 
> simply set "can_reclaim_pt" if you skip a zap.

Maybe we can return the information whether the zap was skipped from
zap_present_ptes() and zap_nonpresent_ptes() through parameters like I
did in [PATCH v1 3/7] and [PATCH v1 4/7].

In theory, we can detect empty PTE pages in the following two ways:

1) If no zap is skipped, it means that all pte entries have been
    zap, and the PTE page must be empty.
2) If all pte entries are detected to be none, then the PTE page is
    empty.

In the error case, 1) may cause non-empty PTE pages to be reclaimed
(which is unacceptable), while the 2) will at most cause empty PTE pages
to not be reclaimed.

So the most reliable and efficient method may be:

a. If there is a zap that is skipped, stop scanning and do not reclaim
    the PTE page;
b. Otherwise, as now, detect the empty PTE page through count_pte_none()


>
David Hildenbrand Nov. 18, 2024, 9:29 a.m. UTC | #11
On 18.11.24 04:35, Qi Zheng wrote:
> 
> 
> On 2024/11/15 22:59, David Hildenbrand wrote:
>> On 15.11.24 15:41, Qi Zheng wrote:
>>>
>>>
>>> On 2024/11/15 18:22, David Hildenbrand wrote:
>>>>>>> *nr_skip = nr;
>>>>>>>
>>>>>>> and then:
>>>>>>>
>>>>>>> zap_pte_range
>>>>>>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details,
>>>>>>> &skip_nr,
>>>>>>>                             rss, &force_flush, &force_break);
>>>>>>>          if (can_reclaim_pt) {
>>>>>>>              none_nr += count_pte_none(pte, nr);
>>>>>>>              none_nr += nr_skip;
>>>>>>>          }
>>>>>>>
>>>>>>> Right?
>>>>>>
>>>>>> Yes. I did not look closely at the patch that adds the counting of
>>>>>
>>>>> Got it.
>>>>>
>>>>>> pte_none though (to digest why it is required :) ).
>>>>>
>>>>> Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
>>>>> empty PTE page.
>>>>
>>>> Okay, so the problem is that "nr" would be "all processed entries" but
>>>> there are cases where we "process an entry but not zap it".
>>>>
>>>> What you really only want to know is "was any entry not zapped", which
>>>> could be a simple input boolean variable passed into do_zap_pte_range?
>>>>
>>>> Because as soon as any entry was processed but  no zapped, you can
>>>> immediately give up on reclaiming that table.
>>>
>>> Yes, we can set can_reclaim_pt to false when a !pte_none() entry is
>>> found in count_pte_none().
>>
>> I'm not sure if well need cont_pte_none(), but I'll have to take a look
>> at your new patch to see how this fits together with doing the pte_none
>> detection+skipping in do_zap_pte_range().
>>
>> I was wondering if you cannot simply avoid the additional scanning and
>> simply set "can_reclaim_pt" if you skip a zap.
> 
> Maybe we can return the information whether the zap was skipped from
> zap_present_ptes() and zap_nonpresent_ptes() through parameters like I
> did in [PATCH v1 3/7] and [PATCH v1 4/7].
> 
> In theory, we can detect empty PTE pages in the following two ways:
> 
> 1) If no zap is skipped, it means that all pte entries have been
>      zap, and the PTE page must be empty.
> 2) If all pte entries are detected to be none, then the PTE page is
>      empty.
> 
> In the error case, 1) may cause non-empty PTE pages to be reclaimed
> (which is unacceptable), while the 2) will at most cause empty PTE pages
> to not be reclaimed.
> 
> So the most reliable and efficient method may be:
> 
> a. If there is a zap that is skipped, stop scanning and do not reclaim
>      the PTE page;
> b. Otherwise, as now, detect the empty PTE page through count_pte_none()

Is there a need for count_pte_none() that I am missing?

Assume we have

nr = do_zap_pte_range(&any_skipped)


If "nr" is the number of processed entries (including pte_none()), and
"any_skipped" is set whenever we skipped to zap a !pte_none entry, we 
can detect what we need, no?

If any_skipped == false after the call, we now have "nr" pte_none() 
entries. -> We can continue trying to reclaim

If any_skipped == true, we have at least one !pte_none() entry among the 
"nr" entries. -> We cannot and must not reclaim.
Qi Zheng Nov. 18, 2024, 10:34 a.m. UTC | #12
On 2024/11/18 17:29, David Hildenbrand wrote:
> On 18.11.24 04:35, Qi Zheng wrote:
>>
>>
>> On 2024/11/15 22:59, David Hildenbrand wrote:
>>> On 15.11.24 15:41, Qi Zheng wrote:
>>>>
>>>>
>>>> On 2024/11/15 18:22, David Hildenbrand wrote:
>>>>>>>> *nr_skip = nr;
>>>>>>>>
>>>>>>>> and then:
>>>>>>>>
>>>>>>>> zap_pte_range
>>>>>>>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details,
>>>>>>>> &skip_nr,
>>>>>>>>                             rss, &force_flush, &force_break);
>>>>>>>>          if (can_reclaim_pt) {
>>>>>>>>              none_nr += count_pte_none(pte, nr);
>>>>>>>>              none_nr += nr_skip;
>>>>>>>>          }
>>>>>>>>
>>>>>>>> Right?
>>>>>>>
>>>>>>> Yes. I did not look closely at the patch that adds the counting of
>>>>>>
>>>>>> Got it.
>>>>>>
>>>>>>> pte_none though (to digest why it is required :) ).
>>>>>>
>>>>>> Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
>>>>>> empty PTE page.
>>>>>
>>>>> Okay, so the problem is that "nr" would be "all processed entries" but
>>>>> there are cases where we "process an entry but not zap it".
>>>>>
>>>>> What you really only want to know is "was any entry not zapped", which
>>>>> could be a simple input boolean variable passed into do_zap_pte_range?
>>>>>
>>>>> Because as soon as any entry was processed but  no zapped, you can
>>>>> immediately give up on reclaiming that table.
>>>>
>>>> Yes, we can set can_reclaim_pt to false when a !pte_none() entry is
>>>> found in count_pte_none().
>>>
>>> I'm not sure if well need cont_pte_none(), but I'll have to take a look
>>> at your new patch to see how this fits together with doing the pte_none
>>> detection+skipping in do_zap_pte_range().
>>>
>>> I was wondering if you cannot simply avoid the additional scanning and
>>> simply set "can_reclaim_pt" if you skip a zap.
>>
>> Maybe we can return the information whether the zap was skipped from
>> zap_present_ptes() and zap_nonpresent_ptes() through parameters like I
>> did in [PATCH v1 3/7] and [PATCH v1 4/7].
>>
>> In theory, we can detect empty PTE pages in the following two ways:
>>
>> 1) If no zap is skipped, it means that all pte entries have been
>>      zap, and the PTE page must be empty.
>> 2) If all pte entries are detected to be none, then the PTE page is
>>      empty.
>>
>> In the error case, 1) may cause non-empty PTE pages to be reclaimed
>> (which is unacceptable), while the 2) will at most cause empty PTE pages
>> to not be reclaimed.
>>
>> So the most reliable and efficient method may be:
>>
>> a. If there is a zap that is skipped, stop scanning and do not reclaim
>>      the PTE page;
>> b. Otherwise, as now, detect the empty PTE page through count_pte_none()
> 
> Is there a need for count_pte_none() that I am missing?

When any_skipped == false, at least add VM_BUG_ON() to recheck none ptes.

> 
> Assume we have
> 
> nr = do_zap_pte_range(&any_skipped)
> 
> 
> If "nr" is the number of processed entries (including pte_none()), and
> "any_skipped" is set whenever we skipped to zap a !pte_none entry, we 
> can detect what we need, no?
> 
> If any_skipped == false after the call, we now have "nr" pte_none() 
> entries. -> We can continue trying to reclaim

I prefer that "nr" should not include pte_none().

Something like this:

nr = do_zap_pte_range(&any_skipped);
if (can_reclaim_pt) {
	VM_BUG_ON(!any_skipped && count_pte_none(nr) == nr);
	if (any_skipped)
		can_reclaim_pt = false;
}

nr: the number of processed entries (excluding pte_none())
any_skipped: set whenever we skipped to zap a !pte_none entry


```
do_zap_pte_range
-->     pte_t ptent = ptep_get(pte);
	int max_nr = (end - addr) / PAGE_SIZE;

	/* Skip all consecutive pte_none(). */
	if (pte_none(ptent)) {
		int nr;

		for (nr = 1; nr < max_nr; nr++) {
			ptent = ptep_get(pte + nr);
			if (!pte_none(ptent))
				break;
		}
		max_nr -= nr;
		if (!max_nr)
			return 0;
		pte += nr;
		addr += nr * PAGE_SIZE;
	}

	if (pte_present(ptent))
		return zap_present_ptes(tlb, vma, pte, ptent, max_nr,
					addr, details, rss, force_flush,
					force_break, any_skipped);

	return zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
					 details, rss, any_skipped);
```

> 
> If any_skipped == true, we have at least one !pte_none() entry among the 
> "nr" entries. -> We cannot and must not reclaim.

>
David Hildenbrand Nov. 18, 2024, 10:41 a.m. UTC | #13
On 18.11.24 11:34, Qi Zheng wrote:
> 
> 
> On 2024/11/18 17:29, David Hildenbrand wrote:
>> On 18.11.24 04:35, Qi Zheng wrote:
>>>
>>>
>>> On 2024/11/15 22:59, David Hildenbrand wrote:
>>>> On 15.11.24 15:41, Qi Zheng wrote:
>>>>>
>>>>>
>>>>> On 2024/11/15 18:22, David Hildenbrand wrote:
>>>>>>>>> *nr_skip = nr;
>>>>>>>>>
>>>>>>>>> and then:
>>>>>>>>>
>>>>>>>>> zap_pte_range
>>>>>>>>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details,
>>>>>>>>> &skip_nr,
>>>>>>>>>                              rss, &force_flush, &force_break);
>>>>>>>>>           if (can_reclaim_pt) {
>>>>>>>>>               none_nr += count_pte_none(pte, nr);
>>>>>>>>>               none_nr += nr_skip;
>>>>>>>>>           }
>>>>>>>>>
>>>>>>>>> Right?
>>>>>>>>
>>>>>>>> Yes. I did not look closely at the patch that adds the counting of
>>>>>>>
>>>>>>> Got it.
>>>>>>>
>>>>>>>> pte_none though (to digest why it is required :) ).
>>>>>>>
>>>>>>> Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
>>>>>>> empty PTE page.
>>>>>>
>>>>>> Okay, so the problem is that "nr" would be "all processed entries" but
>>>>>> there are cases where we "process an entry but not zap it".
>>>>>>
>>>>>> What you really only want to know is "was any entry not zapped", which
>>>>>> could be a simple input boolean variable passed into do_zap_pte_range?
>>>>>>
>>>>>> Because as soon as any entry was processed but  no zapped, you can
>>>>>> immediately give up on reclaiming that table.
>>>>>
>>>>> Yes, we can set can_reclaim_pt to false when a !pte_none() entry is
>>>>> found in count_pte_none().
>>>>
>>>> I'm not sure if well need cont_pte_none(), but I'll have to take a look
>>>> at your new patch to see how this fits together with doing the pte_none
>>>> detection+skipping in do_zap_pte_range().
>>>>
>>>> I was wondering if you cannot simply avoid the additional scanning and
>>>> simply set "can_reclaim_pt" if you skip a zap.
>>>
>>> Maybe we can return the information whether the zap was skipped from
>>> zap_present_ptes() and zap_nonpresent_ptes() through parameters like I
>>> did in [PATCH v1 3/7] and [PATCH v1 4/7].
>>>
>>> In theory, we can detect empty PTE pages in the following two ways:
>>>
>>> 1) If no zap is skipped, it means that all pte entries have been
>>>       zap, and the PTE page must be empty.
>>> 2) If all pte entries are detected to be none, then the PTE page is
>>>       empty.
>>>
>>> In the error case, 1) may cause non-empty PTE pages to be reclaimed
>>> (which is unacceptable), while the 2) will at most cause empty PTE pages
>>> to not be reclaimed.
>>>
>>> So the most reliable and efficient method may be:
>>>
>>> a. If there is a zap that is skipped, stop scanning and do not reclaim
>>>       the PTE page;
>>> b. Otherwise, as now, detect the empty PTE page through count_pte_none()
>>
>> Is there a need for count_pte_none() that I am missing?
> 
> When any_skipped == false, at least add VM_BUG_ON() to recheck none ptes.
> 
>>
>> Assume we have
>>
>> nr = do_zap_pte_range(&any_skipped)
>>
>>
>> If "nr" is the number of processed entries (including pte_none()), and
>> "any_skipped" is set whenever we skipped to zap a !pte_none entry, we
>> can detect what we need, no?
>>
>> If any_skipped == false after the call, we now have "nr" pte_none()
>> entries. -> We can continue trying to reclaim
> 
> I prefer that "nr" should not include pte_none().
> 

Why? do_zap_pte_range() should tell you how far to advance, nothing 
less, nothing more.

Let's just keep it simple and avoid count_pte_none().

I'm probably missing something important?
Qi Zheng Nov. 18, 2024, 10:56 a.m. UTC | #14
On 2024/11/18 18:41, David Hildenbrand wrote:
> On 18.11.24 11:34, Qi Zheng wrote:
>>
>>
>> On 2024/11/18 17:29, David Hildenbrand wrote:
>>> On 18.11.24 04:35, Qi Zheng wrote:
>>>>
>>>>
>>>> On 2024/11/15 22:59, David Hildenbrand wrote:
>>>>> On 15.11.24 15:41, Qi Zheng wrote:
>>>>>>
>>>>>>
>>>>>> On 2024/11/15 18:22, David Hildenbrand wrote:
>>>>>>>>>> *nr_skip = nr;
>>>>>>>>>>
>>>>>>>>>> and then:
>>>>>>>>>>
>>>>>>>>>> zap_pte_range
>>>>>>>>>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details,
>>>>>>>>>> &skip_nr,
>>>>>>>>>>                              rss, &force_flush, &force_break);
>>>>>>>>>>           if (can_reclaim_pt) {
>>>>>>>>>>               none_nr += count_pte_none(pte, nr);
>>>>>>>>>>               none_nr += nr_skip;
>>>>>>>>>>           }
>>>>>>>>>>
>>>>>>>>>> Right?
>>>>>>>>>
>>>>>>>>> Yes. I did not look closely at the patch that adds the counting of
>>>>>>>>
>>>>>>>> Got it.
>>>>>>>>
>>>>>>>>> pte_none though (to digest why it is required :) ).
>>>>>>>>
>>>>>>>> Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
>>>>>>>> empty PTE page.
>>>>>>>
>>>>>>> Okay, so the problem is that "nr" would be "all processed 
>>>>>>> entries" but
>>>>>>> there are cases where we "process an entry but not zap it".
>>>>>>>
>>>>>>> What you really only want to know is "was any entry not zapped", 
>>>>>>> which
>>>>>>> could be a simple input boolean variable passed into 
>>>>>>> do_zap_pte_range?
>>>>>>>
>>>>>>> Because as soon as any entry was processed but  no zapped, you can
>>>>>>> immediately give up on reclaiming that table.
>>>>>>
>>>>>> Yes, we can set can_reclaim_pt to false when a !pte_none() entry is
>>>>>> found in count_pte_none().
>>>>>
>>>>> I'm not sure if well need cont_pte_none(), but I'll have to take a 
>>>>> look
>>>>> at your new patch to see how this fits together with doing the 
>>>>> pte_none
>>>>> detection+skipping in do_zap_pte_range().
>>>>>
>>>>> I was wondering if you cannot simply avoid the additional scanning and
>>>>> simply set "can_reclaim_pt" if you skip a zap.
>>>>
>>>> Maybe we can return the information whether the zap was skipped from
>>>> zap_present_ptes() and zap_nonpresent_ptes() through parameters like I
>>>> did in [PATCH v1 3/7] and [PATCH v1 4/7].
>>>>
>>>> In theory, we can detect empty PTE pages in the following two ways:
>>>>
>>>> 1) If no zap is skipped, it means that all pte entries have been
>>>>       zap, and the PTE page must be empty.
>>>> 2) If all pte entries are detected to be none, then the PTE page is
>>>>       empty.
>>>>
>>>> In the error case, 1) may cause non-empty PTE pages to be reclaimed
>>>> (which is unacceptable), while the 2) will at most cause empty PTE 
>>>> pages
>>>> to not be reclaimed.
>>>>
>>>> So the most reliable and efficient method may be:
>>>>
>>>> a. If there is a zap that is skipped, stop scanning and do not reclaim
>>>>       the PTE page;
>>>> b. Otherwise, as now, detect the empty PTE page through 
>>>> count_pte_none()
>>>
>>> Is there a need for count_pte_none() that I am missing?
>>
>> When any_skipped == false, at least add VM_BUG_ON() to recheck none ptes.
>>
>>>
>>> Assume we have
>>>
>>> nr = do_zap_pte_range(&any_skipped)
>>>
>>>
>>> If "nr" is the number of processed entries (including pte_none()), and
>>> "any_skipped" is set whenever we skipped to zap a !pte_none entry, we
>>> can detect what we need, no?
>>>
>>> If any_skipped == false after the call, we now have "nr" pte_none()
>>> entries. -> We can continue trying to reclaim
>>
>> I prefer that "nr" should not include pte_none().
>>
> 
> Why? do_zap_pte_range() should tell you how far to advance, nothing 
> less, nothing more.
> 
> Let's just keep it simple and avoid count_pte_none().
> 
> I'm probably missing something important?

As we discussed before, we should skip all consecutive none ptes,
pte and addr are already incremented before returning.

>
David Hildenbrand Nov. 18, 2024, 10:59 a.m. UTC | #15
On 18.11.24 11:56, Qi Zheng wrote:
> 
> 
> On 2024/11/18 18:41, David Hildenbrand wrote:
>> On 18.11.24 11:34, Qi Zheng wrote:
>>>
>>>
>>> On 2024/11/18 17:29, David Hildenbrand wrote:
>>>> On 18.11.24 04:35, Qi Zheng wrote:
>>>>>
>>>>>
>>>>> On 2024/11/15 22:59, David Hildenbrand wrote:
>>>>>> On 15.11.24 15:41, Qi Zheng wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 2024/11/15 18:22, David Hildenbrand wrote:
>>>>>>>>>>> *nr_skip = nr;
>>>>>>>>>>>
>>>>>>>>>>> and then:
>>>>>>>>>>>
>>>>>>>>>>> zap_pte_range
>>>>>>>>>>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details,
>>>>>>>>>>> &skip_nr,
>>>>>>>>>>>                               rss, &force_flush, &force_break);
>>>>>>>>>>>            if (can_reclaim_pt) {
>>>>>>>>>>>                none_nr += count_pte_none(pte, nr);
>>>>>>>>>>>                none_nr += nr_skip;
>>>>>>>>>>>            }
>>>>>>>>>>>
>>>>>>>>>>> Right?
>>>>>>>>>>
>>>>>>>>>> Yes. I did not look closely at the patch that adds the counting of
>>>>>>>>>
>>>>>>>>> Got it.
>>>>>>>>>
>>>>>>>>>> pte_none though (to digest why it is required :) ).
>>>>>>>>>
>>>>>>>>> Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
>>>>>>>>> empty PTE page.
>>>>>>>>
>>>>>>>> Okay, so the problem is that "nr" would be "all processed
>>>>>>>> entries" but
>>>>>>>> there are cases where we "process an entry but not zap it".
>>>>>>>>
>>>>>>>> What you really only want to know is "was any entry not zapped",
>>>>>>>> which
>>>>>>>> could be a simple input boolean variable passed into
>>>>>>>> do_zap_pte_range?
>>>>>>>>
>>>>>>>> Because as soon as any entry was processed but  no zapped, you can
>>>>>>>> immediately give up on reclaiming that table.
>>>>>>>
>>>>>>> Yes, we can set can_reclaim_pt to false when a !pte_none() entry is
>>>>>>> found in count_pte_none().
>>>>>>
>>>>>> I'm not sure if well need cont_pte_none(), but I'll have to take a
>>>>>> look
>>>>>> at your new patch to see how this fits together with doing the
>>>>>> pte_none
>>>>>> detection+skipping in do_zap_pte_range().
>>>>>>
>>>>>> I was wondering if you cannot simply avoid the additional scanning and
>>>>>> simply set "can_reclaim_pt" if you skip a zap.
>>>>>
>>>>> Maybe we can return the information whether the zap was skipped from
>>>>> zap_present_ptes() and zap_nonpresent_ptes() through parameters like I
>>>>> did in [PATCH v1 3/7] and [PATCH v1 4/7].
>>>>>
>>>>> In theory, we can detect empty PTE pages in the following two ways:
>>>>>
>>>>> 1) If no zap is skipped, it means that all pte entries have been
>>>>>        zap, and the PTE page must be empty.
>>>>> 2) If all pte entries are detected to be none, then the PTE page is
>>>>>        empty.
>>>>>
>>>>> In the error case, 1) may cause non-empty PTE pages to be reclaimed
>>>>> (which is unacceptable), while the 2) will at most cause empty PTE
>>>>> pages
>>>>> to not be reclaimed.
>>>>>
>>>>> So the most reliable and efficient method may be:
>>>>>
>>>>> a. If there is a zap that is skipped, stop scanning and do not reclaim
>>>>>        the PTE page;
>>>>> b. Otherwise, as now, detect the empty PTE page through
>>>>> count_pte_none()
>>>>
>>>> Is there a need for count_pte_none() that I am missing?
>>>
>>> When any_skipped == false, at least add VM_BUG_ON() to recheck none ptes.
>>>
>>>>
>>>> Assume we have
>>>>
>>>> nr = do_zap_pte_range(&any_skipped)
>>>>
>>>>
>>>> If "nr" is the number of processed entries (including pte_none()), and
>>>> "any_skipped" is set whenever we skipped to zap a !pte_none entry, we
>>>> can detect what we need, no?
>>>>
>>>> If any_skipped == false after the call, we now have "nr" pte_none()
>>>> entries. -> We can continue trying to reclaim
>>>
>>> I prefer that "nr" should not include pte_none().
>>>
>>
>> Why? do_zap_pte_range() should tell you how far to advance, nothing
>> less, nothing more.
>>
>> Let's just keep it simple and avoid count_pte_none().
>>
>> I'm probably missing something important?
> 
> As we discussed before, we should skip all consecutive none ptes,
 > pte and addr are already incremented before returning.

It's probably best to send the resulting patch so I can either 
understand why count_pte_none() is required or comment on how to get rid 
of it.
Qi Zheng Nov. 18, 2024, 11:13 a.m. UTC | #16
On 2024/11/18 18:59, David Hildenbrand wrote:
> On 18.11.24 11:56, Qi Zheng wrote:
>>
>>
>> On 2024/11/18 18:41, David Hildenbrand wrote:
>>> On 18.11.24 11:34, Qi Zheng wrote:
>>>>
>>>>
>>>> On 2024/11/18 17:29, David Hildenbrand wrote:
>>>>> On 18.11.24 04:35, Qi Zheng wrote:
>>>>>>
>>>>>>
>>>>>> On 2024/11/15 22:59, David Hildenbrand wrote:
>>>>>>> On 15.11.24 15:41, Qi Zheng wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> On 2024/11/15 18:22, David Hildenbrand wrote:
>>>>>>>>>>>> *nr_skip = nr;
>>>>>>>>>>>>
>>>>>>>>>>>> and then:
>>>>>>>>>>>>
>>>>>>>>>>>> zap_pte_range
>>>>>>>>>>>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details,
>>>>>>>>>>>> &skip_nr,
>>>>>>>>>>>>                               rss, &force_flush, &force_break);
>>>>>>>>>>>>            if (can_reclaim_pt) {
>>>>>>>>>>>>                none_nr += count_pte_none(pte, nr);
>>>>>>>>>>>>                none_nr += nr_skip;
>>>>>>>>>>>>            }
>>>>>>>>>>>>
>>>>>>>>>>>> Right?
>>>>>>>>>>>
>>>>>>>>>>> Yes. I did not look closely at the patch that adds the 
>>>>>>>>>>> counting of
>>>>>>>>>>
>>>>>>>>>> Got it.
>>>>>>>>>>
>>>>>>>>>>> pte_none though (to digest why it is required :) ).
>>>>>>>>>>
>>>>>>>>>> Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
>>>>>>>>>> empty PTE page.
>>>>>>>>>
>>>>>>>>> Okay, so the problem is that "nr" would be "all processed
>>>>>>>>> entries" but
>>>>>>>>> there are cases where we "process an entry but not zap it".
>>>>>>>>>
>>>>>>>>> What you really only want to know is "was any entry not zapped",
>>>>>>>>> which
>>>>>>>>> could be a simple input boolean variable passed into
>>>>>>>>> do_zap_pte_range?
>>>>>>>>>
>>>>>>>>> Because as soon as any entry was processed but  no zapped, you can
>>>>>>>>> immediately give up on reclaiming that table.
>>>>>>>>
>>>>>>>> Yes, we can set can_reclaim_pt to false when a !pte_none() entry is
>>>>>>>> found in count_pte_none().
>>>>>>>
>>>>>>> I'm not sure if well need cont_pte_none(), but I'll have to take a
>>>>>>> look
>>>>>>> at your new patch to see how this fits together with doing the
>>>>>>> pte_none
>>>>>>> detection+skipping in do_zap_pte_range().
>>>>>>>
>>>>>>> I was wondering if you cannot simply avoid the additional 
>>>>>>> scanning and
>>>>>>> simply set "can_reclaim_pt" if you skip a zap.
>>>>>>
>>>>>> Maybe we can return the information whether the zap was skipped from
>>>>>> zap_present_ptes() and zap_nonpresent_ptes() through parameters 
>>>>>> like I
>>>>>> did in [PATCH v1 3/7] and [PATCH v1 4/7].
>>>>>>
>>>>>> In theory, we can detect empty PTE pages in the following two ways:
>>>>>>
>>>>>> 1) If no zap is skipped, it means that all pte entries have been
>>>>>>        zap, and the PTE page must be empty.
>>>>>> 2) If all pte entries are detected to be none, then the PTE page is
>>>>>>        empty.
>>>>>>
>>>>>> In the error case, 1) may cause non-empty PTE pages to be reclaimed
>>>>>> (which is unacceptable), while the 2) will at most cause empty PTE
>>>>>> pages
>>>>>> to not be reclaimed.
>>>>>>
>>>>>> So the most reliable and efficient method may be:
>>>>>>
>>>>>> a. If there is a zap that is skipped, stop scanning and do not 
>>>>>> reclaim
>>>>>>        the PTE page;
>>>>>> b. Otherwise, as now, detect the empty PTE page through
>>>>>> count_pte_none()
>>>>>
>>>>> Is there a need for count_pte_none() that I am missing?
>>>>
>>>> When any_skipped == false, at least add VM_BUG_ON() to recheck none 
>>>> ptes.
>>>>
>>>>>
>>>>> Assume we have
>>>>>
>>>>> nr = do_zap_pte_range(&any_skipped)
>>>>>
>>>>>
>>>>> If "nr" is the number of processed entries (including pte_none()), and
>>>>> "any_skipped" is set whenever we skipped to zap a !pte_none entry, we
>>>>> can detect what we need, no?
>>>>>
>>>>> If any_skipped == false after the call, we now have "nr" pte_none()
>>>>> entries. -> We can continue trying to reclaim
>>>>
>>>> I prefer that "nr" should not include pte_none().
>>>>
>>>
>>> Why? do_zap_pte_range() should tell you how far to advance, nothing
>>> less, nothing more.
>>>
>>> Let's just keep it simple and avoid count_pte_none().
>>>
>>> I'm probably missing something important?
>>
>> As we discussed before, we should skip all consecutive none ptes,
>  > pte and addr are already incremented before returning.
> 
> It's probably best to send the resulting patch so I can either 
> understand why count_pte_none() is required or comment on how to get rid 
> of it.

Something like this:

diff --git a/mm/memory.c b/mm/memory.c
index bd9ebe0f4471f..e9bec3cd49d44 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1657,6 +1657,66 @@ static inline int zap_nonpresent_ptes(struct 
mmu_gather *tlb,
         return nr;
  }

+static inline int do_zap_pte_range(struct mmu_gather *tlb,
+                                  struct vm_area_struct *vma, pte_t *pte,
+                                  unsigned long addr, unsigned long end,
+                                  struct zap_details *details, int *rss,
+                                  bool *force_flush, bool *force_break,
+                                  bool *any_skipped)
+{
+       pte_t ptent = ptep_get(pte);
+       int max_nr = (end - addr) / PAGE_SIZE;
+
+       /* Skip all consecutive pte_none(). */
+       if (pte_none(ptent)) {
+               int nr;
+
+               for (nr = 1; nr < max_nr; nr++) {
+                       ptent = ptep_get(pte + nr);
+                       if (!pte_none(ptent))
+                               break;
+               }
+               max_nr -= nr;
+               if (!max_nr)
+                       return 0;
+               pte += nr;
+               addr += nr * PAGE_SIZE;
+       }
+
+       if (pte_present(ptent))
+               return zap_present_ptes(tlb, vma, pte, ptent, max_nr,
+                                       addr, details, rss, force_flush,
+                                       force_break, any_skipped);
+
+       return zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
+                                  details, rss, any_skipped);
+}
+
+static inline int count_pte_none(pte_t *pte, int nr)
+{
+       int none_nr = 0;
+
+       /*
+        * If PTE_MARKER_UFFD_WP is enabled, the uffd-wp PTEs may be
+        * re-installed, so we need to check pte_none() one by one.
+        * Otherwise, checking a single PTE in a batch is sufficient.
+        */
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
+       for (;;) {
+               if (pte_none(ptep_get(pte)))
+                       none_nr++;
+               if (--nr == 0)
+                       break;
+               pte++;
+       }
+#else
+       if (pte_none(ptep_get(pte)))
+               none_nr = nr;
+#endif
+       return none_nr;
+}
+
+
  static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                 struct vm_area_struct *vma, pmd_t *pmd,
                                 unsigned long addr, unsigned long end,
@@ -1667,6 +1727,7 @@ static unsigned long zap_pte_range(struct 
mmu_gather *tlb,
         int rss[NR_MM_COUNTERS];
         spinlock_t *ptl;
         pte_t *start_pte;
+       bool can_reclaim_pt;
         pte_t *pte;
         int nr;

@@ -1679,28 +1740,22 @@ static unsigned long zap_pte_range(struct 
mmu_gather *tlb,
         flush_tlb_batched_pending(mm);
         arch_enter_lazy_mmu_mode();
         do {
-               pte_t ptent = ptep_get(pte);
-               int max_nr;
-
-               nr = 1;
-               if (pte_none(ptent))
-                       continue;
+               bool any_skipped;

                 if (need_resched())
                         break;

-               max_nr = (end - addr) / PAGE_SIZE;
-               if (pte_present(ptent)) {
-                       nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
-                                             addr, details, rss, 
&force_flush,
-                                             &force_break);
-                       if (unlikely(force_break)) {
-                               addr += nr * PAGE_SIZE;
-                               break;
-                       }
-               } else {
-                       nr = zap_nonpresent_ptes(tlb, vma, pte, ptent, 
max_nr,
-                                                addr, details, rss);
+               nr = do_zap_pte_range(tlb, vma, pte, addr, end, details,
+                                     rss, &force_flush, &force_break,
+                                     &any_skipped);
+               if (can_reclaim_pt) {
+                       VM_BUG_ON(!any_skipped && count_pte_none(pte, 
nr) == nr);
+                       if (any_skipped)
+                               can_reclaim_pt = false;
+               }
+               if (unlikely(force_break)) {
+                       addr += nr * PAGE_SIZE;
+                       break;
                 }
         } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);


>
David Hildenbrand Nov. 19, 2024, 9:55 a.m. UTC | #17
On 18.11.24 12:13, Qi Zheng wrote:
> 
> 
> On 2024/11/18 18:59, David Hildenbrand wrote:
>> On 18.11.24 11:56, Qi Zheng wrote:
>>>
>>>
>>> On 2024/11/18 18:41, David Hildenbrand wrote:
>>>> On 18.11.24 11:34, Qi Zheng wrote:
>>>>>
>>>>>
>>>>> On 2024/11/18 17:29, David Hildenbrand wrote:
>>>>>> On 18.11.24 04:35, Qi Zheng wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 2024/11/15 22:59, David Hildenbrand wrote:
>>>>>>>> On 15.11.24 15:41, Qi Zheng wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 2024/11/15 18:22, David Hildenbrand wrote:
>>>>>>>>>>>>> *nr_skip = nr;
>>>>>>>>>>>>>
>>>>>>>>>>>>> and then:
>>>>>>>>>>>>>
>>>>>>>>>>>>> zap_pte_range
>>>>>>>>>>>>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details,
>>>>>>>>>>>>> &skip_nr,
>>>>>>>>>>>>>                                rss, &force_flush, &force_break);
>>>>>>>>>>>>>             if (can_reclaim_pt) {
>>>>>>>>>>>>>                 none_nr += count_pte_none(pte, nr);
>>>>>>>>>>>>>                 none_nr += nr_skip;
>>>>>>>>>>>>>             }
>>>>>>>>>>>>>
>>>>>>>>>>>>> Right?
>>>>>>>>>>>>
>>>>>>>>>>>> Yes. I did not look closely at the patch that adds the
>>>>>>>>>>>> counting of
>>>>>>>>>>>
>>>>>>>>>>> Got it.
>>>>>>>>>>>
>>>>>>>>>>>> pte_none though (to digest why it is required :) ).
>>>>>>>>>>>
>>>>>>>>>>> Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
>>>>>>>>>>> empty PTE page.
>>>>>>>>>>
>>>>>>>>>> Okay, so the problem is that "nr" would be "all processed
>>>>>>>>>> entries" but
>>>>>>>>>> there are cases where we "process an entry but not zap it".
>>>>>>>>>>
>>>>>>>>>> What you really only want to know is "was any entry not zapped",
>>>>>>>>>> which
>>>>>>>>>> could be a simple input boolean variable passed into
>>>>>>>>>> do_zap_pte_range?
>>>>>>>>>>
>>>>>>>>>> Because as soon as any entry was processed but  no zapped, you can
>>>>>>>>>> immediately give up on reclaiming that table.
>>>>>>>>>
>>>>>>>>> Yes, we can set can_reclaim_pt to false when a !pte_none() entry is
>>>>>>>>> found in count_pte_none().
>>>>>>>>
>>>>>>>> I'm not sure if well need cont_pte_none(), but I'll have to take a
>>>>>>>> look
>>>>>>>> at your new patch to see how this fits together with doing the
>>>>>>>> pte_none
>>>>>>>> detection+skipping in do_zap_pte_range().
>>>>>>>>
>>>>>>>> I was wondering if you cannot simply avoid the additional
>>>>>>>> scanning and
>>>>>>>> simply set "can_reclaim_pt" if you skip a zap.
>>>>>>>
>>>>>>> Maybe we can return the information whether the zap was skipped from
>>>>>>> zap_present_ptes() and zap_nonpresent_ptes() through parameters
>>>>>>> like I
>>>>>>> did in [PATCH v1 3/7] and [PATCH v1 4/7].
>>>>>>>
>>>>>>> In theory, we can detect empty PTE pages in the following two ways:
>>>>>>>
>>>>>>> 1) If no zap is skipped, it means that all pte entries have been
>>>>>>>         zap, and the PTE page must be empty.
>>>>>>> 2) If all pte entries are detected to be none, then the PTE page is
>>>>>>>         empty.
>>>>>>>
>>>>>>> In the error case, 1) may cause non-empty PTE pages to be reclaimed
>>>>>>> (which is unacceptable), while the 2) will at most cause empty PTE
>>>>>>> pages
>>>>>>> to not be reclaimed.
>>>>>>>
>>>>>>> So the most reliable and efficient method may be:
>>>>>>>
>>>>>>> a. If there is a zap that is skipped, stop scanning and do not
>>>>>>> reclaim
>>>>>>>         the PTE page;
>>>>>>> b. Otherwise, as now, detect the empty PTE page through
>>>>>>> count_pte_none()
>>>>>>
>>>>>> Is there a need for count_pte_none() that I am missing?
>>>>>
>>>>> When any_skipped == false, at least add VM_BUG_ON() to recheck none
>>>>> ptes.
>>>>>
>>>>>>
>>>>>> Assume we have
>>>>>>
>>>>>> nr = do_zap_pte_range(&any_skipped)
>>>>>>
>>>>>>
>>>>>> If "nr" is the number of processed entries (including pte_none()), and
>>>>>> "any_skipped" is set whenever we skipped to zap a !pte_none entry, we
>>>>>> can detect what we need, no?
>>>>>>
>>>>>> If any_skipped == false after the call, we now have "nr" pte_none()
>>>>>> entries. -> We can continue trying to reclaim
>>>>>
>>>>> I prefer that "nr" should not include pte_none().
>>>>>
>>>>
>>>> Why? do_zap_pte_range() should tell you how far to advance, nothing
>>>> less, nothing more.
>>>>
>>>> Let's just keep it simple and avoid count_pte_none().
>>>>
>>>> I'm probably missing something important?
>>>
>>> As we discussed before, we should skip all consecutive none ptes,
>>   > pte and addr are already incremented before returning.
>>
>> It's probably best to send the resulting patch so I can either
>> understand why count_pte_none() is required or comment on how to get rid
>> of it.
> 
> Something like this:
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index bd9ebe0f4471f..e9bec3cd49d44 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1657,6 +1657,66 @@ static inline int zap_nonpresent_ptes(struct
> mmu_gather *tlb,
>           return nr;
>    }
> 
> +static inline int do_zap_pte_range(struct mmu_gather *tlb,
> +                                  struct vm_area_struct *vma, pte_t *pte,
> +                                  unsigned long addr, unsigned long end,
> +                                  struct zap_details *details, int *rss,
> +                                  bool *force_flush, bool *force_break,
> +                                  bool *any_skipped)
> +{
> +       pte_t ptent = ptep_get(pte);
> +       int max_nr = (end - addr) / PAGE_SIZE;

I'd do here:

	  int nr = 0;

> +
> +       /* Skip all consecutive pte_none(). */
> +       if (pte_none(ptent)) {
> +

instead of the "int nr" here

> +               for (nr = 1; nr < max_nr; nr++) {
> +                       ptent = ptep_get(pte + nr);
> +                       if (!pte_none(ptent))
> +                               break;
> +               }
> +               max_nr -= nr;
> +               if (!max_nr)
> +                       return 0;
> +               pte += nr;
> +               addr += nr * PAGE_SIZE;
> +       }
> +
> +       if (pte_present(ptent))
> +               return zap_present_ptes(tlb, vma, pte, ptent, max_nr,
> +                                       addr, details, rss, force_flush,
> +                                       force_break, any_skipped);

and here:

if (pte_present(ptent))
	nr += zap_present_ptes(...)
else
	nr += zap_nonpresent_ptes();
return nr;

So you really return the number of processed items -- how much the 
caller should advance.

> +
> +       return zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
> +                                  details, rss, any_skipped);
> +}
> +
> +static inline int count_pte_none(pte_t *pte, int nr)
> +{
> +       int none_nr = 0;
> +
> +       /*
> +        * If PTE_MARKER_UFFD_WP is enabled, the uffd-wp PTEs may be
> +        * re-installed, so we need to check pte_none() one by one.
> +        * Otherwise, checking a single PTE in a batch is sufficient.
> +        */
> +#ifdef CONFIG_PTE_MARKER_UFFD_WP
> +       for (;;) {
> +               if (pte_none(ptep_get(pte)))
> +                       none_nr++;
> +               if (--nr == 0)
> +                       break;
> +               pte++;
> +       }
> +#else
> +       if (pte_none(ptep_get(pte)))
> +               none_nr = nr;
> +#endif
> +       return none_nr;
> +}
> +
> +
>    static unsigned long zap_pte_range(struct mmu_gather *tlb,
>                                   struct vm_area_struct *vma, pmd_t *pmd,
>                                   unsigned long addr, unsigned long end,
> @@ -1667,6 +1727,7 @@ static unsigned long zap_pte_range(struct
> mmu_gather *tlb,
>           int rss[NR_MM_COUNTERS];
>           spinlock_t *ptl;
>           pte_t *start_pte;
> +       bool can_reclaim_pt;
>           pte_t *pte;
>           int nr;
> 
> @@ -1679,28 +1740,22 @@ static unsigned long zap_pte_range(struct
> mmu_gather *tlb,
>           flush_tlb_batched_pending(mm);
>           arch_enter_lazy_mmu_mode();
>           do {
> -               pte_t ptent = ptep_get(pte);
> -               int max_nr;
> -
> -               nr = 1;
> -               if (pte_none(ptent))
> -                       continue;
> +               bool any_skipped;
> 
>                   if (need_resched())
>                           break;
> 
> -               max_nr = (end - addr) / PAGE_SIZE;
> -               if (pte_present(ptent)) {
> -                       nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
> -                                             addr, details, rss,
> &force_flush,
> -                                             &force_break);
> -                       if (unlikely(force_break)) {
> -                               addr += nr * PAGE_SIZE;
> -                               break;
> -                       }
> -               } else {
> -                       nr = zap_nonpresent_ptes(tlb, vma, pte, ptent,
> max_nr,
> -                                                addr, details, rss);
> +               nr = do_zap_pte_range(tlb, vma, pte, addr, end, details,
> +                                     rss, &force_flush, &force_break,
> +                                     &any_skipped);
> +               if (can_reclaim_pt) {
> +                       VM_BUG_ON(!any_skipped && count_pte_none(pte,
> nr) == nr);

Okay, so this is really only for debugging then? So we can safely drop 
that for now.

I assume it would make sense to check when reclaiming a page table with 
CONFIG_DEBUG_VM, that the table is actually all-pte_none instead?

(as a side note: no VM_BUG_ON, please :) )
Qi Zheng Nov. 19, 2024, 10:03 a.m. UTC | #18
On 2024/11/19 17:55, David Hildenbrand wrote:
> On 18.11.24 12:13, Qi Zheng wrote:
>>
>>
>> On 2024/11/18 18:59, David Hildenbrand wrote:
>>> On 18.11.24 11:56, Qi Zheng wrote:
>>>>
>>>>
>>>> On 2024/11/18 18:41, David Hildenbrand wrote:
>>>>> On 18.11.24 11:34, Qi Zheng wrote:
>>>>>>
>>>>>>
>>>>>> On 2024/11/18 17:29, David Hildenbrand wrote:
>>>>>>> On 18.11.24 04:35, Qi Zheng wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> On 2024/11/15 22:59, David Hildenbrand wrote:
>>>>>>>>> On 15.11.24 15:41, Qi Zheng wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On 2024/11/15 18:22, David Hildenbrand wrote:
>>>>>>>>>>>>>> *nr_skip = nr;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> and then:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> zap_pte_range
>>>>>>>>>>>>>> --> nr = do_zap_pte_range(tlb, vma, pte, addr, end, details,
>>>>>>>>>>>>>> &skip_nr,
>>>>>>>>>>>>>>                                rss, &force_flush, 
>>>>>>>>>>>>>> &force_break);
>>>>>>>>>>>>>>             if (can_reclaim_pt) {
>>>>>>>>>>>>>>                 none_nr += count_pte_none(pte, nr);
>>>>>>>>>>>>>>                 none_nr += nr_skip;
>>>>>>>>>>>>>>             }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Right?
>>>>>>>>>>>>>
>>>>>>>>>>>>> Yes. I did not look closely at the patch that adds the
>>>>>>>>>>>>> counting of
>>>>>>>>>>>>
>>>>>>>>>>>> Got it.
>>>>>>>>>>>>
>>>>>>>>>>>>> pte_none though (to digest why it is required :) ).
>>>>>>>>>>>>
>>>>>>>>>>>> Because 'none_nr == PTRS_PER_PTE' is used in patch #7 to detect
>>>>>>>>>>>> empty PTE page.
>>>>>>>>>>>
>>>>>>>>>>> Okay, so the problem is that "nr" would be "all processed
>>>>>>>>>>> entries" but
>>>>>>>>>>> there are cases where we "process an entry but not zap it".
>>>>>>>>>>>
>>>>>>>>>>> What you really only want to know is "was any entry not zapped",
>>>>>>>>>>> which
>>>>>>>>>>> could be a simple input boolean variable passed into
>>>>>>>>>>> do_zap_pte_range?
>>>>>>>>>>>
>>>>>>>>>>> Because as soon as any entry was processed but  no zapped, 
>>>>>>>>>>> you can
>>>>>>>>>>> immediately give up on reclaiming that table.
>>>>>>>>>>
>>>>>>>>>> Yes, we can set can_reclaim_pt to false when a !pte_none() 
>>>>>>>>>> entry is
>>>>>>>>>> found in count_pte_none().
>>>>>>>>>
>>>>>>>>> I'm not sure if well need cont_pte_none(), but I'll have to take a
>>>>>>>>> look
>>>>>>>>> at your new patch to see how this fits together with doing the
>>>>>>>>> pte_none
>>>>>>>>> detection+skipping in do_zap_pte_range().
>>>>>>>>>
>>>>>>>>> I was wondering if you cannot simply avoid the additional
>>>>>>>>> scanning and
>>>>>>>>> simply set "can_reclaim_pt" if you skip a zap.
>>>>>>>>
>>>>>>>> Maybe we can return the information whether the zap was skipped 
>>>>>>>> from
>>>>>>>> zap_present_ptes() and zap_nonpresent_ptes() through parameters
>>>>>>>> like I
>>>>>>>> did in [PATCH v1 3/7] and [PATCH v1 4/7].
>>>>>>>>
>>>>>>>> In theory, we can detect empty PTE pages in the following two ways:
>>>>>>>>
>>>>>>>> 1) If no zap is skipped, it means that all pte entries have been
>>>>>>>>         zap, and the PTE page must be empty.
>>>>>>>> 2) If all pte entries are detected to be none, then the PTE page is
>>>>>>>>         empty.
>>>>>>>>
>>>>>>>> In the error case, 1) may cause non-empty PTE pages to be reclaimed
>>>>>>>> (which is unacceptable), while the 2) will at most cause empty PTE
>>>>>>>> pages
>>>>>>>> to not be reclaimed.
>>>>>>>>
>>>>>>>> So the most reliable and efficient method may be:
>>>>>>>>
>>>>>>>> a. If there is a zap that is skipped, stop scanning and do not
>>>>>>>> reclaim
>>>>>>>>         the PTE page;
>>>>>>>> b. Otherwise, as now, detect the empty PTE page through
>>>>>>>> count_pte_none()
>>>>>>>
>>>>>>> Is there a need for count_pte_none() that I am missing?
>>>>>>
>>>>>> When any_skipped == false, at least add VM_BUG_ON() to recheck none
>>>>>> ptes.
>>>>>>
>>>>>>>
>>>>>>> Assume we have
>>>>>>>
>>>>>>> nr = do_zap_pte_range(&any_skipped)
>>>>>>>
>>>>>>>
>>>>>>> If "nr" is the number of processed entries (including 
>>>>>>> pte_none()), and
>>>>>>> "any_skipped" is set whenever we skipped to zap a !pte_none 
>>>>>>> entry, we
>>>>>>> can detect what we need, no?
>>>>>>>
>>>>>>> If any_skipped == false after the call, we now have "nr" pte_none()
>>>>>>> entries. -> We can continue trying to reclaim
>>>>>>
>>>>>> I prefer that "nr" should not include pte_none().
>>>>>>
>>>>>
>>>>> Why? do_zap_pte_range() should tell you how far to advance, nothing
>>>>> less, nothing more.
>>>>>
>>>>> Let's just keep it simple and avoid count_pte_none().
>>>>>
>>>>> I'm probably missing something important?
>>>>
>>>> As we discussed before, we should skip all consecutive none ptes,
>>>   > pte and addr are already incremented before returning.
>>>
>>> It's probably best to send the resulting patch so I can either
>>> understand why count_pte_none() is required or comment on how to get rid
>>> of it.
>>
>> Something like this:
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index bd9ebe0f4471f..e9bec3cd49d44 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -1657,6 +1657,66 @@ static inline int zap_nonpresent_ptes(struct
>> mmu_gather *tlb,
>>           return nr;
>>    }
>>
>> +static inline int do_zap_pte_range(struct mmu_gather *tlb,
>> +                                  struct vm_area_struct *vma, pte_t 
>> *pte,
>> +                                  unsigned long addr, unsigned long end,
>> +                                  struct zap_details *details, int *rss,
>> +                                  bool *force_flush, bool *force_break,
>> +                                  bool *any_skipped)
>> +{
>> +       pte_t ptent = ptep_get(pte);
>> +       int max_nr = (end - addr) / PAGE_SIZE;
> 
> I'd do here:
> 
>        int nr = 0;
> 
>> +
>> +       /* Skip all consecutive pte_none(). */
>> +       if (pte_none(ptent)) {
>> +
> 
> instead of the "int nr" here
> 
>> +               for (nr = 1; nr < max_nr; nr++) {
>> +                       ptent = ptep_get(pte + nr);
>> +                       if (!pte_none(ptent))
>> +                               break;
>> +               }
>> +               max_nr -= nr;
>> +               if (!max_nr)
>> +                       return 0;
>> +               pte += nr;
>> +               addr += nr * PAGE_SIZE;
>> +       }
>> +
>> +       if (pte_present(ptent))
>> +               return zap_present_ptes(tlb, vma, pte, ptent, max_nr,
>> +                                       addr, details, rss, force_flush,
>> +                                       force_break, any_skipped);
> 
> and here:
> 
> if (pte_present(ptent))
>      nr += zap_present_ptes(...)
> else
>      nr += zap_nonpresent_ptes();
> return nr;
> 
> So you really return the number of processed items -- how much the 
> caller should advance.

Got it, please ignore my previous stupid considerations. ;)

> 
>> +
>> +       return zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
>> +                                  details, rss, any_skipped);
>> +}
>> +
>> +static inline int count_pte_none(pte_t *pte, int nr)
>> +{
>> +       int none_nr = 0;
>> +
>> +       /*
>> +        * If PTE_MARKER_UFFD_WP is enabled, the uffd-wp PTEs may be
>> +        * re-installed, so we need to check pte_none() one by one.
>> +        * Otherwise, checking a single PTE in a batch is sufficient.
>> +        */
>> +#ifdef CONFIG_PTE_MARKER_UFFD_WP
>> +       for (;;) {
>> +               if (pte_none(ptep_get(pte)))
>> +                       none_nr++;
>> +               if (--nr == 0)
>> +                       break;
>> +               pte++;
>> +       }
>> +#else
>> +       if (pte_none(ptep_get(pte)))
>> +               none_nr = nr;
>> +#endif
>> +       return none_nr;
>> +}
>> +
>> +
>>    static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>                                   struct vm_area_struct *vma, pmd_t *pmd,
>>                                   unsigned long addr, unsigned long end,
>> @@ -1667,6 +1727,7 @@ static unsigned long zap_pte_range(struct
>> mmu_gather *tlb,
>>           int rss[NR_MM_COUNTERS];
>>           spinlock_t *ptl;
>>           pte_t *start_pte;
>> +       bool can_reclaim_pt;
>>           pte_t *pte;
>>           int nr;
>>
>> @@ -1679,28 +1740,22 @@ static unsigned long zap_pte_range(struct
>> mmu_gather *tlb,
>>           flush_tlb_batched_pending(mm);
>>           arch_enter_lazy_mmu_mode();
>>           do {
>> -               pte_t ptent = ptep_get(pte);
>> -               int max_nr;
>> -
>> -               nr = 1;
>> -               if (pte_none(ptent))
>> -                       continue;
>> +               bool any_skipped;
>>
>>                   if (need_resched())
>>                           break;
>>
>> -               max_nr = (end - addr) / PAGE_SIZE;
>> -               if (pte_present(ptent)) {
>> -                       nr = zap_present_ptes(tlb, vma, pte, ptent, 
>> max_nr,
>> -                                             addr, details, rss,
>> &force_flush,
>> -                                             &force_break);
>> -                       if (unlikely(force_break)) {
>> -                               addr += nr * PAGE_SIZE;
>> -                               break;
>> -                       }
>> -               } else {
>> -                       nr = zap_nonpresent_ptes(tlb, vma, pte, ptent,
>> max_nr,
>> -                                                addr, details, rss);
>> +               nr = do_zap_pte_range(tlb, vma, pte, addr, end, details,
>> +                                     rss, &force_flush, &force_break,
>> +                                     &any_skipped);
>> +               if (can_reclaim_pt) {
>> +                       VM_BUG_ON(!any_skipped && count_pte_none(pte,
>> nr) == nr);
> 
> Okay, so this is really only for debugging then? So we can safely drop 
> that for now.
> 
> I assume it would make sense to check when reclaiming a page table with 
> CONFIG_DEBUG_VM, that the table is actually all-pte_none instead?

Ah, make sense. Will change to it in the next version.

> 
> (as a side note: no VM_BUG_ON, please :) )

Got it.

Thanks!

>
diff mbox series

Patch

diff --git a/mm/memory.c b/mm/memory.c
index bd9ebe0f4471f..24633d0e1445a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1657,6 +1657,28 @@  static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
 	return nr;
 }
 
+static inline int skip_none_ptes(pte_t *pte, unsigned long addr,
+				 unsigned long end)
+{
+	pte_t ptent = ptep_get(pte);
+	int max_nr;
+	int nr;
+
+	if (!pte_none(ptent))
+		return 0;
+
+	max_nr = (end - addr) / PAGE_SIZE;
+	nr = 1;
+
+	for (; nr < max_nr; nr++) {
+		ptent = ptep_get(pte + nr);
+		if (!pte_none(ptent))
+			break;
+	}
+
+	return nr;
+}
+
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
@@ -1682,13 +1704,17 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		pte_t ptent = ptep_get(pte);
 		int max_nr;
 
-		nr = 1;
-		if (pte_none(ptent))
-			continue;
-
 		if (need_resched())
 			break;
 
+		nr = skip_none_ptes(pte, addr, end);
+		if (nr) {
+			addr += PAGE_SIZE * nr;
+			if (addr == end)
+				break;
+			pte += nr;
+		}
+
 		max_nr = (end - addr) / PAGE_SIZE;
 		if (pte_present(ptent)) {
 			nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,