
[v3] mempolicy: Optimize queue_folios_pte_range by PTE batching

Message ID 20250416053048.96479-1-dev.jain@arm.com (mailing list archive)
State New
Series [v3] mempolicy: Optimize queue_folios_pte_range by PTE batching

Commit Message

Dev Jain April 16, 2025, 5:30 a.m. UTC
After the check for queue_folio_required(), the code only cares about the
folio in the for loop, i.e. the PTEs are redundant. Therefore, optimize
this loop by skipping over a PTE batch mapping the same folio.

With a test program migrating pages of the calling process, which includes
a mapped VMA of size 4GB with pte-mapped large folios of order-9, and
migrating once back and forth between node-0 and node-1, the average
execution time reduces from 7.5 to 4 seconds, an approximately 47%
reduction.

v2->v3:
 - Don't use assignment in if condition

v1->v2:
 - Follow reverse xmas tree declarations
 - Don't initialize nr
 - Move folio_pte_batch() immediately after retrieving a normal folio
 - increment nr_failed in one shot

Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/mempolicy.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)
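
For context, here is a minimal userspace sketch of the kind of test program
described in the commit message. It is illustrative only: the node numbers,
the use of MADV_HUGEPAGE, and the migrate_pages() wrapper from libnuma's
<numaif.h> are assumptions rather than the author's actual test code, and
whether the mapping ends up backed by PTE-mapped order-9 folios depends on
the system's THP/mTHP configuration.

/*
 * Sketch: map and populate 4 GiB of anonymous memory, then migrate all
 * pages of the calling process from node 0 to node 1 and back, timing
 * the two migrate_pages(2) calls.  Build with: gcc test.c -lnuma
 */
#define _GNU_SOURCE
#include <numaif.h>		/* migrate_pages() wrapper from libnuma */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>
#include <unistd.h>

#define SZ	(4UL << 30)	/* 4 GiB */

int main(void)
{
	unsigned long old_nodes, new_nodes;
	struct timespec t0, t1;
	char *buf;
	int i;

	buf = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;
	madvise(buf, SZ, MADV_HUGEPAGE);
	memset(buf, 1, SZ);		/* fault the whole range in */

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < 2; i++) {	/* node 0 -> 1, then node 1 -> 0 */
		old_nodes = 1UL << (i & 1);
		new_nodes = 1UL << ((i + 1) & 1);
		/* second argument: number of bits in the node masks */
		if (migrate_pages(getpid(), 8 * sizeof(unsigned long),
				  &old_nodes, &new_nodes) < 0)
			perror("migrate_pages");
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("migration took %.2f s\n",
	       (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
	return 0;
}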

Comments

Baolin Wang April 16, 2025, 6:32 a.m. UTC | #1
On 2025/4/16 13:30, Dev Jain wrote:
> [...]
> @@ -630,7 +638,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
>   		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
>   		    !vma_migratable(vma) ||
>   		    !migrate_folio_add(folio, qp->pagelist, flags)) {
> -			qp->nr_failed++;
> +			qp->nr_failed += nr;

Sorry for chiming in late, but I am not convinced that 'qp->nr_failed' 
should add 'nr' when isolation fails.

From the comments of queue_pages_range():
"
* >0 - this number of misplaced folios could not be queued for moving
  *      (a hugetlbfs page or a transparent huge page being counted as 1).
"

That means if a large folio fails to be isolated, we should only add '1' 
to qp->nr_failed instead of the number of pages in this large folio. Right?
David Hildenbrand April 16, 2025, 8:22 a.m. UTC | #2
On 16.04.25 08:32, Baolin Wang wrote:
> 
> 
> On 2025/4/16 13:30, Dev Jain wrote:
>> [...]
>> @@ -630,7 +638,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
>>    		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
>>    		    !vma_migratable(vma) ||
>>    		    !migrate_folio_add(folio, qp->pagelist, flags)) {
>> -			qp->nr_failed++;
>> +			qp->nr_failed += nr;
> 
> Sorry for chiming in late, but I am not convinced that 'qp->nr_failed'
> should add 'nr' when isolation fails.

This patch does not change the existing behavior. But I stumbled over 
that as well ... and scratched my head.

> 
>   From the comments of queue_pages_range():
> "
> * >0 - this number of misplaced folios could not be queued for moving
>    *      (a hugetlbfs page or a transparent huge page being counted as 1).
> "
> 
> That means if a large folio is failed to isolate, we should only add '1'
> for qp->nr_failed instead of the number of pages in this large folio. Right?

I think what the doc really meant is "PMD-mapped THP". PTE-mapped THPs 
always had the same behavior: per PTE of the THP we would increment 
nr_failed by 1.

I assume returning "1" for PMD-mapped THPs was wrong from the beginning; 
it might only have been right for hugetlb pages.

With COW and similar things (VMA splits), reliably achieving "count each 
folio only once" is very hard.


Let's explore how "nr_failed" will get used.

1) do_mbind()

Only cares if "any failed", not the exact number.


2) migrate_pages()

Will return the number to user space, where documentation says:

"On success migrate_pages() returns the number of pages that could not 
be moved (i.e., a return of zero means that all pages were successfully 
moved)."

The man page does not document THP specifics AFAIK. I would assume most 
users care about "all migrated vs. any not migrated".


I would even feel confident to change the THP PMD-handling to return the 
actual *pages*.
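
As a concrete illustration of that last point, a typical caller of
migrate_pages(2) only distinguishes "everything was moved" from "something
was left behind". A hypothetical helper (move_all_pages() is made up for
illustration; the migrate_pages() wrapper from libnuma's <numaif.h> is
assumed) might look like:

#include <numaif.h>
#include <stdio.h>
#include <unistd.h>

/* Move all pages of this process from node 'from' to node 'to'. */
static int move_all_pages(int from, int to)
{
	unsigned long old_nodes = 1UL << from;
	unsigned long new_nodes = 1UL << to;
	long ret;

	ret = migrate_pages(getpid(), 8 * sizeof(unsigned long),
			    &old_nodes, &new_nodes);
	if (ret < 0) {			/* hard error */
		perror("migrate_pages");
		return -1;
	}
	if (ret > 0)			/* partial failure; exact count rarely matters */
		fprintf(stderr, "%ld page(s) not migrated\n", ret);
	return ret ? 1 : 0;
}

Whether that non-zero count is measured in base pages or in folios is, as
discussed above, largely invisible to such callers.
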
Baolin Wang April 16, 2025, 8:41 a.m. UTC | #3
On 2025/4/16 16:22, David Hildenbrand wrote:
> On 16.04.25 08:32, Baolin Wang wrote:
>>
>>
>> On 2025/4/16 13:30, Dev Jain wrote:
>>> [...]
>>> @@ -630,7 +638,7 @@ static int queue_folios_pte_range(pmd_t *pmd, 
>>> unsigned long addr,
>>>            if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
>>>                !vma_migratable(vma) ||
>>>                !migrate_folio_add(folio, qp->pagelist, flags)) {
>>> -            qp->nr_failed++;
>>> +            qp->nr_failed += nr;
>>
>> Sorry for chiming in late, but I am not convinced that 'qp->nr_failed'
>> should add 'nr' when isolation fails.
> 
> This patch does not change the existing behavior. But I stumbled over 
> that as well ... and scratched my head.
> 
>>
>>   From the comments of queue_pages_range():
>> "
>> * >0 - this number of misplaced folios could not be queued for moving
>>    *      (a hugetlbfs page or a transparent huge page being counted 
>> as 1).
>> "
>>
>> That means if a large folio is failed to isolate, we should only add '1'
>> for qp->nr_failed instead of the number of pages in this large folio. 
>> Right?
> 
> I think what the doc really meant is "PMD-mapped THP". PTE-mapped THPs 
> always had the same behavior: per PTE of the THP we would increment 
> nr_failed by 1.

No? For pte-mapped THPs, it only adds 1 for the large folio, since we 
have the check below in queue_folios_pte_range().

if (folio == qp->large)
	continue;

Or did I miss anything else?

> I assume returning "1" for PMD-mapped THPs was wrong from the beginning; 
> it might only have been right for hugetlb pages.
> 
> With COW and similar things (VMA splits), achieving "count each folio 
> only once" reliably is a very hard thing to achieve.
> 
> 
> Let's explore how "nr_failed" will get used.
> 
> 1) do_mbind()
> 
> Only cares if "any failed", not the exact number.
> 
> 
> 2) migrate_pages()
> 
> Will return the number to user space, where documentation says:
> 
> "On success migrate_pages() returns the number of pages that could not 
> be moved (i.e., a return of zero means that all pages were successfully 
> moved)."
> 
> man-page does not document THP specifics AFAIKs. I would assume most 
> users care about "all migrated vs. any not migrated".
> 
> 
> I would even feel confident to change the THP PMD-handling to return the 
> actual *pages*.
>
David Hildenbrand April 16, 2025, 8:51 a.m. UTC | #4
On 16.04.25 10:41, Baolin Wang wrote:
> 
> 
> On 2025/4/16 16:22, David Hildenbrand wrote:
>> On 16.04.25 08:32, Baolin Wang wrote:
>>>
>>>
>>> On 2025/4/16 13:30, Dev Jain wrote:
>>>> [...]
>>>> @@ -630,7 +638,7 @@ static int queue_folios_pte_range(pmd_t *pmd,
>>>> unsigned long addr,
>>>>             if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
>>>>                 !vma_migratable(vma) ||
>>>>                 !migrate_folio_add(folio, qp->pagelist, flags)) {
>>>> -            qp->nr_failed++;
>>>> +            qp->nr_failed += nr;
>>>
>>> Sorry for chiming in late, but I am not convinced that 'qp->nr_failed'
>>> should add 'nr' when isolation fails.
>>
>> This patch does not change the existing behavior. But I stumbled over
>> that as well ... and scratched my head.
>>
>>>
>>>    From the comments of queue_pages_range():
>>> "
>>> * >0 - this number of misplaced folios could not be queued for moving
>>>     *      (a hugetlbfs page or a transparent huge page being counted
>>> as 1).
>>> "
>>>
>>> That means if a large folio is failed to isolate, we should only add '1'
>>> for qp->nr_failed instead of the number of pages in this large folio.
>>> Right?
>>
>> I think what the doc really meant is "PMD-mapped THP". PTE-mapped THPs
>> always had the same behavior: per PTE of the THP we would increment
>> nr_failed by 1.
> 
> No? For pte-mapped THPs, it only adds 1 for the large folio, since we
> have below check in queue_folios_pte_range().
> 
> if (folio == qp->large)
> 	continue;
> 
> Or I missed anything else?

Ah, I got confused by that and thought it would only be for LRU 
isolation purposes.

Yeah, it will kind-of work for now and I think you are correct that we 
would only increment nr_failed by 1.

I still think that counting nr_failed that way is dubious. We should be 
counting pages, which is something that user space calling migrate_pages() 
can understand. Having it count arbitrary THP/large folio sizes is 
really questionable.

But that is indeed a separate issue to resolve.
David Hildenbrand April 16, 2025, 8:56 a.m. UTC | #5
On 16.04.25 10:51, David Hildenbrand wrote:
> On 16.04.25 10:41, Baolin Wang wrote:
>>
>>
>> On 2025/4/16 16:22, David Hildenbrand wrote:
>>> On 16.04.25 08:32, Baolin Wang wrote:
>>>>
>>>>
>>>> On 2025/4/16 13:30, Dev Jain wrote:
>>>>> [...]
>>>>> @@ -630,7 +638,7 @@ static int queue_folios_pte_range(pmd_t *pmd,
>>>>> unsigned long addr,
>>>>>              if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
>>>>>                  !vma_migratable(vma) ||
>>>>>                  !migrate_folio_add(folio, qp->pagelist, flags)) {
>>>>> -            qp->nr_failed++;
>>>>> +            qp->nr_failed += nr;
>>>>
>>>> Sorry for chiming in late, but I am not convinced that 'qp->nr_failed'
>>>> should add 'nr' when isolation fails.
>>>
>>> This patch does not change the existing behavior. But I stumbled over
>>> that as well ... and scratched my head.
>>>
>>>>
>>>>     From the comments of queue_pages_range():
>>>> "
>>>> * >0 - this number of misplaced folios could not be queued for moving
>>>>      *      (a hugetlbfs page or a transparent huge page being counted
>>>> as 1).
>>>> "
>>>>
>>>> That means if a large folio is failed to isolate, we should only add '1'
>>>> for qp->nr_failed instead of the number of pages in this large folio.
>>>> Right?
>>>
>>> I think what the doc really meant is "PMD-mapped THP". PTE-mapped THPs
>>> always had the same behavior: per PTE of the THP we would increment
>>> nr_failed by 1.
>>
>> No? For pte-mapped THPs, it only adds 1 for the large folio, since we
>> have below check in queue_folios_pte_range().
>>
>> if (folio == qp->large)
>> 	continue;
>>
>> Or I missed anything else?
> 
> Ah, I got confused by that and thought it would only be for LRU
> isolation purposes.
> 
> Yeah, it will kind-of work for now and I think you are correct that we
> would only increment nr_failed by 1.
> 
> I still think that counting nr_failed that way is dubious. We should be
> counting pages, which is something that user space from migrate_pages()
> could understand. Having it count arbitrary THPs/large folio sizes is
> really questionable.
> 
> But that is indeed a separate issue to resolve.

Digging into it:

commit 1cb5d11a370f661c5d0d888bb0cfc2cdc5791382
Author: Hugh Dickins <hughd@google.com>
Date:   Tue Oct 3 02:17:43 2023 -0700

     mempolicy: fix migrate_pages(2) syscall return nr_failed
     
     "man 2 migrate_pages" says "On success migrate_pages() returns the number
     of pages that could not be moved".  Although 5.3 and 5.4 commits fixed
     mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages
     could be moved (because some could not be isolated for migration),
     migrate_pages(2) was left still reporting only those pages failing at the
     migration stage, forgetting those failing at the earlier isolation stage.
     
     Fix that by accumulating a long nr_failed count in struct queue_pages,
     returned by queue_pages_range() when it's not returning an error, for
     adding on to the nr_failed count from migrate_pages() in mm/migrate.c.  A
     count of pages?  It's more a count of folios, but changing it to pages
     would entail more work (also in mm/migrate.c): does not seem justified.

Yeah, we should be counting pages, but likely nobody really cares, because we
only care if everything was migrated or something was not migrated.

Patch

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b28a1e6ae096..4d2dc8b63965 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -566,6 +566,7 @@  static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
 			unsigned long end, struct mm_walk *walk)
 {
+	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
 	struct vm_area_struct *vma = walk->vma;
 	struct folio *folio;
 	struct queue_pages *qp = walk->private;
@@ -573,6 +574,7 @@  static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
 	pte_t *pte, *mapped_pte;
 	pte_t ptent;
 	spinlock_t *ptl;
+	int max_nr, nr;
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
@@ -586,7 +588,9 @@  static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
 		walk->action = ACTION_AGAIN;
 		return 0;
 	}
-	for (; addr != end; pte++, addr += PAGE_SIZE) {
+	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
+		max_nr = (end - addr) >> PAGE_SHIFT;
+		nr = 1;
 		ptent = ptep_get(pte);
 		if (pte_none(ptent))
 			continue;
@@ -598,6 +602,10 @@  static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
 		folio = vm_normal_folio(vma, addr, ptent);
 		if (!folio || folio_is_zone_device(folio))
 			continue;
+		if (folio_test_large(folio) && max_nr != 1)
+			nr = folio_pte_batch(folio, addr, pte, ptent,
+					     max_nr, fpb_flags,
+					     NULL, NULL, NULL);
 		/*
 		 * vm_normal_folio() filters out zero pages, but there might
 		 * still be reserved folios to skip, perhaps in a VDSO.
@@ -630,7 +638,7 @@  static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
 		    !vma_migratable(vma) ||
 		    !migrate_folio_add(folio, qp->pagelist, flags)) {
-			qp->nr_failed++;
+			qp->nr_failed += nr;
 			if (strictly_unmovable(flags))
 				break;
 		}