
[v3,1/6] mm: memory: extend finish_fault() to support large folio

Message ID bf80d4a792ea82ab066f819ad7d10ed22a2f8e66.1717033868.git.baolin.wang@linux.alibaba.com (mailing list archive)
State New
Series add mTHP support for anonymous shmem

Commit Message

Baolin Wang May 30, 2024, 2:04 a.m. UTC
Add large folio mapping establishment support to finish_fault() as a
preparation for supporting multi-size THP allocation of anonymous shmem
pages in the following patches.

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 mm/memory.c | 58 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 10 deletions(-)

Comments

Lance Yang June 3, 2024, 4:44 a.m. UTC | #1
On Thu, May 30, 2024 at 10:04 AM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
> Add large folio mapping establishment support for finish_fault() as a preparation,
> to support multi-size THP allocation of anonymous shmem pages in the following
> patches.
>
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---
>  mm/memory.c | 58 ++++++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 48 insertions(+), 10 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index eef4e482c0c2..435187ff7ea4 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4831,9 +4831,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>  {
>         struct vm_area_struct *vma = vmf->vma;
>         struct page *page;
> +       struct folio *folio;
>         vm_fault_t ret;
>         bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>                       !(vma->vm_flags & VM_SHARED);
> +       int type, nr_pages, i;
> +       unsigned long addr = vmf->address;
>
>         /* Did we COW the page? */
>         if (is_cow)
> @@ -4864,24 +4867,59 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>                         return VM_FAULT_OOM;
>         }
>
> +       folio = page_folio(page);
> +       nr_pages = folio_nr_pages(folio);
> +
> +       /*
> +        * Using per-page fault to maintain the uffd semantics, and same
> +        * approach also applies to non-anonymous-shmem faults to avoid
> +        * inflating the RSS of the process.
> +        */
> +       if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
> +               nr_pages = 1;
> +       } else if (nr_pages > 1) {
> +               pgoff_t idx = folio_page_idx(folio, page);
> +               /* The page offset of vmf->address within the VMA. */
> +               pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
> +
> +               /*
> +                * Fallback to per-page fault in case the folio size in page
> +                * cache beyond the VMA limits.
> +                */
> +               if (unlikely(vma_off < idx ||
> +                            vma_off + (nr_pages - idx) > vma_pages(vma))) {
> +                       nr_pages = 1;
> +               } else {
> +                       /* Now we can set mappings for the whole large folio. */
> +                       addr = vmf->address - idx * PAGE_SIZE;
> +                       page = &folio->page;
> +               }
> +       }
> +
>         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> -                                     vmf->address, &vmf->ptl);
> +                                      addr, &vmf->ptl);
>         if (!vmf->pte)
>                 return VM_FAULT_NOPAGE;
>
>         /* Re-check under ptl */
> -       if (likely(!vmf_pte_changed(vmf))) {
> -               struct folio *folio = page_folio(page);
> -               int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
> -
> -               set_pte_range(vmf, folio, page, 1, vmf->address);
> -               add_mm_counter(vma->vm_mm, type, 1);
> -               ret = 0;
> -       } else {
> -               update_mmu_tlb(vma, vmf->address, vmf->pte);
> +       if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
> +               update_mmu_tlb(vma, addr, vmf->pte);
>                 ret = VM_FAULT_NOPAGE;
> +               goto unlock;
> +       } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
> +               for (i = 0; i < nr_pages; i++)
> +                       update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);

Just a friendly reminder: Bang has added the update_mmu_tlb_range()[1] batch
function to update the TLB in batches, so we can use it instead of the
update_mmu_tlb() loop.

[1] https://lore.kernel.org/linux-mm/20240522061204.117421-1-libang.li@antgroup.com/
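
If the helper keeps the (vma, address, ptep, nr) signature proposed in [1]
(an assumption on my part), the per-page loop above would collapse to a
single call, roughly:

	update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);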

Thanks,
Lance

> +               ret = VM_FAULT_NOPAGE;
> +               goto unlock;
>         }
>
> +       folio_ref_add(folio, nr_pages - 1);
> +       set_pte_range(vmf, folio, page, nr_pages, addr);
> +       type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
> +       add_mm_counter(vma->vm_mm, type, nr_pages);
> +       ret = 0;
> +
> +unlock:
>         pte_unmap_unlock(vmf->pte, vmf->ptl);
>         return ret;
>  }
> --
> 2.39.3
>
Barry Song June 3, 2024, 5:28 a.m. UTC | #2
On Thu, May 30, 2024 at 2:04 PM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
> Add large folio mapping establishment support for finish_fault() as a preparation,
> to support multi-size THP allocation of anonymous shmem pages in the following
> patches.
>
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---
>  mm/memory.c | 58 ++++++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 48 insertions(+), 10 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index eef4e482c0c2..435187ff7ea4 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4831,9 +4831,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>  {
>         struct vm_area_struct *vma = vmf->vma;
>         struct page *page;
> +       struct folio *folio;
>         vm_fault_t ret;
>         bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>                       !(vma->vm_flags & VM_SHARED);
> +       int type, nr_pages, i;
> +       unsigned long addr = vmf->address;
>
>         /* Did we COW the page? */
>         if (is_cow)
> @@ -4864,24 +4867,59 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>                         return VM_FAULT_OOM;
>         }
>
> +       folio = page_folio(page);
> +       nr_pages = folio_nr_pages(folio);
> +
> +       /*
> +        * Using per-page fault to maintain the uffd semantics, and same
> +        * approach also applies to non-anonymous-shmem faults to avoid
> +        * inflating the RSS of the process.

I don't feel the comment explains the root cause. For non-shmem, we have
allocated the memory anyway, so avoiding inflating the RSS seems not so
useful once the memory is already occupied; the memory footprint is what we
really care about. So is the intent to rely on the read-ahead hints of the
subpage to determine the read-ahead size? Is that why we don't map nr_pages
for non-shmem files, even though we could potentially save nr_pages - 1
page faults?

> +        */
> +       if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
> +               nr_pages = 1;
> +       } else if (nr_pages > 1) {
> +               pgoff_t idx = folio_page_idx(folio, page);
> +               /* The page offset of vmf->address within the VMA. */
> +               pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
> +
> +               /*
> +                * Fallback to per-page fault in case the folio size in page
> +                * cache beyond the VMA limits.
> +                */
> +               if (unlikely(vma_off < idx ||
> +                            vma_off + (nr_pages - idx) > vma_pages(vma))) {
> +                       nr_pages = 1;
> +               } else {
> +                       /* Now we can set mappings for the whole large folio. */
> +                       addr = vmf->address - idx * PAGE_SIZE;
> +                       page = &folio->page;
> +               }
> +       }
> +
>         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> -                                     vmf->address, &vmf->ptl);
> +                                      addr, &vmf->ptl);
>         if (!vmf->pte)
>                 return VM_FAULT_NOPAGE;
>
>         /* Re-check under ptl */
> -       if (likely(!vmf_pte_changed(vmf))) {
> -               struct folio *folio = page_folio(page);
> -               int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
> -
> -               set_pte_range(vmf, folio, page, 1, vmf->address);
> -               add_mm_counter(vma->vm_mm, type, 1);
> -               ret = 0;
> -       } else {
> -               update_mmu_tlb(vma, vmf->address, vmf->pte);
> +       if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
> +               update_mmu_tlb(vma, addr, vmf->pte);
>                 ret = VM_FAULT_NOPAGE;
> +               goto unlock;
> +       } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {

In what case can't we use !pte_range_none(vmf->pte, 1) for nr_pages == 1,
and then unify the code for nr_pages == 1 and nr_pages > 1?

It seems this has been discussed before, but I forget the reason.

> +               for (i = 0; i < nr_pages; i++)
> +                       update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
> +               ret = VM_FAULT_NOPAGE;
> +               goto unlock;
>         }
>
> +       folio_ref_add(folio, nr_pages - 1);
> +       set_pte_range(vmf, folio, page, nr_pages, addr);
> +       type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
> +       add_mm_counter(vma->vm_mm, type, nr_pages);
> +       ret = 0;
> +
> +unlock:
>         pte_unmap_unlock(vmf->pte, vmf->ptl);
>         return ret;
>  }
> --
> 2.39.3
>

Thanks
Barry
Baolin Wang June 3, 2024, 8:04 a.m. UTC | #3
On 2024/6/3 12:44, Lance Yang wrote:
> On Thu, May 30, 2024 at 10:04 AM Baolin Wang
> <baolin.wang@linux.alibaba.com> wrote:
>>
>> Add large folio mapping establishment support for finish_fault() as a preparation,
>> to support multi-size THP allocation of anonymous shmem pages in the following
>> patches.
>>
>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> ---
>>   mm/memory.c | 58 ++++++++++++++++++++++++++++++++++++++++++++---------
>>   1 file changed, 48 insertions(+), 10 deletions(-)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index eef4e482c0c2..435187ff7ea4 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4831,9 +4831,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>   {
>>          struct vm_area_struct *vma = vmf->vma;
>>          struct page *page;
>> +       struct folio *folio;
>>          vm_fault_t ret;
>>          bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>>                        !(vma->vm_flags & VM_SHARED);
>> +       int type, nr_pages, i;
>> +       unsigned long addr = vmf->address;
>>
>>          /* Did we COW the page? */
>>          if (is_cow)
>> @@ -4864,24 +4867,59 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>                          return VM_FAULT_OOM;
>>          }
>>
>> +       folio = page_folio(page);
>> +       nr_pages = folio_nr_pages(folio);
>> +
>> +       /*
>> +        * Using per-page fault to maintain the uffd semantics, and same
>> +        * approach also applies to non-anonymous-shmem faults to avoid
>> +        * inflating the RSS of the process.
>> +        */
>> +       if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
>> +               nr_pages = 1;
>> +       } else if (nr_pages > 1) {
>> +               pgoff_t idx = folio_page_idx(folio, page);
>> +               /* The page offset of vmf->address within the VMA. */
>> +               pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
>> +
>> +               /*
>> +                * Fallback to per-page fault in case the folio size in page
>> +                * cache beyond the VMA limits.
>> +                */
>> +               if (unlikely(vma_off < idx ||
>> +                            vma_off + (nr_pages - idx) > vma_pages(vma))) {
>> +                       nr_pages = 1;
>> +               } else {
>> +                       /* Now we can set mappings for the whole large folio. */
>> +                       addr = vmf->address - idx * PAGE_SIZE;
>> +                       page = &folio->page;
>> +               }
>> +       }
>> +
>>          vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>> -                                     vmf->address, &vmf->ptl);
>> +                                      addr, &vmf->ptl);
>>          if (!vmf->pte)
>>                  return VM_FAULT_NOPAGE;
>>
>>          /* Re-check under ptl */
>> -       if (likely(!vmf_pte_changed(vmf))) {
>> -               struct folio *folio = page_folio(page);
>> -               int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
>> -
>> -               set_pte_range(vmf, folio, page, 1, vmf->address);
>> -               add_mm_counter(vma->vm_mm, type, 1);
>> -               ret = 0;
>> -       } else {
>> -               update_mmu_tlb(vma, vmf->address, vmf->pte);
>> +       if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
>> +               update_mmu_tlb(vma, addr, vmf->pte);
>>                  ret = VM_FAULT_NOPAGE;
>> +               goto unlock;
>> +       } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
>> +               for (i = 0; i < nr_pages; i++)
>> +                       update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
> 
> Just a friendly reminder: Bang has added the update_mmu_tlb_range()[1] batch
> function to update TLB in batches, so we can use it instead of the
> update_mmu_tlb()
> loop.
> 
> [1] https://lore.kernel.org/linux-mm/20240522061204.117421-1-libang.li@antgroup.com/

Good point, I will use the new helper instead.
Baolin Wang June 3, 2024, 8:29 a.m. UTC | #4
On 2024/6/3 13:28, Barry Song wrote:
> On Thu, May 30, 2024 at 2:04 PM Baolin Wang
> <baolin.wang@linux.alibaba.com> wrote:
>>
>> Add large folio mapping establishment support for finish_fault() as a preparation,
>> to support multi-size THP allocation of anonymous shmem pages in the following
>> patches.
>>
>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> ---
>>   mm/memory.c | 58 ++++++++++++++++++++++++++++++++++++++++++++---------
>>   1 file changed, 48 insertions(+), 10 deletions(-)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index eef4e482c0c2..435187ff7ea4 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4831,9 +4831,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>   {
>>          struct vm_area_struct *vma = vmf->vma;
>>          struct page *page;
>> +       struct folio *folio;
>>          vm_fault_t ret;
>>          bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>>                        !(vma->vm_flags & VM_SHARED);
>> +       int type, nr_pages, i;
>> +       unsigned long addr = vmf->address;
>>
>>          /* Did we COW the page? */
>>          if (is_cow)
>> @@ -4864,24 +4867,59 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>                          return VM_FAULT_OOM;
>>          }
>>
>> +       folio = page_folio(page);
>> +       nr_pages = folio_nr_pages(folio);
>> +
>> +       /*
>> +        * Using per-page fault to maintain the uffd semantics, and same
>> +        * approach also applies to non-anonymous-shmem faults to avoid
>> +        * inflating the RSS of the process.
> 
> I don't feel the comment explains the root cause.
> For non-shmem, anyway we have allocated the memory? Avoiding inflating
> RSS seems not so useful as we have occupied the memory. the memory footprint

This is also to keep the same behavior as before for non-anon-shmem, and 
will be discussed in the future.

> is what we really care about. so we want to rely on read-ahead hints of subpage
> to determine read-ahead size? that is why we don't map nr_pages for non-shmem
> files though we can potentially reduce nr_pages - 1 page faults?

IMHO, there are 2 cases for non-anon-shmem:
(1) read mmap() faults: we can rely on the 'fault_around_bytes'
interface to determine what size of mapping to build.
(2) writable mmap() faults: I want to keep the same behavior as before
(per-page fault), but we can talk about this when I send new patches to
use mTHP to control large folio allocation for writable mmap().
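
For context, the vma_is_anon_shmem() check used in the patch is what
separates these non-anon-shmem cases from anonymous shmem: it tests whether
the VMA is a MAP_ANONYMOUS|MAP_SHARED mapping rather than a tmpfs file
mapping. Roughly, the helper in mm/shmem.c looks like the following (a
sketch; the exact vm_ops symbol it compares against may differ):

	bool vma_is_anon_shmem(struct vm_area_struct *vma)
	{
		/*
		 * Anonymous shared mappings are given their own vm_ops, so a
		 * pointer compare is enough; shmem_anon_vm_ops is assumed here.
		 */
		return vma->vm_ops == &shmem_anon_vm_ops;
	}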

>> +        */
>> +       if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
>> +               nr_pages = 1;
>> +       } else if (nr_pages > 1) {
>> +               pgoff_t idx = folio_page_idx(folio, page);
>> +               /* The page offset of vmf->address within the VMA. */
>> +               pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
>> +
>> +               /*
>> +                * Fallback to per-page fault in case the folio size in page
>> +                * cache beyond the VMA limits.
>> +                */
>> +               if (unlikely(vma_off < idx ||
>> +                            vma_off + (nr_pages - idx) > vma_pages(vma))) {
>> +                       nr_pages = 1;
>> +               } else {
>> +                       /* Now we can set mappings for the whole large folio. */
>> +                       addr = vmf->address - idx * PAGE_SIZE;
>> +                       page = &folio->page;
>> +               }
>> +       }
>> +
>>          vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>> -                                     vmf->address, &vmf->ptl);
>> +                                      addr, &vmf->ptl);
>>          if (!vmf->pte)
>>                  return VM_FAULT_NOPAGE;
>>
>>          /* Re-check under ptl */
>> -       if (likely(!vmf_pte_changed(vmf))) {
>> -               struct folio *folio = page_folio(page);
>> -               int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
>> -
>> -               set_pte_range(vmf, folio, page, 1, vmf->address);
>> -               add_mm_counter(vma->vm_mm, type, 1);
>> -               ret = 0;
>> -       } else {
>> -               update_mmu_tlb(vma, vmf->address, vmf->pte);
>> +       if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
>> +               update_mmu_tlb(vma, addr, vmf->pte);
>>                  ret = VM_FAULT_NOPAGE;
>> +               goto unlock;
>> +       } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
> 
> In what case we can't use !pte_range_none(vmf->pte, 1) for nr_pages == 1
> then unify the code for nr_pages==1 and nr_pages > 1?
> 
> It seems this has been discussed before, but I forget the reason.

IIUC, this is for the uffd case, where the pte is not a none entry.
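
For reference, the two checks being compared look roughly like this in
mm/memory.c (a sketch of the helpers around this series; details such as
the exact ptep_get variant may differ). A uffd-wp pte marker is not
pte_none(), so pte_range_none() would treat it as populated, while
vmf_pte_changed() only reports a change relative to vmf->orig_pte:

	static bool vmf_pte_changed(struct vm_fault *vmf)
	{
		/* With a valid orig_pte, "changed" means different from it. */
		if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
			return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);

		/* Otherwise any populated pte counts as changed. */
		return !pte_none(ptep_get(vmf->pte));
	}

	static inline bool pte_range_none(pte_t *pte, int nr_pages)
	{
		int i;

		/* True only if every pte in the range is still none. */
		for (i = 0; i < nr_pages; i++) {
			if (!pte_none(ptep_get_lockless(pte + i)))
				return false;
		}

		return true;
	}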
Barry Song June 3, 2024, 8:58 a.m. UTC | #5
On Mon, Jun 3, 2024 at 8:29 PM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
>
>
> On 2024/6/3 13:28, Barry Song wrote:
> > On Thu, May 30, 2024 at 2:04 PM Baolin Wang
> > <baolin.wang@linux.alibaba.com> wrote:
> >>
> >> Add large folio mapping establishment support for finish_fault() as a preparation,
> >> to support multi-size THP allocation of anonymous shmem pages in the following
> >> patches.
> >>
> >> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> >> ---
> >>   mm/memory.c | 58 ++++++++++++++++++++++++++++++++++++++++++++---------
> >>   1 file changed, 48 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/mm/memory.c b/mm/memory.c
> >> index eef4e482c0c2..435187ff7ea4 100644
> >> --- a/mm/memory.c
> >> +++ b/mm/memory.c
> >> @@ -4831,9 +4831,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
> >>   {
> >>          struct vm_area_struct *vma = vmf->vma;
> >>          struct page *page;
> >> +       struct folio *folio;
> >>          vm_fault_t ret;
> >>          bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
> >>                        !(vma->vm_flags & VM_SHARED);
> >> +       int type, nr_pages, i;
> >> +       unsigned long addr = vmf->address;
> >>
> >>          /* Did we COW the page? */
> >>          if (is_cow)
> >> @@ -4864,24 +4867,59 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
> >>                          return VM_FAULT_OOM;
> >>          }
> >>
> >> +       folio = page_folio(page);
> >> +       nr_pages = folio_nr_pages(folio);
> >> +
> >> +       /*
> >> +        * Using per-page fault to maintain the uffd semantics, and same
> >> +        * approach also applies to non-anonymous-shmem faults to avoid
> >> +        * inflating the RSS of the process.
> >
> > I don't feel the comment explains the root cause.
> > For non-shmem, anyway we have allocated the memory? Avoiding inflating
> > RSS seems not so useful as we have occupied the memory. the memory footprint
>
> This is also to keep the same behavior as before for non-anon-shmem, and
> will be discussed in the future.

OK.

>
> > is what we really care about. so we want to rely on read-ahead hints of subpage
> > to determine read-ahead size? that is why we don't map nr_pages for non-shmem
> > files though we can potentially reduce nr_pages - 1 page faults?
>
> IMHO, there is 2 cases for non-anon-shmem:
> (1) read mmap() faults: we can rely on the 'fault_around_bytes'
> interface to determin what size of mapping to build.
> (2) writable mmap() faults: I want to keep the same behavior as before
> (per-page fault), but we can talk about this when I send new patches to
> use mTHP to control large folio allocation for writable mmap().

OK.

>
> >> +        */
> >> +       if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
> >> +               nr_pages = 1;
> >> +       } else if (nr_pages > 1) {
> >> +               pgoff_t idx = folio_page_idx(folio, page);
> >> +               /* The page offset of vmf->address within the VMA. */
> >> +               pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
> >> +
> >> +               /*
> >> +                * Fallback to per-page fault in case the folio size in page
> >> +                * cache beyond the VMA limits.
> >> +                */
> >> +               if (unlikely(vma_off < idx ||
> >> +                            vma_off + (nr_pages - idx) > vma_pages(vma))) {
> >> +                       nr_pages = 1;
> >> +               } else {
> >> +                       /* Now we can set mappings for the whole large folio. */
> >> +                       addr = vmf->address - idx * PAGE_SIZE;
> >> +                       page = &folio->page;
> >> +               }
> >> +       }
> >> +
> >>          vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> >> -                                     vmf->address, &vmf->ptl);
> >> +                                      addr, &vmf->ptl);
> >>          if (!vmf->pte)
> >>                  return VM_FAULT_NOPAGE;
> >>
> >>          /* Re-check under ptl */
> >> -       if (likely(!vmf_pte_changed(vmf))) {
> >> -               struct folio *folio = page_folio(page);
> >> -               int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
> >> -
> >> -               set_pte_range(vmf, folio, page, 1, vmf->address);
> >> -               add_mm_counter(vma->vm_mm, type, 1);
> >> -               ret = 0;
> >> -       } else {
> >> -               update_mmu_tlb(vma, vmf->address, vmf->pte);
> >> +       if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
> >> +               update_mmu_tlb(vma, addr, vmf->pte);
> >>                  ret = VM_FAULT_NOPAGE;
> >> +               goto unlock;
> >> +       } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
> >
> > In what case we can't use !pte_range_none(vmf->pte, 1) for nr_pages == 1
> > then unify the code for nr_pages==1 and nr_pages > 1?
> >
> > It seems this has been discussed before, but I forget the reason.
>
> IIUC, this is for uffd case, which is not a none pte entry.

Is it possible to have a COW case for shmem? For example, if someone maps
a shmem file as read-only and then writes to it, would that prevent the use
of pte_range_none?

Furthermore, if we encounter a large folio in shmem while reading, does it
necessarily mean we can map the entire folio? Is it possible for some
processes to only map part of a large folio? For instance, if process A
allocates large folios and process B maps only part of this shmem file, or
partially unmaps a large folio, how would that be handled?

Apologies for not debugging this thoroughly, but these two corner cases
seem worth considering. If these scenarios have already been addressed,
please disregard my comments.

Thanks
Barry
Barry Song June 3, 2024, 9:01 a.m. UTC | #6
On Mon, Jun 3, 2024 at 8:58 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Mon, Jun 3, 2024 at 8:29 PM Baolin Wang
> <baolin.wang@linux.alibaba.com> wrote:
> >
> >
> >
> > On 2024/6/3 13:28, Barry Song wrote:
> > > On Thu, May 30, 2024 at 2:04 PM Baolin Wang
> > > <baolin.wang@linux.alibaba.com> wrote:
> > >>
> > >> Add large folio mapping establishment support for finish_fault() as a preparation,
> > >> to support multi-size THP allocation of anonymous shmem pages in the following
> > >> patches.
> > >>
> > >> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > >> ---
> > >>   mm/memory.c | 58 ++++++++++++++++++++++++++++++++++++++++++++---------
> > >>   1 file changed, 48 insertions(+), 10 deletions(-)
> > >>
> > >> diff --git a/mm/memory.c b/mm/memory.c
> > >> index eef4e482c0c2..435187ff7ea4 100644
> > >> --- a/mm/memory.c
> > >> +++ b/mm/memory.c
> > >> @@ -4831,9 +4831,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
> > >>   {
> > >>          struct vm_area_struct *vma = vmf->vma;
> > >>          struct page *page;
> > >> +       struct folio *folio;
> > >>          vm_fault_t ret;
> > >>          bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
> > >>                        !(vma->vm_flags & VM_SHARED);
> > >> +       int type, nr_pages, i;
> > >> +       unsigned long addr = vmf->address;
> > >>
> > >>          /* Did we COW the page? */
> > >>          if (is_cow)
> > >> @@ -4864,24 +4867,59 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
> > >>                          return VM_FAULT_OOM;
> > >>          }
> > >>
> > >> +       folio = page_folio(page);
> > >> +       nr_pages = folio_nr_pages(folio);
> > >> +
> > >> +       /*
> > >> +        * Using per-page fault to maintain the uffd semantics, and same
> > >> +        * approach also applies to non-anonymous-shmem faults to avoid
> > >> +        * inflating the RSS of the process.
> > >
> > > I don't feel the comment explains the root cause.
> > > For non-shmem, anyway we have allocated the memory? Avoiding inflating
> > > RSS seems not so useful as we have occupied the memory. the memory footprint
> >
> > This is also to keep the same behavior as before for non-anon-shmem, and
> > will be discussed in the future.
>
> OK.
>
> >
> > > is what we really care about. so we want to rely on read-ahead hints of subpage
> > > to determine read-ahead size? that is why we don't map nr_pages for non-shmem
> > > files though we can potentially reduce nr_pages - 1 page faults?
> >
> > IMHO, there is 2 cases for non-anon-shmem:
> > (1) read mmap() faults: we can rely on the 'fault_around_bytes'
> > interface to determin what size of mapping to build.
> > (2) writable mmap() faults: I want to keep the same behavior as before
> > (per-page fault), but we can talk about this when I send new patches to
> > use mTHP to control large folio allocation for writable mmap().
>
> OK.
>
> >
> > >> +        */
> > >> +       if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
> > >> +               nr_pages = 1;
> > >> +       } else if (nr_pages > 1) {
> > >> +               pgoff_t idx = folio_page_idx(folio, page);
> > >> +               /* The page offset of vmf->address within the VMA. */
> > >> +               pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
> > >> +
> > >> +               /*
> > >> +                * Fallback to per-page fault in case the folio size in page
> > >> +                * cache beyond the VMA limits.
> > >> +                */
> > >> +               if (unlikely(vma_off < idx ||
> > >> +                            vma_off + (nr_pages - idx) > vma_pages(vma))) {
> > >> +                       nr_pages = 1;
> > >> +               } else {
> > >> +                       /* Now we can set mappings for the whole large folio. */
> > >> +                       addr = vmf->address - idx * PAGE_SIZE;
> > >> +                       page = &folio->page;
> > >> +               }
> > >> +       }
> > >> +
> > >>          vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> > >> -                                     vmf->address, &vmf->ptl);
> > >> +                                      addr, &vmf->ptl);
> > >>          if (!vmf->pte)
> > >>                  return VM_FAULT_NOPAGE;
> > >>
> > >>          /* Re-check under ptl */
> > >> -       if (likely(!vmf_pte_changed(vmf))) {
> > >> -               struct folio *folio = page_folio(page);
> > >> -               int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
> > >> -
> > >> -               set_pte_range(vmf, folio, page, 1, vmf->address);
> > >> -               add_mm_counter(vma->vm_mm, type, 1);
> > >> -               ret = 0;
> > >> -       } else {
> > >> -               update_mmu_tlb(vma, vmf->address, vmf->pte);
> > >> +       if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
> > >> +               update_mmu_tlb(vma, addr, vmf->pte);
> > >>                  ret = VM_FAULT_NOPAGE;
> > >> +               goto unlock;
> > >> +       } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
> > >
> > > In what case we can't use !pte_range_none(vmf->pte, 1) for nr_pages == 1
> > > then unify the code for nr_pages==1 and nr_pages > 1?
> > >
> > > It seems this has been discussed before, but I forget the reason.
> >
> > IIUC, this is for uffd case, which is not a none pte entry.
>
> Is it possible to have a COW case for shmem? For example, if someone
> maps a shmem
> file as read-only and then writes to it, would that prevent the use of
> pte_range_none?

Sorry, I mean PRIVATE, not READ-ONLY.

>
> Furthermore, if we encounter a large folio in shmem while reading,
> does it necessarily
> mean we can map the entire folio? Is it possible for some processes to
> only map part
> of large folios? For instance, if process A allocates large folios and
> process B maps
> only part of this shmem file or partially unmaps a large folio, how
> would that be handled?
>
> Apologies for not debugging this thoroughly, but these two corner
> cases seem worth
> considering. If these scenarios have already been addressed, please disregard my
> comments.
>
> Thanks
> Barry
Baolin Wang June 3, 2024, 9:37 a.m. UTC | #7
On 2024/6/3 17:01, Barry Song wrote:
> On Mon, Jun 3, 2024 at 8:58 PM Barry Song <21cnbao@gmail.com> wrote:
>>
>> On Mon, Jun 3, 2024 at 8:29 PM Baolin Wang
>> <baolin.wang@linux.alibaba.com> wrote:
>>>
>>>
>>>
>>> On 2024/6/3 13:28, Barry Song wrote:
>>>> On Thu, May 30, 2024 at 2:04 PM Baolin Wang
>>>> <baolin.wang@linux.alibaba.com> wrote:
>>>>>
>>>>> Add large folio mapping establishment support for finish_fault() as a preparation,
>>>>> to support multi-size THP allocation of anonymous shmem pages in the following
>>>>> patches.
>>>>>
>>>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>>> ---
>>>>>    mm/memory.c | 58 ++++++++++++++++++++++++++++++++++++++++++++---------
>>>>>    1 file changed, 48 insertions(+), 10 deletions(-)
>>>>>
>>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>>> index eef4e482c0c2..435187ff7ea4 100644
>>>>> --- a/mm/memory.c
>>>>> +++ b/mm/memory.c
>>>>> @@ -4831,9 +4831,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>>    {
>>>>>           struct vm_area_struct *vma = vmf->vma;
>>>>>           struct page *page;
>>>>> +       struct folio *folio;
>>>>>           vm_fault_t ret;
>>>>>           bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>>>>>                         !(vma->vm_flags & VM_SHARED);
>>>>> +       int type, nr_pages, i;
>>>>> +       unsigned long addr = vmf->address;
>>>>>
>>>>>           /* Did we COW the page? */
>>>>>           if (is_cow)
>>>>> @@ -4864,24 +4867,59 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>>                           return VM_FAULT_OOM;
>>>>>           }
>>>>>
>>>>> +       folio = page_folio(page);
>>>>> +       nr_pages = folio_nr_pages(folio);
>>>>> +
>>>>> +       /*
>>>>> +        * Using per-page fault to maintain the uffd semantics, and same
>>>>> +        * approach also applies to non-anonymous-shmem faults to avoid
>>>>> +        * inflating the RSS of the process.
>>>>
>>>> I don't feel the comment explains the root cause.
>>>> For non-shmem, anyway we have allocated the memory? Avoiding inflating
>>>> RSS seems not so useful as we have occupied the memory. the memory footprint
>>>
>>> This is also to keep the same behavior as before for non-anon-shmem, and
>>> will be discussed in the future.
>>
>> OK.
>>
>>>
>>>> is what we really care about. so we want to rely on read-ahead hints of subpage
>>>> to determine read-ahead size? that is why we don't map nr_pages for non-shmem
>>>> files though we can potentially reduce nr_pages - 1 page faults?
>>>
>>> IMHO, there is 2 cases for non-anon-shmem:
>>> (1) read mmap() faults: we can rely on the 'fault_around_bytes'
>>> interface to determin what size of mapping to build.
>>> (2) writable mmap() faults: I want to keep the same behavior as before
>>> (per-page fault), but we can talk about this when I send new patches to
>>> use mTHP to control large folio allocation for writable mmap().
>>
>> OK.
>>
>>>
>>>>> +        */
>>>>> +       if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
>>>>> +               nr_pages = 1;
>>>>> +       } else if (nr_pages > 1) {
>>>>> +               pgoff_t idx = folio_page_idx(folio, page);
>>>>> +               /* The page offset of vmf->address within the VMA. */
>>>>> +               pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
>>>>> +
>>>>> +               /*
>>>>> +                * Fallback to per-page fault in case the folio size in page
>>>>> +                * cache beyond the VMA limits.
>>>>> +                */
>>>>> +               if (unlikely(vma_off < idx ||
>>>>> +                            vma_off + (nr_pages - idx) > vma_pages(vma))) {
>>>>> +                       nr_pages = 1;
>>>>> +               } else {
>>>>> +                       /* Now we can set mappings for the whole large folio. */
>>>>> +                       addr = vmf->address - idx * PAGE_SIZE;
>>>>> +                       page = &folio->page;
>>>>> +               }
>>>>> +       }
>>>>> +
>>>>>           vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>>>>> -                                     vmf->address, &vmf->ptl);
>>>>> +                                      addr, &vmf->ptl);
>>>>>           if (!vmf->pte)
>>>>>                   return VM_FAULT_NOPAGE;
>>>>>
>>>>>           /* Re-check under ptl */
>>>>> -       if (likely(!vmf_pte_changed(vmf))) {
>>>>> -               struct folio *folio = page_folio(page);
>>>>> -               int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
>>>>> -
>>>>> -               set_pte_range(vmf, folio, page, 1, vmf->address);
>>>>> -               add_mm_counter(vma->vm_mm, type, 1);
>>>>> -               ret = 0;
>>>>> -       } else {
>>>>> -               update_mmu_tlb(vma, vmf->address, vmf->pte);
>>>>> +       if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
>>>>> +               update_mmu_tlb(vma, addr, vmf->pte);
>>>>>                   ret = VM_FAULT_NOPAGE;
>>>>> +               goto unlock;
>>>>> +       } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
>>>>
>>>> In what case we can't use !pte_range_none(vmf->pte, 1) for nr_pages == 1
>>>> then unify the code for nr_pages==1 and nr_pages > 1?
>>>>
>>>> It seems this has been discussed before, but I forget the reason.
>>>
>>> IIUC, this is for uffd case, which is not a none pte entry.
>>
>> Is it possible to have a COW case for shmem? For example, if someone
>> maps a shmem
>> file as read-only and then writes to it, would that prevent the use of
>> pte_range_none?
> 
> sorry, i mean PRIVATE but not READ-ONLY.

Yes, I think so. The CoW case still uses per-page fault in do_cow_fault().

>> Furthermore, if we encounter a large folio in shmem while reading,
>> does it necessarily
>> mean we can map the entire folio? Is it possible for some processes to

Now this will depend on the 'fault_around_bytes' interface.

>> only map part
>> of large folios? For instance, if process A allocates large folios and
>> process B maps
>> only part of this shmem file or partially unmaps a large folio, how
>> would that be handled?

This is certainly possible.

For tmpfs:
(1) If 'fault_around_bytes' is enabled, filemap_map_pages() will handle
the partial mapping of the large folio for process B.

(2) If 'fault_around_bytes' is set to 0, finish_fault() will fall back to
per-page fault.

For anonymous shmem, process B should be the child of process A in your
case, then:
(1) If 'fault_around_bytes' is enabled, the behavior is the same as for tmpfs.

(2) If 'fault_around_bytes' is set to 0, finish_fault() will build the
whole large folio mapping for process B. Since process B copies the same
shared VMA from parent process A, they share the mTHP mapping.
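
The VMA-bounds check in the hunk above is what catches the case where the
large folio does not fully fit inside process B's VMA. Modelled outside the
kernel with made-up numbers (an illustrative stand-alone sketch, not kernel
code):

	#include <stdbool.h>
	#include <stdio.h>

	/*
	 * Map the whole large folio only when it fits inside the VMA around
	 * the faulting address: vma_off is the VMA page offset of the fault,
	 * idx is the faulting page's index within the folio.
	 */
	static bool map_whole_folio(unsigned long vma_off, unsigned long idx,
				    unsigned long nr_pages, unsigned long vma_pages)
	{
		return !(vma_off < idx || vma_off + (nr_pages - idx) > vma_pages);
	}

	int main(void)
	{
		/* A 16-page (64K) folio, fault at folio page 4, 256-page VMA. */
		printf("%d\n", map_whole_folio(4, 4, 16, 256));   /* 1: folio fits */
		printf("%d\n", map_whole_folio(2, 4, 16, 256));   /* 0: folio starts before the VMA */
		printf("%d\n", map_whole_folio(250, 0, 16, 256)); /* 0: folio runs past the VMA end */
		return 0;
	}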

>> Apologies for not debugging this thoroughly, but these two corner
>> cases seem worth
>> considering. If these scenarios have already been addressed, please disregard my
>> comments.

No worries:) Thanks for your valuable input.

Patch

diff --git a/mm/memory.c b/mm/memory.c
index eef4e482c0c2..435187ff7ea4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4831,9 +4831,12 @@  vm_fault_t finish_fault(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
+	struct folio *folio;
 	vm_fault_t ret;
 	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
 		      !(vma->vm_flags & VM_SHARED);
+	int type, nr_pages, i;
+	unsigned long addr = vmf->address;
 
 	/* Did we COW the page? */
 	if (is_cow)
@@ -4864,24 +4867,59 @@  vm_fault_t finish_fault(struct vm_fault *vmf)
 			return VM_FAULT_OOM;
 	}
 
+	folio = page_folio(page);
+	nr_pages = folio_nr_pages(folio);
+
+	/*
+	 * Using per-page fault to maintain the uffd semantics, and same
+	 * approach also applies to non-anonymous-shmem faults to avoid
+	 * inflating the RSS of the process.
+	 */
+	if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
+		nr_pages = 1;
+	} else if (nr_pages > 1) {
+		pgoff_t idx = folio_page_idx(folio, page);
+		/* The page offset of vmf->address within the VMA. */
+		pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
+
+		/*
+		 * Fallback to per-page fault in case the folio size in page
+		 * cache beyond the VMA limits.
+		 */
+		if (unlikely(vma_off < idx ||
+			     vma_off + (nr_pages - idx) > vma_pages(vma))) {
+			nr_pages = 1;
+		} else {
+			/* Now we can set mappings for the whole large folio. */
+			addr = vmf->address - idx * PAGE_SIZE;
+			page = &folio->page;
+		}
+	}
+
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-				      vmf->address, &vmf->ptl);
+				       addr, &vmf->ptl);
 	if (!vmf->pte)
 		return VM_FAULT_NOPAGE;
 
 	/* Re-check under ptl */
-	if (likely(!vmf_pte_changed(vmf))) {
-		struct folio *folio = page_folio(page);
-		int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
-
-		set_pte_range(vmf, folio, page, 1, vmf->address);
-		add_mm_counter(vma->vm_mm, type, 1);
-		ret = 0;
-	} else {
-		update_mmu_tlb(vma, vmf->address, vmf->pte);
+	if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
+		update_mmu_tlb(vma, addr, vmf->pte);
 		ret = VM_FAULT_NOPAGE;
+		goto unlock;
+	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
+		for (i = 0; i < nr_pages; i++)
+			update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
+		ret = VM_FAULT_NOPAGE;
+		goto unlock;
 	}
 
+	folio_ref_add(folio, nr_pages - 1);
+	set_pte_range(vmf, folio, page, nr_pages, addr);
+	type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
+	add_mm_counter(vma->vm_mm, type, nr_pages);
+	ret = 0;
+
+unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return ret;
 }