
[v4,03/10] userfaultfd/shmem: support UFFDIO_CONTINUE for shmem

Message ID 20210420220804.486803-4-axelrasmussen@google.com
State New
Series userfaultfd: add minor fault handling for shmem

Commit Message

Axel Rasmussen April 20, 2021, 10:07 p.m. UTC
With this change, userspace can resolve a minor fault within a
shmem-backed area with a UFFDIO_CONTINUE ioctl. The semantics for this
match those for hugetlbfs - we look up the existing page in the page
cache, and install a PTE for it.
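
For illustration only (not part of this patch): a minimal sketch of how a
userspace fault handler might resolve such a minor fault with UFFDIO_CONTINUE,
assuming the region was registered with UFFDIO_REGISTER_MODE_MINOR and the
page contents are already present in the page cache (e.g. written through a
second, non-registered mapping of the same shmem file). The helper name below
is made up for the example.

#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* Hypothetical helper run from the fault-handler thread. */
static int resolve_minor_fault(int uffd, unsigned long fault_addr,
			       unsigned long page_size)
{
	struct uffdio_continue cont = {
		.range = {
			.start = fault_addr & ~(page_size - 1),
			.len   = page_size,
		},
		.mode = 0,	/* or UFFDIO_CONTINUE_MODE_DONTWAKE */
	};

	/* Ask the kernel to install a PTE for the existing page-cache page. */
	if (ioctl(uffd, UFFDIO_CONTINUE, &cont))
		return -1;	/* errno is EEXIST if a PTE is already installed */
	return 0;		/* cont.mapped reports how many bytes were mapped */
}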

This commit introduces a new helper: mcopy_atomic_install_pte.

Why handle UFFDIO_CONTINUE for shmem in mm/userfaultfd.c, instead of in
shmem.c? The existing userfault implementation only relies on shmem.c
for VM_SHARED VMAs. However, minor fault handling / CONTINUE work just
fine for !VM_SHARED VMAs as well. We'd prefer to handle CONTINUE for
shmem in one place, regardless of shared/private (to reduce code
duplication).

Why add a new mcopy_atomic_install_pte helper? A problem we have with
CONTINUE is that shmem_mcopy_atomic_pte() and mcopy_atomic_pte() are
*close* to what we want, but not exactly. We do want to set up the PTEs
in a CONTINUE operation, but we don't want to e.g. allocate a new page,
charge it (e.g. to the shmem inode), manipulate various flags, etc. We
also have the problem stated above: shmem_mcopy_atomic_pte() and
mcopy_atomic_pte() each handle one half of the problem (shared /
private) that CONTINUE cares about. So, introduce mcontinue_atomic_pte()
to handle all of the shmem CONTINUE cases, and introduce the
mcopy_atomic_install_pte() helper so it doesn't duplicate code with
mcopy_atomic_pte().

In a future commit, shmem_mcopy_atomic_pte() will also be modified to
use this new helper. However, since that is a bigger refactor, it seems
clearest to do it as a separate change.

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
---
 mm/userfaultfd.c | 172 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 127 insertions(+), 45 deletions(-)

Comments

Axel Rasmussen April 22, 2021, 8:22 p.m. UTC | #1
On Tue, Apr 20, 2021 at 3:08 PM Axel Rasmussen <axelrasmussen@google.com> wrote:
>
> With this change, userspace can resolve a minor fault within a
> shmem-backed area with a UFFDIO_CONTINUE ioctl. The semantics for this
> match those for hugetlbfs - we look up the existing page in the page
> cache, and install a PTE for it.
>
> This commit introduces a new helper: mcopy_atomic_install_pte.
>
> Why handle UFFDIO_CONTINUE for shmem in mm/userfaultfd.c, instead of in
> shmem.c? The existing userfault implementation only relies on shmem.c
> for VM_SHARED VMAs. However, minor fault handling / CONTINUE work just
> fine for !VM_SHARED VMAs as well. We'd prefer to handle CONTINUE for
> shmem in one place, regardless of shared/private (to reduce code
> duplication).
>
> Why add a new mcopy_atomic_install_pte helper? A problem we have with
> continue is that shmem_mcopy_atomic_pte() and mcopy_atomic_pte() are
> *close* to what we want, but not exactly. We do want to setup the PTEs
> in a CONTINUE operation, but we don't want to e.g. allocate a new page,
> charge it (e.g. to the shmem inode), manipulate various flags, etc. Also
> we have the problem stated above: shmem_mcopy_atomic_pte() and
> mcopy_atomic_pte() both handle one-half of the problem (shared /
> private) continue cares about. So, introduce mcontinue_atomic_pte(), to
> handle all of the shmem continue cases. Introduce the helper so it
> doesn't duplicate code with mcopy_atomic_pte().
>
> In a future commit, shmem_mcopy_atomic_pte() will also be modified to
> use this new helper. However, since this is a bigger refactor, it seems
> most clear to do it as a separate change.
>
> Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
> ---
>  mm/userfaultfd.c | 172 ++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 127 insertions(+), 45 deletions(-)
>
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 23fa2583bbd1..51d8c0127161 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -48,6 +48,83 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
>         return dst_vma;
>  }
>
> +/*
> + * Install PTEs, to map dst_addr (within dst_vma) to page.
> + *
> + * This function handles MCOPY_ATOMIC_CONTINUE (which is always file-backed),
> + * whether or not dst_vma is VM_SHARED. It also handles the more general
> + * MCOPY_ATOMIC_NORMAL case, when dst_vma is *not* VM_SHARED (it may be file
> + * backed, or not).
> + *
> + * Note that MCOPY_ATOMIC_NORMAL for a VM_SHARED dst_vma is handled by
> + * shmem_mcopy_atomic_pte instead.
> + */
> +static int mcopy_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> +                                   struct vm_area_struct *dst_vma,
> +                                   unsigned long dst_addr, struct page *page,
> +                                   bool newly_allocated, bool wp_copy)
> +{
> +       int ret;
> +       pte_t _dst_pte, *dst_pte;
> +       bool writable = dst_vma->vm_flags & VM_WRITE;
> +       bool vm_shared = dst_vma->vm_flags & VM_SHARED;
> +       bool page_in_cache = page->mapping;
> +       spinlock_t *ptl;
> +       struct inode *inode;
> +       pgoff_t offset, max_off;
> +
> +       _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
> +       if (page_in_cache && !vm_shared)
> +               writable = false;
> +       if (writable || !page_in_cache)
> +               _dst_pte = pte_mkdirty(_dst_pte);
> +       if (writable) {
> +               if (wp_copy)
> +                       _dst_pte = pte_mkuffd_wp(_dst_pte);
> +               else
> +                       _dst_pte = pte_mkwrite(_dst_pte);
> +       }
> +
> +       dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
> +
> +       if (vma_is_shmem(dst_vma)) {
> +               /* serialize against truncate with the page table lock */
> +               inode = dst_vma->vm_file->f_inode;
> +               offset = linear_page_index(dst_vma, dst_addr);
> +               max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> +               ret = -EFAULT;
> +               if (unlikely(offset >= max_off))
> +                       goto out_unlock;
> +       }
> +
> +       ret = -EEXIST;
> +       if (!pte_none(*dst_pte))
> +               goto out_unlock;
> +
> +       if (page_in_cache)
> +               page_add_file_rmap(page, false);
> +       else
> +               page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
> +
> +       /*
> +        * Must happen after rmap, as mm_counter() checks mapping (via
> +        * PageAnon()), which is set by __page_set_anon_rmap().
> +        */
> +       inc_mm_counter(dst_mm, mm_counter(page));

Actually, I've noticed that this is still slightly incorrect.

As Hugh pointed out, this works for the anon case, because
page_add_new_anon_rmap() sets page->mapping.

But for the page_in_cache case, it doesn't work: unlike its anon
counterpart, page_add_file_rmap() *does not* set page->mapping. So, I
think this line needs to become:

inc_mm_counter(dst_mm, page_in_cache ? mm_counter_file(page) :
                       mm_counter(page));

I'll include this fix in a v5 as well.
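
(For reference, the relevant counter helpers look roughly like this in
include/linux/mm.h around this time - paraphrased here, not part of the
patch:)

static inline int mm_counter_file(struct page *page)
{
	if (PageSwapBacked(page))
		return MM_SHMEMPAGES;
	return MM_FILEPAGES;
}

static inline int mm_counter(struct page *page)
{
	/* PageAnon() checks the PAGE_MAPPING_ANON bit in page->mapping */
	if (PageAnon(page))
		return MM_ANONPAGES;
	return mm_counter_file(page);
}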

> +
> +       if (newly_allocated)
> +               lru_cache_add_inactive_or_unevictable(page, dst_vma);
> +
> +       set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> +
> +       /* No need to invalidate - it was non-present before */
> +       update_mmu_cache(dst_vma, dst_addr, dst_pte);
> +       ret = 0;
> +out_unlock:
> +       pte_unmap_unlock(dst_pte, ptl);
> +       return ret;
> +}
> +
>  static int mcopy_atomic_pte(struct mm_struct *dst_mm,
>                             pmd_t *dst_pmd,
>                             struct vm_area_struct *dst_vma,
> @@ -56,13 +133,9 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
>                             struct page **pagep,
>                             bool wp_copy)
>  {
> -       pte_t _dst_pte, *dst_pte;
> -       spinlock_t *ptl;
>         void *page_kaddr;
>         int ret;
>         struct page *page;
> -       pgoff_t offset, max_off;
> -       struct inode *inode;
>
>         if (!*pagep) {
>                 ret = -ENOMEM;
> @@ -99,43 +172,12 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
>         if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
>                 goto out_release;
>
> -       _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
> -       if (dst_vma->vm_flags & VM_WRITE) {
> -               if (wp_copy)
> -                       _dst_pte = pte_mkuffd_wp(_dst_pte);
> -               else
> -                       _dst_pte = pte_mkwrite(_dst_pte);
> -       }
> -
> -       dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
> -       if (dst_vma->vm_file) {
> -               /* the shmem MAP_PRIVATE case requires checking the i_size */
> -               inode = dst_vma->vm_file->f_inode;
> -               offset = linear_page_index(dst_vma, dst_addr);
> -               max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> -               ret = -EFAULT;
> -               if (unlikely(offset >= max_off))
> -                       goto out_release_uncharge_unlock;
> -       }
> -       ret = -EEXIST;
> -       if (!pte_none(*dst_pte))
> -               goto out_release_uncharge_unlock;
> -
> -       inc_mm_counter(dst_mm, MM_ANONPAGES);
> -       page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
> -       lru_cache_add_inactive_or_unevictable(page, dst_vma);
> -
> -       set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> -
> -       /* No need to invalidate - it was non-present before */
> -       update_mmu_cache(dst_vma, dst_addr, dst_pte);
> -
> -       pte_unmap_unlock(dst_pte, ptl);
> -       ret = 0;
> +       ret = mcopy_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> +                                      page, true, wp_copy);
> +       if (ret)
> +               goto out_release;
>  out:
>         return ret;
> -out_release_uncharge_unlock:
> -       pte_unmap_unlock(dst_pte, ptl);
>  out_release:
>         put_page(page);
>         goto out;
> @@ -176,6 +218,41 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
>         return ret;
>  }
>
> +/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
> +static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
> +                               pmd_t *dst_pmd,
> +                               struct vm_area_struct *dst_vma,
> +                               unsigned long dst_addr,
> +                               bool wp_copy)
> +{
> +       struct inode *inode = file_inode(dst_vma->vm_file);
> +       pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
> +       struct page *page;
> +       int ret;
> +
> +       ret = shmem_getpage(inode, pgoff, &page, SGP_READ);
> +       if (ret)
> +               goto out;
> +       if (!page) {
> +               ret = -EFAULT;
> +               goto out;
> +       }
> +
> +       ret = mcopy_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> +                                      page, false, wp_copy);
> +       if (ret)
> +               goto out_release;
> +
> +       unlock_page(page);
> +       ret = 0;
> +out:
> +       return ret;
> +out_release:
> +       unlock_page(page);
> +       put_page(page);
> +       goto out;
> +}
> +
>  static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
>  {
>         pgd_t *pgd;
> @@ -415,11 +492,16 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
>                                                 unsigned long dst_addr,
>                                                 unsigned long src_addr,
>                                                 struct page **page,
> -                                               bool zeropage,
> +                                               enum mcopy_atomic_mode mode,
>                                                 bool wp_copy)
>  {
>         ssize_t err;
>
> +       if (mode == MCOPY_ATOMIC_CONTINUE) {
> +               return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> +                                           wp_copy);
> +       }
> +
>         /*
>          * The normal page fault path for a shmem will invoke the
>          * fault, fill the hole in the file and COW it right away. The
> @@ -431,7 +513,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
>          * and not in the radix tree.
>          */
>         if (!(dst_vma->vm_flags & VM_SHARED)) {
> -               if (!zeropage)
> +               if (mode == MCOPY_ATOMIC_NORMAL)
>                         err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
>                                                dst_addr, src_addr, page,
>                                                wp_copy);
> @@ -441,7 +523,8 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
>         } else {
>                 VM_WARN_ON_ONCE(wp_copy);
>                 err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
> -                                            dst_addr, src_addr, zeropage,
> +                                            dst_addr, src_addr,
> +                                            mode != MCOPY_ATOMIC_NORMAL,
>                                              page);
>         }
>
> @@ -463,7 +546,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
>         long copied;
>         struct page *page;
>         bool wp_copy;
> -       bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
>
>         /*
>          * Sanitize the command parameters:
> @@ -526,7 +608,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
>
>         if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
>                 goto out_unlock;
> -       if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
> +       if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
>                 goto out_unlock;
>
>         /*
> @@ -574,7 +656,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
>                 BUG_ON(pmd_trans_huge(*dst_pmd));
>
>                 err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> -                                      src_addr, &page, zeropage, wp_copy);
> +                                      src_addr, &page, mcopy_mode, wp_copy);
>                 cond_resched();
>
>                 if (unlikely(err == -ENOENT)) {
> --
> 2.31.1.368.gbe11c130af-goog
>
Peter Xu April 22, 2021, 9:18 p.m. UTC | #2
Axel,

On Thu, Apr 22, 2021 at 01:22:02PM -0700, Axel Rasmussen wrote:
> > +       if (page_in_cache)
> > +               page_add_file_rmap(page, false);
> > +       else
> > +               page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
> > +
> > +       /*
> > +        * Must happen after rmap, as mm_counter() checks mapping (via
> > +        * PageAnon()), which is set by __page_set_anon_rmap().
> > +        */
> > +       inc_mm_counter(dst_mm, mm_counter(page));
> 
> Actually, I've noticed that this is still slightly incorrect.
> 
> As Hugh pointed out, this works for the anon case, because
> page_add_new_anon_rmap() sets page->mapping.
> 
> But for the page_in_cache case, it doesn't work: unlike its anon
> counterpart, page_add_file_rmap() *does not* set page->mapping.

If it's already in the page cache, shouldn't it be set already in e.g. one
previous call to shmem_add_to_page_cache()?  Thanks,
Axel Rasmussen April 22, 2021, 10:05 p.m. UTC | #3
On Thu, Apr 22, 2021 at 2:18 PM Peter Xu <peterx@redhat.com> wrote:
>
> Axel,
>
> On Thu, Apr 22, 2021 at 01:22:02PM -0700, Axel Rasmussen wrote:
> > > +       if (page_in_cache)
> > > +               page_add_file_rmap(page, false);
> > > +       else
> > > +               page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
> > > +
> > > +       /*
> > > +        * Must happen after rmap, as mm_counter() checks mapping (via
> > > +        * PageAnon()), which is set by __page_set_anon_rmap().
> > > +        */
> > > +       inc_mm_counter(dst_mm, mm_counter(page));
> >
> > Actually, I've noticed that this is still slightly incorrect.
> >
> > As Hugh pointed out, this works for the anon case, because
> > page_add_new_anon_rmap() sets page->mapping.
> >
> > But for the page_in_cache case, it doesn't work: unlike its anon
> > counterpart, page_add_file_rmap() *does not* set page->mapping.
>
> If it's already in the page cache, shouldn't it be set already in e.g. one
> previous call to shmem_add_to_page_cache()?  Thanks,

Ah, of course. Sorry for the noise. This should have been obvious to
me from how page_in_cache is defined.

I had run into the same "Bad rss-counter state" warning while applying
my patches to an earlier kernel version, and got concerned about this
line after looking at page_add_file_rmap().

But, you're right that this ought to work, and indeed I can't
reproduce the warning when the patches are based on the mm snapshot
mentioned in the cover letter. So, it seems the problem lies with this
other unrelated merge I'm doing, not the series itself. :)

>
> --
> Peter Xu
>
Hugh Dickins April 27, 2021, 2:19 a.m. UTC | #4
On Tue, 20 Apr 2021, Axel Rasmussen wrote:

> With this change, userspace can resolve a minor fault within a
> shmem-backed area with a UFFDIO_CONTINUE ioctl. The semantics for this
> match those for hugetlbfs - we look up the existing page in the page
> cache, and install a PTE for it.
> 
> This commit introduces a new helper: mcopy_atomic_install_pte.
> 
> Why handle UFFDIO_CONTINUE for shmem in mm/userfaultfd.c, instead of in
> shmem.c? The existing userfault implementation only relies on shmem.c
> for VM_SHARED VMAs. However, minor fault handling / CONTINUE work just
> fine for !VM_SHARED VMAs as well. We'd prefer to handle CONTINUE for
> shmem in one place, regardless of shared/private (to reduce code
> duplication).
> 
> Why add a new mcopy_atomic_install_pte helper? A problem we have with
> continue is that shmem_mcopy_atomic_pte() and mcopy_atomic_pte() are
> *close* to what we want, but not exactly. We do want to setup the PTEs
> in a CONTINUE operation, but we don't want to e.g. allocate a new page,
> charge it (e.g. to the shmem inode), manipulate various flags, etc. Also
> we have the problem stated above: shmem_mcopy_atomic_pte() and
> mcopy_atomic_pte() both handle one-half of the problem (shared /
> private) continue cares about. So, introduce mcontinue_atomic_pte(), to
> handle all of the shmem continue cases. Introduce the helper so it
> doesn't duplicate code with mcopy_atomic_pte().
> 
> In a future commit, shmem_mcopy_atomic_pte() will also be modified to
> use this new helper. However, since this is a bigger refactor, it seems
> most clear to do it as a separate change.
> 
> Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>

If this "03/10" had been numbered 04/10, I would have said
Acked-by: Hugh Dickins <hughd@google.com>

But I find this new ordering incomprehensible - I'm surprised that it
even builds this way around (if it does): this patch is so much about
what has been enabled in "04/10" (references to UFFDIO_CONTINUE shmem
VMAs etc).

Does Peter still think this way round is better? If he does, then we
shall have to compromise by asking you just to squash the two together.

> ---
>  mm/userfaultfd.c | 172 ++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 127 insertions(+), 45 deletions(-)
Peter Xu April 27, 2021, 3:54 p.m. UTC | #5
On Mon, Apr 26, 2021 at 07:19:58PM -0700, Hugh Dickins wrote:
> On Tue, 20 Apr 2021, Axel Rasmussen wrote:
> 
> > With this change, userspace can resolve a minor fault within a
> > shmem-backed area with a UFFDIO_CONTINUE ioctl. The semantics for this
> > match those for hugetlbfs - we look up the existing page in the page
> > cache, and install a PTE for it.
> > 
> > This commit introduces a new helper: mcopy_atomic_install_pte.
> > 
> > Why handle UFFDIO_CONTINUE for shmem in mm/userfaultfd.c, instead of in
> > shmem.c? The existing userfault implementation only relies on shmem.c
> > for VM_SHARED VMAs. However, minor fault handling / CONTINUE work just
> > fine for !VM_SHARED VMAs as well. We'd prefer to handle CONTINUE for
> > shmem in one place, regardless of shared/private (to reduce code
> > duplication).
> > 
> > Why add a new mcopy_atomic_install_pte helper? A problem we have with
> > continue is that shmem_mcopy_atomic_pte() and mcopy_atomic_pte() are
> > *close* to what we want, but not exactly. We do want to setup the PTEs
> > in a CONTINUE operation, but we don't want to e.g. allocate a new page,
> > charge it (e.g. to the shmem inode), manipulate various flags, etc. Also
> > we have the problem stated above: shmem_mcopy_atomic_pte() and
> > mcopy_atomic_pte() both handle one-half of the problem (shared /
> > private) continue cares about. So, introduce mcontinue_atomic_pte(), to
> > handle all of the shmem continue cases. Introduce the helper so it
> > doesn't duplicate code with mcopy_atomic_pte().
> > 
> > In a future commit, shmem_mcopy_atomic_pte() will also be modified to
> > use this new helper. However, since this is a bigger refactor, it seems
> > most clear to do it as a separate change.
> > 
> > Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
> 
> If this "03/10" had been numbered 04/10, I would have said
> Acked-by: Hugh Dickins <hughd@google.com>
> 
> But I find this new ordering incomprehensible - I'm surprised that it
> even builds this way around (if it does): this patch is so much about
> what has been enabled in "04/10" (references to UFFDIO_CONTINUE shmem
> VMAs etc).
> 
> Does Peter still think this way round is better? If he does, then we
> shall have to compromise by asking you just to squash the two together.

Hi, Hugh, Axel,

I have no strong opinion. To me, UFFDIO_CONTINUE can be introduced earlier like
this: as long as we don't enable the feature (which is done in the next patch),
no one will be able to call it, so it looks clean. Merging them also looks
good to me.

Thanks,
Axel Rasmussen April 27, 2021, 4:57 p.m. UTC | #6
I'd prefer to keep them separate, as they are not tiny patches (they
are roughly +200/-150 each). And, they really are quite independent -
at least in the sense that I can reorder them via rebase with no
conflicts, and the code builds at each commit in either orientation. I
think this implies they're easier to review separately, rather than
squashed.

I don't have a strong feeling about the order. I slightly prefer
swapping them compared to this v4 series: first introduce minor
faults, then introduce CONTINUE.

Since Peter also has no strong opinion, and Hugh it sounds like you
prefer it the other way around, I'll swap them as we had in some
previous version of this series: first introduce minor faults, then
introduce CONTINUE.

On Tue, Apr 27, 2021 at 8:54 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Mon, Apr 26, 2021 at 07:19:58PM -0700, Hugh Dickins wrote:
> > On Tue, 20 Apr 2021, Axel Rasmussen wrote:
> >
> > > With this change, userspace can resolve a minor fault within a
> > > shmem-backed area with a UFFDIO_CONTINUE ioctl. The semantics for this
> > > match those for hugetlbfs - we look up the existing page in the page
> > > cache, and install a PTE for it.
> > >
> > > This commit introduces a new helper: mcopy_atomic_install_pte.
> > >
> > > Why handle UFFDIO_CONTINUE for shmem in mm/userfaultfd.c, instead of in
> > > shmem.c? The existing userfault implementation only relies on shmem.c
> > > for VM_SHARED VMAs. However, minor fault handling / CONTINUE work just
> > > fine for !VM_SHARED VMAs as well. We'd prefer to handle CONTINUE for
> > > shmem in one place, regardless of shared/private (to reduce code
> > > duplication).
> > >
> > > Why add a new mcopy_atomic_install_pte helper? A problem we have with
> > > continue is that shmem_mcopy_atomic_pte() and mcopy_atomic_pte() are
> > > *close* to what we want, but not exactly. We do want to setup the PTEs
> > > in a CONTINUE operation, but we don't want to e.g. allocate a new page,
> > > charge it (e.g. to the shmem inode), manipulate various flags, etc. Also
> > > we have the problem stated above: shmem_mcopy_atomic_pte() and
> > > mcopy_atomic_pte() both handle one-half of the problem (shared /
> > > private) continue cares about. So, introduce mcontinue_atomic_pte(), to
> > > handle all of the shmem continue cases. Introduce the helper so it
> > > doesn't duplicate code with mcopy_atomic_pte().
> > >
> > > In a future commit, shmem_mcopy_atomic_pte() will also be modified to
> > > use this new helper. However, since this is a bigger refactor, it seems
> > > most clear to do it as a separate change.
> > >
> > > Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
> >
> > If this "03/10" had been numbered 04/10, I would have said
> > Acked-by: Hugh Dickins <hughd@google.com>
> >
> > But I find this new ordering incomprehensible - I'm surprised that it
> > even builds this way around (if it does): this patch is so much about
> > what has been enabled in "04/10" (references to UFFDIO_CONTINUE shmem
> > VMAs etc).
> >
> > Does Peter still think this way round is better? If he does, then we
> > shall have to compromise by asking you just to squash the two together.
>
> Hi, Hugh, Axel,
>
> I have no strong opinion. To me, UFFDIO_CONTINUE can be introduced earlier like
> this. As long as we don't enable the feature (which is done in the next patch),
> no one will be able to call it, then it looks clean.  Merging them also looks
> good to me.
>
> Thanks,
>
> --
> Peter Xu
>
Peter Xu April 27, 2021, 6:03 p.m. UTC | #7
On Tue, Apr 27, 2021 at 09:57:16AM -0700, Axel Rasmussen wrote:
> I'd prefer to keep them separate, as they are not tiny patches (they
> are roughly +200/-150 each). And, they really are quite independent -
> at least in the sense that I can reorder them via rebase with no
> conflicts, and the code builds at each commit in either orientation. I
> think this implies they're easier to review separately, rather than
> squashed.
> 
> I don't have a strong feeling about the order. I slightly prefer
> swapping them compared to this v4 series: first introduce minor
> faults, then introduce CONTINUE.
> 
> Since Peter also has no strong opinion, and Hugh it sounds like you
> prefer it the other way around, I'll swap them as we had in some
> previous version of this series: first introduce minor faults, then
> introduce CONTINUE.

Yes, I have no strong opinion, but that's probably the ordering I prefer least. :-)

Because you'll declare UFFD_FEATURE_MINOR_SHMEM and enable this feature without
the feature being completely implemented (without UFFDIO_CONTINUE, it's not
complete since no one will be able to resolve that minor fault).

Not a big deal anyway, but since we're at it... Basically, I think there are
three things to do for minor shmem support:

  (1) UFFDIO_CONTINUE (resolving path)
  (2) Handle fault path for shmem minor fault (faulting path)
  (3) Enablement of UFFD_FEATURE_MINOR_SHMEM (from which point, user can detect
      and enable it)

I have no preference on how you'd like to merge these steps (right now you did
1 first, then 2+3 later; or as Hugh suggested, do 1+2+3 together), but I'd still
hope item 3 always comes last, if possible...

Thanks,
Axel Rasmussen April 27, 2021, 8:29 p.m. UTC | #8
On Tue, Apr 27, 2021 at 11:03 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Tue, Apr 27, 2021 at 09:57:16AM -0700, Axel Rasmussen wrote:
> > I'd prefer to keep them separate, as they are not tiny patches (they
> > are roughly +200/-150 each). And, they really are quite independent -
> > at least in the sense that I can reorder them via rebase with no
> > conflicts, and the code builds at each commit in either orientation. I
> > think this implies they're easier to review separately, rather than
> > squashed.
> >
> > I don't have a strong feeling about the order. I slightly prefer
> > swapping them compared to this v4 series: first introduce minor
> > faults, then introduce CONTINUE.
> >
> > Since Peter also has no strong opinion, and Hugh it sounds like you
> > prefer it the other way around, I'll swap them as we had in some
> > previous version of this series: first introduce minor faults, then
> > introduce CONTINUE.
>
> Yes I have no strong opinion, but that's probably the least I prefer. :-)
>
> Because you'll declare UFFD_FEATURE_MINOR_SHMEM and enable this feature without
> the feature being completely implemented (without UFFDIO_CONTINUE, it's not
> complete since no one will be able to resolve that minor fault).
>
> Not a big deal anyway, but since we're at it... Basically I think three things
> to do for minor shmem support:
>
>   (1) UFFDIO_CONTINUE (resolving path)
>   (2) Handle fault path for shmem minor fault (faulting path)
>   (3) Enablement of UFFD_FEATURE_MINOR_SHMEM (from which point, user can detect
>       and enable it)
>
> I have no preference on how you'd like to merge these steps (right now you did
> 1 first, then 2+3 later; or as Hugh suggested do 1+2+3 together), but I'd still
> hope item 3 should always be the last, if possible...

In that case, I'll split the patch which adds the faulting path into
two: one commit to add the faulting path hook and registration mode, and a
separate commit to advertise the feature flag as available.

Then I'll order them like so, which I think is the order Hugh finds
more natural:
1. Minor fault registration / faulting path
2. CONTINUE ioctl to resolve the faults
3. Advertise the feature as supported

Sound okay?

>
> Thanks,
>
> --
> Peter Xu
>
Peter Xu April 27, 2021, 8:42 p.m. UTC | #9
On Tue, Apr 27, 2021 at 01:29:14PM -0700, Axel Rasmussen wrote:
> On Tue, Apr 27, 2021 at 11:03 AM Peter Xu <peterx@redhat.com> wrote:
> >
> > On Tue, Apr 27, 2021 at 09:57:16AM -0700, Axel Rasmussen wrote:
> > > I'd prefer to keep them separate, as they are not tiny patches (they
> > > are roughly +200/-150 each). And, they really are quite independent -
> > > at least in the sense that I can reorder them via rebase with no
> > > conflicts, and the code builds at each commit in either orientation. I
> > > think this implies they're easier to review separately, rather than
> > > squashed.
> > >
> > > I don't have a strong feeling about the order. I slightly prefer
> > > swapping them compared to this v4 series: first introduce minor
> > > faults, then introduce CONTINUE.
> > >
> > > Since Peter also has no strong opinion, and Hugh it sounds like you
> > > prefer it the other way around, I'll swap them as we had in some
> > > previous version of this series: first introduce minor faults, then
> > > introduce CONTINUE.
> >
> > Yes I have no strong opinion, but that's probably the least I prefer. :-)
> >
> > Because you'll declare UFFD_FEATURE_MINOR_SHMEM and enable this feature without
> > the feature being completely implemented (without UFFDIO_CONTINUE, it's not
> > complete since no one will be able to resolve that minor fault).
> >
> > Not a big deal anyway, but since we're at it... Basically I think three things
> > to do for minor shmem support:
> >
> >   (1) UFFDIO_CONTINUE (resolving path)
> >   (2) Handle fault path for shmem minor fault (faulting path)
> >   (3) Enablement of UFFD_FEATURE_MINOR_SHMEM (from which point, user can detect
> >       and enable it)
> >
> > I have no preference on how you'd like to merge these steps (right now you did
> > 1 first, then 2+3 later; or as Hugh suggested do 1+2+3 together), but I'd still
> > hope item 3 should always be the last, if possible...
> 
> In that case, I'll split the patch which adds the faulting path in
> two: add the faulting path hook and registration mode, and then in a
> separate commit advertise the feature flag as available.
> 
> Then I'll order them like so, which I think is the order Hugh finds
> more natural:
> 1. MInor fault registration / faulting path
> 2. CONTINUE ioctl to resolve the faults
> 3. Advertise the feature as supported
> 
> Sound okay?

Good to me, thanks Axel.
Hugh Dickins April 27, 2021, 8:59 p.m. UTC | #10
On Tue, Apr 27, 2021 at 1:42 PM Peter Xu <peterx@redhat.com> wrote:
> On Tue, Apr 27, 2021 at 01:29:14PM -0700, Axel Rasmussen wrote:
> > On Tue, Apr 27, 2021 at 11:03 AM Peter Xu <peterx@redhat.com> wrote:
> > >
> > > On Tue, Apr 27, 2021 at 09:57:16AM -0700, Axel Rasmussen wrote:
> > > > I'd prefer to keep them separate, as they are not tiny patches (they
> > > > are roughly +200/-150 each). And, they really are quite independent -
> > > > at least in the sense that I can reorder them via rebase with no
> > > > conflicts, and the code builds at each commit in either orientation. I
> > > > think this implies they're easier to review separately, rather than
> > > > squashed.
> > > >
> > > > I don't have a strong feeling about the order. I slightly prefer
> > > > swapping them compared to this v4 series: first introduce minor
> > > > faults, then introduce CONTINUE.
> > > >
> > > > Since Peter also has no strong opinion, and Hugh it sounds like you
> > > > prefer it the other way around, I'll swap them as we had in some
> > > > previous version of this series: first introduce minor faults, then
> > > > introduce CONTINUE.
> > >
> > > Yes I have no strong opinion, but that's probably the least I prefer. :-)
> > >
> > > Because you'll declare UFFD_FEATURE_MINOR_SHMEM and enable this feature without
> > > the feature being completely implemented (without UFFDIO_CONTINUE, it's not
> > > complete since no one will be able to resolve that minor fault).
> > >
> > > Not a big deal anyway, but since we're at it... Basically I think three things
> > > to do for minor shmem support:
> > >
> > >   (1) UFFDIO_CONTINUE (resolving path)
> > >   (2) Handle fault path for shmem minor fault (faulting path)
> > >   (3) Enablement of UFFD_FEATURE_MINOR_SHMEM (from which point, user can detect
> > >       and enable it)
> > >
> > > I have no preference on how you'd like to merge these steps (right now you did
> > > 1 first, then 2+3 later; or as Hugh suggested do 1+2+3 together), but I'd still
> > > hope item 3 should always be the last, if possible...
> >
> > In that case, I'll split the patch which adds the faulting path in
> > two: add the faulting path hook and registration mode, and then in a
> > separate commit advertise the feature flag as available.
> >
> > Then I'll order them like so, which I think is the order Hugh finds
> > more natural:
> > 1. MInor fault registration / faulting path
> > 2. CONTINUE ioctl to resolve the faults
> > 3. Advertise the feature as supported
> >
> > Sound okay?
>
> Good to me, thanks Axel.

Okay.

Patch

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 23fa2583bbd1..51d8c0127161 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -48,6 +48,83 @@  struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
 	return dst_vma;
 }
 
+/*
+ * Install PTEs, to map dst_addr (within dst_vma) to page.
+ *
+ * This function handles MCOPY_ATOMIC_CONTINUE (which is always file-backed),
+ * whether or not dst_vma is VM_SHARED. It also handles the more general
+ * MCOPY_ATOMIC_NORMAL case, when dst_vma is *not* VM_SHARED (it may be file
+ * backed, or not).
+ *
+ * Note that MCOPY_ATOMIC_NORMAL for a VM_SHARED dst_vma is handled by
+ * shmem_mcopy_atomic_pte instead.
+ */
+static int mcopy_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+				    struct vm_area_struct *dst_vma,
+				    unsigned long dst_addr, struct page *page,
+				    bool newly_allocated, bool wp_copy)
+{
+	int ret;
+	pte_t _dst_pte, *dst_pte;
+	bool writable = dst_vma->vm_flags & VM_WRITE;
+	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
+	bool page_in_cache = page->mapping;
+	spinlock_t *ptl;
+	struct inode *inode;
+	pgoff_t offset, max_off;
+
+	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+	if (page_in_cache && !vm_shared)
+		writable = false;
+	if (writable || !page_in_cache)
+		_dst_pte = pte_mkdirty(_dst_pte);
+	if (writable) {
+		if (wp_copy)
+			_dst_pte = pte_mkuffd_wp(_dst_pte);
+		else
+			_dst_pte = pte_mkwrite(_dst_pte);
+	}
+
+	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+
+	if (vma_is_shmem(dst_vma)) {
+		/* serialize against truncate with the page table lock */
+		inode = dst_vma->vm_file->f_inode;
+		offset = linear_page_index(dst_vma, dst_addr);
+		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+		ret = -EFAULT;
+		if (unlikely(offset >= max_off))
+			goto out_unlock;
+	}
+
+	ret = -EEXIST;
+	if (!pte_none(*dst_pte))
+		goto out_unlock;
+
+	if (page_in_cache)
+		page_add_file_rmap(page, false);
+	else
+		page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+
+	/*
+	 * Must happen after rmap, as mm_counter() checks mapping (via
+	 * PageAnon()), which is set by __page_set_anon_rmap().
+	 */
+	inc_mm_counter(dst_mm, mm_counter(page));
+
+	if (newly_allocated)
+		lru_cache_add_inactive_or_unevictable(page, dst_vma);
+
+	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(dst_vma, dst_addr, dst_pte);
+	ret = 0;
+out_unlock:
+	pte_unmap_unlock(dst_pte, ptl);
+	return ret;
+}
+
 static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    pmd_t *dst_pmd,
 			    struct vm_area_struct *dst_vma,
@@ -56,13 +133,9 @@  static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    struct page **pagep,
 			    bool wp_copy)
 {
-	pte_t _dst_pte, *dst_pte;
-	spinlock_t *ptl;
 	void *page_kaddr;
 	int ret;
 	struct page *page;
-	pgoff_t offset, max_off;
-	struct inode *inode;
 
 	if (!*pagep) {
 		ret = -ENOMEM;
@@ -99,43 +172,12 @@  static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 	if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
 		goto out_release;
 
-	_dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
-	if (dst_vma->vm_flags & VM_WRITE) {
-		if (wp_copy)
-			_dst_pte = pte_mkuffd_wp(_dst_pte);
-		else
-			_dst_pte = pte_mkwrite(_dst_pte);
-	}
-
-	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
-	if (dst_vma->vm_file) {
-		/* the shmem MAP_PRIVATE case requires checking the i_size */
-		inode = dst_vma->vm_file->f_inode;
-		offset = linear_page_index(dst_vma, dst_addr);
-		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
-		ret = -EFAULT;
-		if (unlikely(offset >= max_off))
-			goto out_release_uncharge_unlock;
-	}
-	ret = -EEXIST;
-	if (!pte_none(*dst_pte))
-		goto out_release_uncharge_unlock;
-
-	inc_mm_counter(dst_mm, MM_ANONPAGES);
-	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
-	lru_cache_add_inactive_or_unevictable(page, dst_vma);
-
-	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
-
-	/* No need to invalidate - it was non-present before */
-	update_mmu_cache(dst_vma, dst_addr, dst_pte);
-
-	pte_unmap_unlock(dst_pte, ptl);
-	ret = 0;
+	ret = mcopy_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+				       page, true, wp_copy);
+	if (ret)
+		goto out_release;
 out:
 	return ret;
-out_release_uncharge_unlock:
-	pte_unmap_unlock(dst_pte, ptl);
 out_release:
 	put_page(page);
 	goto out;
@@ -176,6 +218,41 @@  static int mfill_zeropage_pte(struct mm_struct *dst_mm,
 	return ret;
 }
 
+/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
+static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
+				pmd_t *dst_pmd,
+				struct vm_area_struct *dst_vma,
+				unsigned long dst_addr,
+				bool wp_copy)
+{
+	struct inode *inode = file_inode(dst_vma->vm_file);
+	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+	struct page *page;
+	int ret;
+
+	ret = shmem_getpage(inode, pgoff, &page, SGP_READ);
+	if (ret)
+		goto out;
+	if (!page) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = mcopy_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+				       page, false, wp_copy);
+	if (ret)
+		goto out_release;
+
+	unlock_page(page);
+	ret = 0;
+out:
+	return ret;
+out_release:
+	unlock_page(page);
+	put_page(page);
+	goto out;
+}
+
 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
 {
 	pgd_t *pgd;
@@ -415,11 +492,16 @@  static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 						unsigned long dst_addr,
 						unsigned long src_addr,
 						struct page **page,
-						bool zeropage,
+						enum mcopy_atomic_mode mode,
 						bool wp_copy)
 {
 	ssize_t err;
 
+	if (mode == MCOPY_ATOMIC_CONTINUE) {
+		return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+					    wp_copy);
+	}
+
 	/*
 	 * The normal page fault path for a shmem will invoke the
 	 * fault, fill the hole in the file and COW it right away. The
@@ -431,7 +513,7 @@  static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 	 * and not in the radix tree.
 	 */
 	if (!(dst_vma->vm_flags & VM_SHARED)) {
-		if (!zeropage)
+		if (mode == MCOPY_ATOMIC_NORMAL)
 			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
 					       dst_addr, src_addr, page,
 					       wp_copy);
@@ -441,7 +523,8 @@  static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 	} else {
 		VM_WARN_ON_ONCE(wp_copy);
 		err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
-					     dst_addr, src_addr, zeropage,
+					     dst_addr, src_addr,
+					     mode != MCOPY_ATOMIC_NORMAL,
 					     page);
 	}
 
@@ -463,7 +546,6 @@  static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	long copied;
 	struct page *page;
 	bool wp_copy;
-	bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
 
 	/*
 	 * Sanitize the command parameters:
@@ -526,7 +608,7 @@  static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
-	if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+	if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
 		goto out_unlock;
 
 	/*
@@ -574,7 +656,7 @@  static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 		BUG_ON(pmd_trans_huge(*dst_pmd));
 
 		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-				       src_addr, &page, zeropage, wp_copy);
+				       src_addr, &page, mcopy_mode, wp_copy);
 		cond_resched();
 
 		if (unlikely(err == -ENOENT)) {