Message ID | 20220619233449.181323-5-namit@vmware.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | userfaultfd: support access/write hints | expand |
On Sun, Jun 19, 2022 at 04:34:48PM -0700, Nadav Amit wrote: > From: Nadav Amit <namit@vmware.com> > > When userfaultfd provides a zeropage in response to ioctl, it provides a > readonly alias to the zero page. If the page is later written (which is > the likely scenario), page-fault occurs and the page-fault allocator > allocates a page and rewires the page-tables. > > This is an expensive flow for cases in which a page is likely be written > to. Users can use the copy ioctl to initialize zero page (by copying > zeros), but this is also wasteful. > > Allow userfaultfd users to efficiently map initialized zero-pages that > are writable. Introduce UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY, which, when > provided would map a clear page instead of an alias to the zero page. > > For consistency, introduce also UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY. > > Suggested-by: David Hildenbrand <david@redhat.com> > Cc: Mike Kravetz <mike.kravetz@oracle.com> > Cc: Hugh Dickins <hughd@google.com> > Cc: Andrew Morton <akpm@linux-foundation.org> > Cc: Axel Rasmussen <axelrasmussen@google.com> > Cc: Peter Xu <peterx@redhat.com> > Cc: Mike Rapoport <rppt@linux.ibm.com> > Signed-off-by: Nadav Amit <namit@vmware.com> > --- > fs/userfaultfd.c | 14 +++++++++++-- > include/uapi/linux/userfaultfd.h | 2 ++ > mm/userfaultfd.c | 36 ++++++++++++++++++++++++++++++++ > 3 files changed, 50 insertions(+), 2 deletions(-) > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > index a56983b594d5..ff073de78ea8 100644 > --- a/fs/userfaultfd.c > +++ b/fs/userfaultfd.c > @@ -1770,6 +1770,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, > struct uffdio_zeropage uffdio_zeropage; > struct uffdio_zeropage __user *user_uffdio_zeropage; > struct userfaultfd_wake_range range; > + bool mode_dontwake, mode_access_likely, mode_write_likely; > + uffd_flags_t uffd_flags; > > user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; > > @@ -1788,8 +1790,16 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, > if (ret) > goto out; > ret = -EINVAL; > - if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) > - goto out; > + > + mode_dontwake = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE; > + mode_access_likely = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY; > + mode_write_likely = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY; > + > + if (mode_dontwake) > + return -EINVAL; Hmm.. Why? Note that the above uffdio_zeropage.mode check was for invalid mode flags only, and I think that should be kept, but still I don't see why we want to fail UFFDIO_ZEROPAGE_MODE_DONTWAKE users. > + > + uffd_flags = (mode_access_likely ? UFFD_FLAGS_ACCESS_LIKELY : 0) | > + (mode_write_likely ? UFFD_FLAGS_WRITE_LIKELY : 0); > > if (mmget_not_zero(ctx->mm)) { > ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h > index 6ad93a13282e..b586b7c1e265 100644 > --- a/include/uapi/linux/userfaultfd.h > +++ b/include/uapi/linux/userfaultfd.h > @@ -286,6 +286,8 @@ struct uffdio_copy { > struct uffdio_zeropage { > struct uffdio_range range; > #define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) > +#define UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY ((__u64)1<<2) > +#define UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY ((__u64)1<<3) > __u64 mode; > > /* > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > index 3172158d8faa..5dfbb1e80369 100644 > --- a/mm/userfaultfd.c > +++ b/mm/userfaultfd.c > @@ -249,6 +249,38 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm, > return ret; > } > > +static int mfill_clearpage_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, > + struct vm_area_struct *dst_vma, > + unsigned long dst_addr, > + uffd_flags_t uffd_flags) > +{ > + struct page *page; > + int ret; > + > + ret = -ENOMEM; > + page = alloc_zeroed_user_highpage_movable(dst_vma, dst_addr); > + if (!page) > + goto out; > + > + /* The PTE is not marked as dirty unconditionally */ > + SetPageDirty(page); > + __SetPageUptodate(page); > + > + ret = -ENOMEM; Nit: can drop this since ret will always be -ENOMEM here.. Thanks, > + if (mem_cgroup_charge(page_folio(page), dst_vma->vm_mm, GFP_KERNEL)) > + goto out_release; > + > + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, > + page, true, uffd_flags); > + if (ret) > + goto out_release; > +out: > + return ret; > +out_release: > + put_page(page); > + goto out; > +} > + > /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ > static int mcontinue_atomic_pte(struct mm_struct *dst_mm, > pmd_t *dst_pmd, > @@ -511,6 +543,10 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, > err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, > dst_addr, src_addr, page, > uffd_flags); > + else if (!(uffd_flags & UFFD_FLAGS_WP) && > + (uffd_flags & UFFD_FLAGS_WRITE_LIKELY)) > + err = mfill_clearpage_pte(dst_mm, dst_pmd, dst_vma, > + dst_addr, uffd_flags); > else > err = mfill_zeropage_pte(dst_mm, dst_pmd, > dst_vma, dst_addr, uffd_flags); > -- > 2.25.1 >
On Jun 21, 2022, at 10:04 AM, Peter Xu <peterx@redhat.com> wrote: > ⚠ External Email > > On Sun, Jun 19, 2022 at 04:34:48PM -0700, Nadav Amit wrote: >> From: Nadav Amit <namit@vmware.com> >> >> When userfaultfd provides a zeropage in response to ioctl, it provides a >> readonly alias to the zero page. If the page is later written (which is >> the likely scenario), page-fault occurs and the page-fault allocator >> allocates a page and rewires the page-tables. >> >> This is an expensive flow for cases in which a page is likely be written >> to. Users can use the copy ioctl to initialize zero page (by copying >> zeros), but this is also wasteful. >> >> Allow userfaultfd users to efficiently map initialized zero-pages that >> are writable. Introduce UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY, which, when >> provided would map a clear page instead of an alias to the zero page. >> >> For consistency, introduce also UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY. >> >> Suggested-by: David Hildenbrand <david@redhat.com> >> Cc: Mike Kravetz <mike.kravetz@oracle.com> >> Cc: Hugh Dickins <hughd@google.com> >> Cc: Andrew Morton <akpm@linux-foundation.org> >> Cc: Axel Rasmussen <axelrasmussen@google.com> >> Cc: Peter Xu <peterx@redhat.com> >> Cc: Mike Rapoport <rppt@linux.ibm.com> >> Signed-off-by: Nadav Amit <namit@vmware.com> >> --- >> fs/userfaultfd.c | 14 +++++++++++-- >> include/uapi/linux/userfaultfd.h | 2 ++ >> mm/userfaultfd.c | 36 ++++++++++++++++++++++++++++++++ >> 3 files changed, 50 insertions(+), 2 deletions(-) >> >> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c >> index a56983b594d5..ff073de78ea8 100644 >> --- a/fs/userfaultfd.c >> +++ b/fs/userfaultfd.c >> @@ -1770,6 +1770,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, >> struct uffdio_zeropage uffdio_zeropage; >> struct uffdio_zeropage __user *user_uffdio_zeropage; >> struct userfaultfd_wake_range range; >> + bool mode_dontwake, mode_access_likely, mode_write_likely; >> + uffd_flags_t uffd_flags; >> >> user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; >> >> @@ -1788,8 +1790,16 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, >> if (ret) >> goto out; >> ret = -EINVAL; >> - if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) >> - goto out; >> + >> + mode_dontwake = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE; >> + mode_access_likely = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY; >> + mode_write_likely = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY; >> + >> + if (mode_dontwake) >> + return -EINVAL; > > Hmm.. Why? > > Note that the above uffdio_zeropage.mode check was for invalid mode flags > only, and I think that should be kept, but still I don't see why we want to > fail UFFDIO_ZEROPAGE_MODE_DONTWAKE users. > >> + >> + uffd_flags = (mode_access_likely ? UFFD_FLAGS_ACCESS_LIKELY : 0) | >> + (mode_write_likely ? UFFD_FLAGS_WRITE_LIKELY : 0); >> >> if (mmget_not_zero(ctx->mm)) { >> ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, >> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h >> index 6ad93a13282e..b586b7c1e265 100644 >> --- a/include/uapi/linux/userfaultfd.h >> +++ b/include/uapi/linux/userfaultfd.h >> @@ -286,6 +286,8 @@ struct uffdio_copy { >> struct uffdio_zeropage { >> struct uffdio_range range; >> #define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) >> +#define UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY ((__u64)1<<2) >> +#define UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY ((__u64)1<<3) >> __u64 mode; >> >> /* >> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c >> index 3172158d8faa..5dfbb1e80369 100644 >> --- a/mm/userfaultfd.c >> +++ b/mm/userfaultfd.c >> @@ -249,6 +249,38 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm, >> return ret; >> } >> >> +static int mfill_clearpage_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, >> + struct vm_area_struct *dst_vma, >> + unsigned long dst_addr, >> + uffd_flags_t uffd_flags) >> +{ >> + struct page *page; >> + int ret; >> + >> + ret = -ENOMEM; >> + page = alloc_zeroed_user_highpage_movable(dst_vma, dst_addr); >> + if (!page) >> + goto out; >> + >> + /* The PTE is not marked as dirty unconditionally */ >> + SetPageDirty(page); >> + __SetPageUptodate(page); >> + >> + ret = -ENOMEM; > > Nit: can drop this since ret will always be -ENOMEM here.. I noticed. Just thought it is clearer this way, and more robust against future changes.
On Tue, Jun 21, 2022 at 05:17:05PM +0000, Nadav Amit wrote: > On Jun 21, 2022, at 10:04 AM, Peter Xu <peterx@redhat.com> wrote: > > > ⚠ External Email > > > > On Sun, Jun 19, 2022 at 04:34:48PM -0700, Nadav Amit wrote: > >> From: Nadav Amit <namit@vmware.com> > >> > >> When userfaultfd provides a zeropage in response to ioctl, it provides a > >> readonly alias to the zero page. If the page is later written (which is > >> the likely scenario), page-fault occurs and the page-fault allocator > >> allocates a page and rewires the page-tables. > >> > >> This is an expensive flow for cases in which a page is likely be written > >> to. Users can use the copy ioctl to initialize zero page (by copying > >> zeros), but this is also wasteful. > >> > >> Allow userfaultfd users to efficiently map initialized zero-pages that > >> are writable. Introduce UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY, which, when > >> provided would map a clear page instead of an alias to the zero page. > >> > >> For consistency, introduce also UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY. > >> > >> Suggested-by: David Hildenbrand <david@redhat.com> > >> Cc: Mike Kravetz <mike.kravetz@oracle.com> > >> Cc: Hugh Dickins <hughd@google.com> > >> Cc: Andrew Morton <akpm@linux-foundation.org> > >> Cc: Axel Rasmussen <axelrasmussen@google.com> > >> Cc: Peter Xu <peterx@redhat.com> > >> Cc: Mike Rapoport <rppt@linux.ibm.com> > >> Signed-off-by: Nadav Amit <namit@vmware.com> > >> --- > >> fs/userfaultfd.c | 14 +++++++++++-- > >> include/uapi/linux/userfaultfd.h | 2 ++ > >> mm/userfaultfd.c | 36 ++++++++++++++++++++++++++++++++ > >> 3 files changed, 50 insertions(+), 2 deletions(-) > >> > >> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > >> index a56983b594d5..ff073de78ea8 100644 > >> --- a/fs/userfaultfd.c > >> +++ b/fs/userfaultfd.c > >> @@ -1770,6 +1770,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, > >> struct uffdio_zeropage uffdio_zeropage; > >> struct uffdio_zeropage __user *user_uffdio_zeropage; > >> struct userfaultfd_wake_range range; > >> + bool mode_dontwake, mode_access_likely, mode_write_likely; > >> + uffd_flags_t uffd_flags; > >> > >> user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; > >> > >> @@ -1788,8 +1790,16 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, > >> if (ret) > >> goto out; > >> ret = -EINVAL; > >> - if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) > >> - goto out; > >> + > >> + mode_dontwake = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE; > >> + mode_access_likely = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY; > >> + mode_write_likely = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY; > >> + > >> + if (mode_dontwake) > >> + return -EINVAL; > > > > Hmm.. Why? > > > > Note that the above uffdio_zeropage.mode check was for invalid mode flags > > only, and I think that should be kept, but still I don't see why we want to > > fail UFFDIO_ZEROPAGE_MODE_DONTWAKE users. [1] > > > >> + > >> + uffd_flags = (mode_access_likely ? UFFD_FLAGS_ACCESS_LIKELY : 0) | > >> + (mode_write_likely ? UFFD_FLAGS_WRITE_LIKELY : 0); > >> > >> if (mmget_not_zero(ctx->mm)) { > >> ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, > >> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h > >> index 6ad93a13282e..b586b7c1e265 100644 > >> --- a/include/uapi/linux/userfaultfd.h > >> +++ b/include/uapi/linux/userfaultfd.h > >> @@ -286,6 +286,8 @@ struct uffdio_copy { > >> struct uffdio_zeropage { > >> struct uffdio_range range; > >> #define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) > >> +#define UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY ((__u64)1<<2) > >> +#define UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY ((__u64)1<<3) > >> __u64 mode; > >> > >> /* > >> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > >> index 3172158d8faa..5dfbb1e80369 100644 > >> --- a/mm/userfaultfd.c > >> +++ b/mm/userfaultfd.c > >> @@ -249,6 +249,38 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm, > >> return ret; > >> } > >> > >> +static int mfill_clearpage_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, > >> + struct vm_area_struct *dst_vma, > >> + unsigned long dst_addr, > >> + uffd_flags_t uffd_flags) > >> +{ > >> + struct page *page; > >> + int ret; > >> + > >> + ret = -ENOMEM; > >> + page = alloc_zeroed_user_highpage_movable(dst_vma, dst_addr); > >> + if (!page) > >> + goto out; > >> + > >> + /* The PTE is not marked as dirty unconditionally */ > >> + SetPageDirty(page); > >> + __SetPageUptodate(page); > >> + > >> + ret = -ENOMEM; > > > > Nit: can drop this since ret will always be -ENOMEM here.. > > I noticed. Just thought it is clearer this way, and more robust against > future changes. I'd rather leave that for future, but if you really prefer that no problem on my side too. Please just still check [1] above and that's the major real comment, just to make sure it's not overlooked..
On Jun 21, 2022, at 10:56 AM, Peter Xu <peterx@redhat.com> wrote: > ⚠ External Email >>> Nit: can drop this since ret will always be -ENOMEM here.. >> >> I noticed. Just thought it is clearer this way, and more robust against >> future changes. > > I'd rather leave that for future, but if you really prefer that no problem > on my side too. > > Please just still check [1] above and that's the major real comment, just > to make sure it's not overlooked.. Yes. I noticed it just after sending my email, and didn’t want to spam. Fixed. Thanks for noticing it!! Regards, Nadav
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index a56983b594d5..ff073de78ea8 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1770,6 +1770,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, struct uffdio_zeropage uffdio_zeropage; struct uffdio_zeropage __user *user_uffdio_zeropage; struct userfaultfd_wake_range range; + bool mode_dontwake, mode_access_likely, mode_write_likely; + uffd_flags_t uffd_flags; user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; @@ -1788,8 +1790,16 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (ret) goto out; ret = -EINVAL; - if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) - goto out; + + mode_dontwake = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE; + mode_access_likely = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY; + mode_write_likely = uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY; + + if (mode_dontwake) + return -EINVAL; + + uffd_flags = (mode_access_likely ? UFFD_FLAGS_ACCESS_LIKELY : 0) | + (mode_write_likely ? UFFD_FLAGS_WRITE_LIKELY : 0); if (mmget_not_zero(ctx->mm)) { ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 6ad93a13282e..b586b7c1e265 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -286,6 +286,8 @@ struct uffdio_copy { struct uffdio_zeropage { struct uffdio_range range; #define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY ((__u64)1<<2) +#define UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY ((__u64)1<<3) __u64 mode; /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 3172158d8faa..5dfbb1e80369 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -249,6 +249,38 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm, return ret; } +static int mfill_clearpage_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + uffd_flags_t uffd_flags) +{ + struct page *page; + int ret; + + ret = -ENOMEM; + page = alloc_zeroed_user_highpage_movable(dst_vma, dst_addr); + if (!page) + goto out; + + /* The PTE is not marked as dirty unconditionally */ + SetPageDirty(page); + __SetPageUptodate(page); + + ret = -ENOMEM; + if (mem_cgroup_charge(page_folio(page), dst_vma->vm_mm, GFP_KERNEL)) + goto out_release; + + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + page, true, uffd_flags); + if (ret) + goto out_release; +out: + return ret; +out_release: + put_page(page); + goto out; +} + /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, @@ -511,6 +543,10 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, page, uffd_flags); + else if (!(uffd_flags & UFFD_FLAGS_WP) && + (uffd_flags & UFFD_FLAGS_WRITE_LIKELY)) + err = mfill_clearpage_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, uffd_flags); else err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr, uffd_flags);