Message ID | 20230308221932.1548827-5-axelrasmussen@google.com (mailing list archive) |
---|---|
State | Mainlined, archived |
Headers | show |
Series | mm: userfaultfd: refactor and add UFFDIO_CONTINUE_MODE_WP | expand |
On Wed, Mar 08, 2023 at 02:19:32PM -0800, Axel Rasmussen wrote: > UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new > PTE to resolve a missing fault, one can install a write-protected one. > This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in > combination. > > This was motivated by testing HugeTLB HGM [1], and in particular its > interaction with userfaultfd features. Existing userfaultfd code > supports using WP and MINOR modes together (i.e. you can register an > area with both enabled), but without this CONTINUE flag the combination > is in practice unusable. > > So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing > as UFFDIO_COPY_MODE_WP, but for *minor* faults. > > Update the selftest to do some very basic exercising of the new flag. > > [1]: https://patchwork.kernel.org/project/linux-mm/cover/20230218002819.1486479-1-jthoughton@google.com/ > > Acked-by: Peter Xu <peterx@redhat.com> > Signed-off-by: Axel Rasmussen <axelrasmussen@google.com> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> > --- > fs/userfaultfd.c | 8 ++++++-- > include/linux/userfaultfd_k.h | 3 ++- > include/uapi/linux/userfaultfd.h | 7 +++++++ > mm/userfaultfd.c | 5 +++-- > tools/testing/selftests/mm/userfaultfd.c | 4 ++++ > 5 files changed, 22 insertions(+), 5 deletions(-) > > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h > index 005e5e306266..14059a0861bf 100644 > --- a/include/uapi/linux/userfaultfd.h > +++ b/include/uapi/linux/userfaultfd.h > @@ -297,6 +297,13 @@ struct uffdio_writeprotect { > struct uffdio_continue { > struct uffdio_range range; > #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) > + /* > + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on > + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the > + * write protected ioctl is implemented for the range > + * according to the uffdio_register.ioctls. 
> + */ > +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) Please add the description of the new flag to Documentation/ and to the userfaultfd man pages. > __u64 mode; > > /*
On Thu, Mar 9, 2023 at 1:11 AM Mike Rapoport <rppt@kernel.org> wrote: > > On Wed, Mar 08, 2023 at 02:19:32PM -0800, Axel Rasmussen wrote: > > UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new > > PTE to resolve a missing fault, one can install a write-protected one. > > This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in > > combination. > > > > This was motivated by testing HugeTLB HGM [1], and in particular its > > interaction with userfaultfd features. Existing userfaultfd code > > supports using WP and MINOR modes together (i.e. you can register an > > area with both enabled), but without this CONTINUE flag the combination > > is in practice unusable. > > > > So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing > > as UFFDIO_COPY_MODE_WP, but for *minor* faults. > > > > Update the selftest to do some very basic exercising of the new flag. > > > > [1]: https://patchwork.kernel.org/project/linux-mm/cover/20230218002819.1486479-1-jthoughton@google.com/ > > > > Acked-by: Peter Xu <peterx@redhat.com> > > Signed-off-by: Axel Rasmussen <axelrasmussen@google.com> > > Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> > > > --- > > fs/userfaultfd.c | 8 ++++++-- > > include/linux/userfaultfd_k.h | 3 ++- > > include/uapi/linux/userfaultfd.h | 7 +++++++ > > mm/userfaultfd.c | 5 +++-- > > tools/testing/selftests/mm/userfaultfd.c | 4 ++++ > > 5 files changed, 22 insertions(+), 5 deletions(-) > > > > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h > > index 005e5e306266..14059a0861bf 100644 > > --- a/include/uapi/linux/userfaultfd.h > > +++ b/include/uapi/linux/userfaultfd.h > > @@ -297,6 +297,13 @@ struct uffdio_writeprotect { > > struct uffdio_continue { > > struct uffdio_range range; > > #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) > > + /* > > + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on > > + * the fly. 
UFFDIO_CONTINUE_MODE_WP is available only if the > > + * write protected ioctl is implemented for the range > > + * according to the uffdio_register.ioctls. > > + */ > > +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) > > Please add the description of the new flag to Documentation/ and to the > userfaultfd man pages. Funnily enough, neither flag is mentioned in Documentation/ today, so I'll add a short passage about both. I'm happy to update the man pages as well; I'll send that patch separately. Thanks for reviewing! > > > __u64 mode; > > > > /* > > -- > Sincerely yours, > Mike.
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 56e54e50414e..664019381e04 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1878,6 +1878,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) struct uffdio_continue uffdio_continue; struct uffdio_continue __user *user_uffdio_continue; struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; user_uffdio_continue = (struct uffdio_continue __user *)arg; @@ -1902,13 +1903,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) uffdio_continue.range.start) { goto out; } - if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) + if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | + UFFDIO_CONTINUE_MODE_WP)) goto out; + if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) + flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, uffdio_continue.range.len, - &ctx->mmap_changing); + &ctx->mmap_changing, flags); mmput(ctx->mm); } else { return -ESRCH; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 4d7425684171..9499cfcf83fa 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -82,7 +82,8 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long len, atomic_t *mmap_changing); extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing); + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 005e5e306266..14059a0861bf 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -297,6 +297,13 @@ struct uffdio_writeprotect { struct uffdio_continue { 
struct uffdio_range range; #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) __u64 mode; /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index dd807924446f..2f64e0a9b234 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -693,10 +693,11 @@ ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start, } ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags) { return mfill_atomic(dst_mm, start, 0, len, mmap_changing, - uffd_flags_set_mode(0, MFILL_ATOMIC_CONTINUE)); + uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } long uffd_wp_range(struct vm_area_struct *dst_vma, diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c index 7f22844ed704..41c1f9abc481 100644 --- a/tools/testing/selftests/mm/userfaultfd.c +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -585,6 +585,8 @@ static void continue_range(int ufd, __u64 start, __u64 len) req.range.start = start; req.range.len = len; req.mode = 0; + if (test_uffdio_wp) + req.mode |= UFFDIO_CONTINUE_MODE_WP; if (ioctl(ufd, UFFDIO_CONTINUE, &req)) err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, @@ -1332,6 +1334,8 @@ static int userfaultfd_minor_test(void) uffdio_register.range.start = (unsigned long)area_dst_alias; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure");