diff mbox series

[v1,3/5] userfaultfd: introduce write-likely mode for uffd operations

Message ID 20220622185038.71740-4-namit@vmware.com (mailing list archive)
State New
Headers show
Series userfaultfd: support access/write hints | expand

Commit Message

Nadav Amit June 22, 2022, 6:50 p.m. UTC
From: Nadav Amit <namit@vmware.com>

Either always setting the dirty bit or always leaving it clear does not
seem to be the best policy. Leaving the bit clear introduces overhead on
the first write access, which then has to set the bit.  Setting the
bit for pages that are eventually not written can require more TLB
flushes.

Let the userfaultfd users control whether PTEs are marked as dirty or
clean. Introduce UFFDIO_[op]_MODE_WRITE_LIKELY to enable userspace to
indicate that pages are likely to be written, and set the dirty bit
when that hint is given.

Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Nadav Amit <namit@vmware.com>
---
 fs/userfaultfd.c                 | 20 ++++++++++++++++----
 include/linux/userfaultfd_k.h    |  1 +
 include/uapi/linux/userfaultfd.h | 13 ++++++++++++-
 mm/hugetlb.c                     |  3 +++
 mm/shmem.c                       |  3 +++
 mm/userfaultfd.c                 | 13 ++++++++++---
 6 files changed, 45 insertions(+), 8 deletions(-)
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index abf176bd0349..13d73e37e230 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1727,7 +1727,8 @@  static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
 		goto out;
 	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP|
-				 UFFDIO_COPY_MODE_ACCESS_LIKELY))
+				 UFFDIO_COPY_MODE_ACCESS_LIKELY|
+				 UFFDIO_COPY_MODE_WRITE_LIKELY))
 		goto out;
 
 	mode_wp = uffdio_copy.mode & UFFDIO_COPY_MODE_WP;
@@ -1735,6 +1736,8 @@  static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	uffd_flags = mode_wp ? UFFD_FLAGS_WP : UFFD_FLAGS_NONE;
 	if (uffdio_copy.mode & UFFDIO_COPY_MODE_ACCESS_LIKELY)
 		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WRITE_LIKELY)
+		uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
 
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
@@ -1787,11 +1790,14 @@  static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 		goto out;
 	ret = -EINVAL;
 	if (uffdio_zeropage.mode & ~(UFFDIO_ZEROPAGE_MODE_DONTWAKE|
-				     UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY))
+				     UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY|
+				     UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY))
 		goto out;
 
 	if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY)
 		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+	if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY)
+		uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
 
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
@@ -1843,7 +1849,8 @@  static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 
 	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
 			       UFFDIO_WRITEPROTECT_MODE_WP |
-			       UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY))
+			       UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY |
+			       UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY))
 		return -EINVAL;
 
 	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
@@ -1855,6 +1862,8 @@  static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 	uffd_flags = mode_wp ? UFFD_FLAGS_WP : UFFD_FLAGS_NONE;
 	if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY)
 		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+	if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY)
+		uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
 
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
@@ -1908,11 +1917,14 @@  static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 		goto out;
 	}
 	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE|
-				     UFFDIO_CONTINUE_MODE_ACCESS_LIKELY))
+				     UFFDIO_CONTINUE_MODE_ACCESS_LIKELY|
+				     UFFDIO_CONTINUE_MODE_WRITE_LIKELY))
 		goto out;
 
 	if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_ACCESS_LIKELY)
 		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+	if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WRITE_LIKELY)
+		uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
 
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index af268b2c2b27..59c43ea502e7 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -60,6 +60,7 @@  typedef unsigned int __bitwise uffd_flags_t;
 #define UFFD_FLAGS_NONE			((__force uffd_flags_t)0)
 #define UFFD_FLAGS_WP			((__force uffd_flags_t)BIT(0))
 #define UFFD_FLAGS_ACCESS_LIKELY	((__force uffd_flags_t)BIT(1))
+#define UFFD_FLAGS_WRITE_LIKELY		((__force uffd_flags_t)BIT(2))
 
 extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 				    struct vm_area_struct *dst_vma,
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index ff7150c878bb..7b6ab0b43475 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -206,7 +206,7 @@  struct uffdio_api {
 	 * write-protection mode is supported on both shmem and hugetlbfs.
 	 *
 	 * UFFD_FEATURE_ACCESS_HINTS indicates that the ioctl operations
-	 * support the UFFDIO_*_MODE_ACCESS_LIKELY hints.
+	 * support the UFFDIO_*_MODE_[ACCESS|WRITE]_LIKELY hints.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -261,9 +261,13 @@  struct uffdio_copy {
 	 * page is likely to be access in the near future. Providing the hint
 	 * properly can improve performance.
 	 *
+	 * UFFDIO_COPY_MODE_WRITE_LIKELY provides a hint to the kernel that the
+	 * page is likely to be written in the near future. Providing the hint
+	 * properly can improve performance.
 	 */
 #define UFFDIO_COPY_MODE_WP			((__u64)1<<1)
 #define UFFDIO_COPY_MODE_ACCESS_LIKELY		((__u64)1<<2)
+#define UFFDIO_COPY_MODE_WRITE_LIKELY		((__u64)1<<3)
 	__u64 mode;
 
 	/*
@@ -277,6 +281,7 @@  struct uffdio_zeropage {
 	struct uffdio_range range;
 #define UFFDIO_ZEROPAGE_MODE_DONTWAKE		((__u64)1<<0)
 #define UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY	((__u64)1<<1)
+#define UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY	((__u64)1<<2)
 	__u64 mode;
 
 	/*
@@ -300,6 +305,10 @@  struct uffdio_writeprotect {
  * that the page is likely to be access in the near future. Providing
  * the hint properly can improve performance.
  *
+ * UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY: provides a hint to the kernel
+ * that the page is likely to be written in the near future. Providing
+ * the hint properly can improve performance.
+ *
  * NOTE: Write protecting a region (WP=1) is unrelated to page faults,
  * therefore DONTWAKE flag is meaningless with WP=1.  Removing write
  * protection (WP=0) in response to a page fault wakes the faulting
@@ -308,6 +317,7 @@  struct uffdio_writeprotect {
 #define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
 #define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
 #define UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY	((__u64)1<<2)
+#define UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY	((__u64)1<<3)
 	__u64 mode;
 };
 
@@ -315,6 +325,7 @@  struct uffdio_continue {
 	struct uffdio_range range;
 #define UFFDIO_CONTINUE_MODE_DONTWAKE		((__u64)1<<0)
 #define UFFDIO_CONTINUE_MODE_ACCESS_LIKELY	((__u64)1<<1)
+#define UFFDIO_CONTINUE_MODE_WRITE_LIKELY	((__u64)1<<2)
 	__u64 mode;
 
 	/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2beff8a4bf7c..46814fc7762f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5962,6 +5962,9 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		*pagep = NULL;
 	}
 
+	/* The PTE is not unconditionally marked as dirty */
+	SetPageDirty(page);
+
 	/*
 	 * The memory barrier inside __SetPageUptodate makes sure that
 	 * preceding stores to the page contents become visible before
diff --git a/mm/shmem.c b/mm/shmem.c
index 89c775275bae..7488cd186c32 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2404,6 +2404,9 @@  int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 	VM_BUG_ON(PageSwapBacked(page));
 	__SetPageLocked(page);
 	__SetPageSwapBacked(page);
+
+	/* The PTE is not unconditionally marked as dirty */
+	SetPageDirty(page);
 	__SetPageUptodate(page);
 
 	ret = -EFAULT;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 5051b9028722..6e767f1e7007 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -70,7 +70,6 @@  int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 	pgoff_t offset, max_off;
 
 	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
-	_dst_pte = pte_mkdirty(_dst_pte);
 	if (page_in_cache && !vm_shared)
 		writable = false;
 
@@ -83,14 +82,19 @@  int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 		writable = false;
 	}
 
-	if (writable)
+	if (writable) {
 		_dst_pte = pte_mkwrite(_dst_pte);
-	else
+
+		/* Marking RO entries as dirty can mess with other code */
+		if (uffd_flags & UFFD_FLAGS_WRITE_LIKELY)
+			_dst_pte = pte_mkdirty(_dst_pte);
+	} else {
 		/*
 		 * We need this to make sure write bit removed; as mk_pte()
 		 * could return a pte with write bit set.
 		 */
 		_dst_pte = pte_wrprotect(_dst_pte);
+	}
 
 	if (uffd_flags & UFFD_FLAGS_ACCESS_LIKELY)
 		_dst_pte = pte_mkyoung(_dst_pte);
@@ -180,6 +184,9 @@  static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 		*pagep = NULL;
 	}
 
+	/* The PTE is not unconditionally marked as dirty */
+	SetPageDirty(page);
+
 	/*
 	 * The memory barrier inside __SetPageUptodate makes sure that
 	 * preceding stores to the page contents become visible before