@@ -1727,7 +1727,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
goto out;
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP|
- UFFDIO_COPY_MODE_ACCESS_LIKELY))
+ UFFDIO_COPY_MODE_ACCESS_LIKELY|
+ UFFDIO_COPY_MODE_WRITE_LIKELY))
goto out;
mode_wp = uffdio_copy.mode & UFFDIO_COPY_MODE_WP;
@@ -1735,6 +1736,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
uffd_flags = mode_wp ? UFFD_FLAGS_WP : UFFD_FLAGS_NONE;
if (uffdio_copy.mode & UFFDIO_COPY_MODE_ACCESS_LIKELY)
uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+ if (uffdio_copy.mode & UFFDIO_COPY_MODE_WRITE_LIKELY)
+ uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
if (mmget_not_zero(ctx->mm)) {
ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
@@ -1787,11 +1790,14 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;
ret = -EINVAL;
if (uffdio_zeropage.mode & ~(UFFDIO_ZEROPAGE_MODE_DONTWAKE|
- UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY))
+ UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY|
+ UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY))
goto out;
if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY)
uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+ if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY)
+ uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
if (mmget_not_zero(ctx->mm)) {
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
@@ -1843,7 +1849,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
UFFDIO_WRITEPROTECT_MODE_WP |
- UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY))
+ UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY |
+ UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY))
return -EINVAL;
mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
@@ -1855,6 +1862,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
uffd_flags = mode_wp ? UFFD_FLAGS_WP : UFFD_FLAGS_NONE;
if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY)
uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+ if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY)
+ uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
if (mmget_not_zero(ctx->mm)) {
ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
@@ -1908,11 +1917,14 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
goto out;
}
if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE|
- UFFDIO_CONTINUE_MODE_ACCESS_LIKELY))
+ UFFDIO_CONTINUE_MODE_ACCESS_LIKELY|
+ UFFDIO_CONTINUE_MODE_WRITE_LIKELY))
goto out;
if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_ACCESS_LIKELY)
uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+ if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WRITE_LIKELY)
+ uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
if (mmget_not_zero(ctx->mm)) {
ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
@@ -60,6 +60,7 @@ typedef unsigned int __bitwise uffd_flags_t;
#define UFFD_FLAGS_NONE ((__force uffd_flags_t)0)
#define UFFD_FLAGS_WP ((__force uffd_flags_t)BIT(0))
#define UFFD_FLAGS_ACCESS_LIKELY ((__force uffd_flags_t)BIT(1))
+#define UFFD_FLAGS_WRITE_LIKELY ((__force uffd_flags_t)BIT(2))
extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
@@ -206,7 +206,7 @@ struct uffdio_api {
* write-protection mode is supported on both shmem and hugetlbfs.
*
* UFFD_FEATURE_ACCESS_HINTS indicates that the ioctl operations
- * support the UFFDIO_*_MODE_ACCESS_LIKELY hints.
+ * support the UFFDIO_*_MODE_[ACCESS|WRITE]_LIKELY hints.
*/
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -261,9 +261,13 @@ struct uffdio_copy {
* page is likely to be access in the near future. Providing the hint
* properly can improve performance.
*
+ * UFFDIO_COPY_MODE_WRITE_LIKELY provides a hint to the kernel that the
+ * page is likely to be written in the near future. Providing the hint
+ * properly can improve performance.
*/
#define UFFDIO_COPY_MODE_WP ((__u64)1<<1)
#define UFFDIO_COPY_MODE_ACCESS_LIKELY ((__u64)1<<2)
+#define UFFDIO_COPY_MODE_WRITE_LIKELY ((__u64)1<<3)
__u64 mode;
/*
@@ -277,6 +281,7 @@ struct uffdio_zeropage {
struct uffdio_range range;
#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0)
#define UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY ((__u64)1<<1)
+#define UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY ((__u64)1<<2)
__u64 mode;
/*
@@ -300,6 +305,10 @@ struct uffdio_writeprotect {
* that the page is likely to be access in the near future. Providing
* the hint properly can improve performance.
*
+ * UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY: provides a hint to the kernel
+ * that the page is likely to be written in the near future. Providing
+ * the hint properly can improve performance.
+ *
* NOTE: Write protecting a region (WP=1) is unrelated to page faults,
* therefore DONTWAKE flag is meaningless with WP=1. Removing write
* protection (WP=0) in response to a page fault wakes the faulting
@@ -308,6 +317,7 @@ struct uffdio_writeprotect {
#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0)
#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1)
#define UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY ((__u64)1<<2)
+#define UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY ((__u64)1<<3)
__u64 mode;
};
@@ -315,6 +325,7 @@ struct uffdio_continue {
struct uffdio_range range;
#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
#define UFFDIO_CONTINUE_MODE_ACCESS_LIKELY ((__u64)1<<1)
+#define UFFDIO_CONTINUE_MODE_WRITE_LIKELY ((__u64)1<<2)
__u64 mode;
/*
@@ -5962,6 +5962,9 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
*pagep = NULL;
}
+ /* PTE is no longer dirtied unconditionally; mark the page dirty */
+ SetPageDirty(page);
+
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceding stores to the page contents become visible before
@@ -2404,6 +2404,9 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
VM_BUG_ON(PageSwapBacked(page));
__SetPageLocked(page);
__SetPageSwapBacked(page);
+
+ /* PTE is no longer dirtied unconditionally; mark the page dirty */
+ SetPageDirty(page);
__SetPageUptodate(page);
ret = -EFAULT;
@@ -70,7 +70,6 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
pgoff_t offset, max_off;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
- _dst_pte = pte_mkdirty(_dst_pte);
if (page_in_cache && !vm_shared)
writable = false;
@@ -83,14 +82,19 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
writable = false;
}
- if (writable)
+ if (writable) {
_dst_pte = pte_mkwrite(_dst_pte);
- else
+
+ /* Dirty writable PTEs only; dirty RO PTEs can confuse other code */
+ if (uffd_flags & UFFD_FLAGS_WRITE_LIKELY)
+ _dst_pte = pte_mkdirty(_dst_pte);
+ } else {
/*
* We need this to make sure write bit removed; as mk_pte()
* could return a pte with write bit set.
*/
_dst_pte = pte_wrprotect(_dst_pte);
+ }
if (uffd_flags & UFFD_FLAGS_ACCESS_LIKELY)
_dst_pte = pte_mkyoung(_dst_pte);
@@ -180,6 +184,9 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
*pagep = NULL;
}
+ /* PTE is no longer dirtied unconditionally; mark the page dirty */
+ SetPageDirty(page);
+
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceding stores to the page contents become visible before