diff mbox series

[v1] mm/userfaultfd: propagate uffd-wp bit when PTE-mapping the huge zeropage

Message ID 20230302175423.589164-1-david@redhat.com (mailing list archive)
State New
Headers show
Series [v1] mm/userfaultfd: propagate uffd-wp bit when PTE-mapping the huge zeropage | expand

Commit Message

David Hildenbrand March 2, 2023, 5:54 p.m. UTC
Currently, we'd lose the userfaultfd-wp marker when PTE-mapping a huge
zeropage, resulting in the next write faults in the PMD range
not triggering uffd-wp events.

Various actions (partial MADV_DONTNEED, partial mremap, partial munmap,
partial mprotect) could trigger this. However, most importantly,
un-protecting a single sub-page from the userfaultfd-wp handler when
processing a uffd-wp event will PTE-map the shared huge zeropage and
lose the uffd-wp bit for the remainder of the PMD.

Let's properly propagate the uffd-wp bit to the PMDs.

---
 #define _GNU_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <stdbool.h>
 #include <inttypes.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <errno.h>
 #include <poll.h>
 #include <pthread.h>
 #include <sys/mman.h>
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
 #include <linux/userfaultfd.h>

 static size_t pagesize;
 static int uffd;
 static volatile bool uffd_triggered;

 #define barrier() __asm__ __volatile__("": : :"memory")

 static void uffd_wp_range(char *start, size_t size, bool wp)
 {
 	/* Toggle uffd write-protection on [start, start + size). */
 	struct uffdio_writeprotect wp_args = {
 		.range = {
 			.start = (unsigned long) start,
 			.len = size,
 		},
 		.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
 	};

 	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp_args)) {
 		fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno);
 		exit(1);
 	}
 }

 static void *uffd_thread_fn(void *arg)
 {
 	/* Single handler thread, so a static message buffer is fine. */
 	static struct uffd_msg msg;

 	/* Service uffd-wp faults forever: flag the event, then unprotect. */
 	for (;;) {
 		struct pollfd fds = {
 			.fd = uffd,
 			.events = POLLIN,
 		};

 		if (poll(&fds, 1, -1) == -1) {
 			fprintf(stderr, "poll() failed: %d\n", errno);
 			exit(1);
 		}

 		/* Non-blocking uffd: a short/empty read just means retry. */
 		if (read(uffd, &msg, sizeof(msg)) <= 0)
 			continue;

 		if (msg.event != UFFD_EVENT_PAGEFAULT ||
 		    !(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)) {
 			printf("FAIL: wrong uffd-wp event fired\n");
 			exit(1);
 		}

 		/*
 		 * Record the event first, then un-protect only the faulting
 		 * page so the writer can make progress.
 		 */
 		uffd_triggered = true;
 		uffd_wp_range((char *)(uintptr_t)msg.arg.pagefault.address,
 			      pagesize, false);
 	}
 	return arg;
 }

 /*
  * Create a userfaultfd, enable the write-protect feature, register the
  * given range for wp faults and start the fault-handler thread.
  *
  * Returns 0 on success, a negative error code on failure.
  */
 static int setup_uffd(char *map, size_t size)
 {
 	struct uffdio_api uffdio_api;
 	struct uffdio_register uffdio_register;
 	pthread_t thread;
 	int ret;

 	uffd = syscall(__NR_userfaultfd,
 		       O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
 	if (uffd < 0) {
 		fprintf(stderr, "syscall() failed: %d\n", errno);
 		return -errno;
 	}

 	/* Handshake: request the wp pagefault flag feature. */
 	uffdio_api.api = UFFD_API;
 	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
 	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
 		fprintf(stderr, "UFFDIO_API failed: %d\n", errno);
 		return -errno;
 	}

 	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
 		fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n");
 		return -ENOSYS;
 	}

 	uffdio_register.range.start = (unsigned long) map;
 	uffdio_register.range.len = size;
 	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
 	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
 		fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno);
 		return -errno;
 	}

 	/*
 	 * Without the handler thread the first wp fault would block forever;
 	 * pthread_create() reports errors via its return value, not errno.
 	 */
 	ret = pthread_create(&thread, NULL, uffd_thread_fn, NULL);
 	if (ret) {
 		fprintf(stderr, "pthread_create() failed: %d\n", ret);
 		return -ret;
 	}

 	return 0;
 }

 /*
  * Reproducer: populate a THP-backed range with the shared huge zeropage,
  * uffd-wp protect it, then write each page. The handler un-protects only
  * the faulting page; splitting the huge zeropage must keep the uffd-wp
  * bit on the remaining PTEs, so every page must still fault.
  */
 int main(void)
 {
 	const size_t size = 4 * 1024 * 1024ull;
 	char *map, *cur;

 	pagesize = getpagesize();

 	map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
 	if (map == MAP_FAILED) {
 		fprintf(stderr, "mmap() failed\n");
 		return -errno;
 	}

 	if (madvise(map, size, MADV_HUGEPAGE)) {
 		fprintf(stderr, "MADV_HUGEPAGE failed\n");
 		return -errno;
 	}

 	if (setup_uffd(map, size))
 		return 1;

 	/*
 	 * Read the whole range, populating zeropages. This is the
 	 * precondition of the whole test (we want the *huge* zeropage
 	 * mapped), so fail loudly instead of silently testing nothing.
 	 */
 	if (madvise(map, size, MADV_POPULATE_READ)) {
 		fprintf(stderr, "MADV_POPULATE_READ failed\n");
 		return -errno;
 	}

 	/* Write-protect the whole range. */
 	uffd_wp_range(map, size, true);

 	/* Make sure uffd-wp triggers on each page. */
 	for (cur = map; cur < map + size; cur += pagesize) {
 		uffd_triggered = false;

 		/* Keep the flag write/read from being reordered around the fault. */
 		barrier();
 		/* Trigger a write fault. */
 		*cur = 1;
 		barrier();

 		if (!uffd_triggered) {
 			printf("FAIL: uffd-wp did not trigger\n");
 			return 1;
 		}
 	}

 	printf("PASS: uffd-wp triggered\n");
 	return 0;
 }
---

Fixes: e06f1e1dd499 ("userfaultfd: wp: enabled write protection in userfaultfd API")
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Shaohua Li <shli@fb.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/huge_memory.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

Comments

Peter Xu March 2, 2023, 10:29 p.m. UTC | #1
On Thu, Mar 02, 2023 at 06:54:23PM +0100, David Hildenbrand wrote:
> Currently, we'd lose the userfaultfd-wp marker when PTE-mapping a huge
> zeropage, resulting in the next write faults in the PMD range
> not triggering uffd-wp events.
> 
> Various actions (partial MADV_DONTNEED, partial mremap, partial munmap,
> partial mprotect) could trigger this. However, most importantly,
> un-protecting a single sub-page from the userfaultfd-wp handler when
> processing a uffd-wp event will PTE-map the shared huge zeropage and
> lose the uffd-wp bit for the remainder of the PMD.
> 
> Let's properly propagate the uffd-wp bit to the PMDs.

Ouch.. I thought this was reported once, probably it fell through the
cracks.

Acked-by: Peter Xu <peterx@redhat.com>

Thanks,
Andrew Morton March 3, 2023, 1:57 a.m. UTC | #2
On Thu,  2 Mar 2023 18:54:23 +0100 David Hildenbrand <david@redhat.com> wrote:

> Currently, we'd lose the userfaultfd-wp marker when PTE-mapping a huge
> zeropage, resulting in the next write faults in the PMD range
> not triggering uffd-wp events.
> 
> Various actions (partial MADV_DONTNEED, partial mremap, partial munmap,
> partial mprotect) could trigger this. However, most importantly,
> un-protecting a single sub-page from the userfaultfd-wp handler when
> processing a uffd-wp event will PTE-map the shared huge zeropage and
> lose the uffd-wp bit for the remainder of the PMD.
> 
> Let's properly propagate the uffd-wp bit to the PMDs.
> 
> ...
>
> Fixes: e06f1e1dd499 ("userfaultfd: wp: enabled write protection in userfaultfd API")
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Jerome Glisse <jglisse@redhat.com>
> Cc: Shaohua Li <shli@fb.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>

Do you agree that a -stable backport is appropriate?
David Hildenbrand March 3, 2023, 9:12 a.m. UTC | #3
On 02.03.23 23:29, Peter Xu wrote:
> On Thu, Mar 02, 2023 at 06:54:23PM +0100, David Hildenbrand wrote:
>> Currently, we'd lose the userfaultfd-wp marker when PTE-mapping a huge
>> zeropage, resulting in the next write faults in the PMD range
>> not triggering uffd-wp events.
>>
>> Various actions (partial MADV_DONTNEED, partial mremap, partial munmap,
>> partial mprotect) could trigger this. However, most importantly,
>> un-protecting a single sub-page from the userfaultfd-wp handler when
>> processing a uffd-wp event will PTE-map the shared huge zeropage and
>> lose the uffd-wp bit for the remainder of the PMD.
>>
>> Let's properly propagate the uffd-wp bit to the PMDs.
> 
> Ouch.. I thought this was reported once, probably it fell through the
> cracks.

Yes, I reported it a while ago, but our understanding back then was that 
primarily MADV_DONTNEED would trigger it (which my reproducer back then 
did), and e.g., QEMU would make sure to not have concurrent 
MADV_DONTNEED while doing background snapshots.

I realized only yesterday when retesting my patch that a simple 
unprotect is already sufficient to mess it up.

> 
> Acked-by: Peter Xu <peterx@redhat.com>

Thanks!
David Hildenbrand March 3, 2023, 9:12 a.m. UTC | #4
>> Fixes: e06f1e1dd499 ("userfaultfd: wp: enabled write protection in userfaultfd API")
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>> Cc: Peter Xu <peterx@redhat.com>
>> Cc: Jerome Glisse <jglisse@redhat.com>
>> Cc: Shaohua Li <shli@fb.com>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
> 
> Do you agree that a -stable backport is appropriate?

Yes, actually I wanted to include the tag but somehow I dropped it. Thanks!
Peter Xu March 3, 2023, 2:32 p.m. UTC | #5
On Fri, Mar 03, 2023 at 10:12:26AM +0100, David Hildenbrand wrote:
> On 02.03.23 23:29, Peter Xu wrote:
> > On Thu, Mar 02, 2023 at 06:54:23PM +0100, David Hildenbrand wrote:
> > > Currently, we'd lose the userfaultfd-wp marker when PTE-mapping a huge
> > > zeropage, resulting in the next write faults in the PMD range
> > > not triggering uffd-wp events.
> > > 
> > > Various actions (partial MADV_DONTNEED, partial mremap, partial munmap,
> > > partial mprotect) could trigger this. However, most importantly,
> > > un-protecting a single sub-page from the userfaultfd-wp handler when
> > > processing a uffd-wp event will PTE-map the shared huge zeropage and
> > > lose the uffd-wp bit for the remainder of the PMD.
> > > 
> > > Let's properly propagate the uffd-wp bit to the PMDs.
> > 
> > Ouch.. I thought this was reported once, probably it fell through the
> > cracks.
> 
> Yes, I reported it a while ago, but our understanding back then was that
> primarily MADV_DONTNEED would trigger it (which my reproducer back then
> did), and e.g., QEMU would make sure to not have concurrent MADV_DONTNEED
> while doing background snapshots.
> 
> I realized only yesterday when retesting my patch that that a simple
> unprotect is already sufficient to mess it up.

IIRC the rationale was that it was fine for the original design because we didn't
plan to protect none ptes, so missing zero ptes would be the same as none,
which was not a problem.

However that'll just make apps like QEMU stop working when they want to
trap none ptes by using a round of pre-read, then the workaround will not
work either.  Copying stable will make the workaround also work for stables.
diff mbox series

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4fc43859e59a..032fb0ef9cd1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2037,7 +2037,7 @@  static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgtable_t pgtable;
-	pmd_t _pmd;
+	pmd_t _pmd, old_pmd;
 	int i;
 
 	/*
@@ -2048,7 +2048,7 @@  static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	 *
 	 * See Documentation/mm/mmu_notifier.rst
 	 */
-	pmdp_huge_clear_flush(vma, haddr, pmd);
+	old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
 
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
@@ -2057,6 +2057,8 @@  static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 		pte_t *pte, entry;
 		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
 		entry = pte_mkspecial(entry);
+		if (pmd_uffd_wp(old_pmd))
+			entry = pte_mkuffd_wp(entry);
 		pte = pte_offset_map(&_pmd, haddr);
 		VM_BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, haddr, pte, entry);