Message ID | 20230412164241.328259-1-peterx@redhat.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | selftests/mm: Split / Refactor userfault test | expand |
On Wed, Apr 12, 2023 at 9:42 AM Peter Xu <peterx@redhat.com> wrote: > > Move common utility functions into uffd-common.[ch] files from the original > userfaultfd.c. This prepares for a split of userfaultfd.c into two tests: > one to only cover the old but powerful stress test, the other one covers > all the functional tests. > > This movement is kind of a brute-force effort for now, with light touch-ups > but nothing should really change. There's chances to optimize more, but > let's leave that for later. > > Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org> > Signed-off-by: Peter Xu <peterx@redhat.com> Reviewed-by: Axel Rasmussen <axelrasmussen@google.com> > --- > tools/testing/selftests/mm/Makefile | 2 + > tools/testing/selftests/mm/uffd-common.c | 611 ++++++++++++++++++++ > tools/testing/selftests/mm/uffd-common.h | 117 ++++ > tools/testing/selftests/mm/userfaultfd.c | 694 +---------------------- > 4 files changed, 731 insertions(+), 693 deletions(-) > create mode 100644 tools/testing/selftests/mm/uffd-common.c > create mode 100644 tools/testing/selftests/mm/uffd-common.h > > diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile > index 5f7626550e5f..36467c15ca00 100644 > --- a/tools/testing/selftests/mm/Makefile > +++ b/tools/testing/selftests/mm/Makefile > @@ -108,6 +108,8 @@ include ../lib.mk > > $(TEST_GEN_PROGS): vm_util.c > > +$(OUTPUT)/userfaultfd: uffd-common.c > + > ifeq ($(MACHINE),x86_64) > BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) > BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) > diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c > new file mode 100644 > index 000000000000..c57757c2a36f > --- /dev/null > +++ b/tools/testing/selftests/mm/uffd-common.c > @@ -0,0 +1,611 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Userfaultfd tests util functions > + * > + * Copyright (C) 2015-2023 Red Hat, Inc. > + */ > + > +#include "uffd-common.h" > + > +#define BASE_PMD_ADDR ((void *)(1UL << 30)) > + > +volatile bool test_uffdio_copy_eexist = true; > +unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; > +char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; > +int mem_fd, uffd = -1, uffd_flags, finished, *pipefd, test_type; > +bool map_shared, test_collapse, test_dev_userfaultfd; > +bool test_uffdio_wp = true, test_uffdio_minor = false; > +unsigned long long *count_verify; > +uffd_test_ops_t *uffd_test_ops; > + > +static void anon_release_pages(char *rel_area) > +{ > + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) > + err("madvise(MADV_DONTNEED) failed"); > +} > + > +static void anon_allocate_area(void **alloc_area, bool is_src) > +{ > + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, > + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); > +} > + > +static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) > +{ > +} > + > +static void hugetlb_release_pages(char *rel_area) > +{ > + if (!map_shared) { > + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) > + err("madvise(MADV_DONTNEED) failed"); > + } else { > + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) > + err("madvise(MADV_REMOVE) failed"); > + } > +} > + > +static void hugetlb_allocate_area(void **alloc_area, bool is_src) > +{ > + off_t size = nr_pages * page_size; > + off_t offset = is_src ? 0 : size; > + void *area_alias = NULL; > + char **alloc_area_alias; > + > + *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, > + (map_shared ? MAP_SHARED : MAP_PRIVATE) | > + (is_src ? 0 : MAP_NORESERVE), > + mem_fd, offset); > + if (*alloc_area == MAP_FAILED) > + err("mmap of hugetlbfs file failed"); > + > + if (map_shared) { > + area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, > + MAP_SHARED, mem_fd, offset); > + if (area_alias == MAP_FAILED) > + err("mmap of hugetlb file alias failed"); > + } > + > + if (is_src) { > + alloc_area_alias = &area_src_alias; > + } else { > + alloc_area_alias = &area_dst_alias; > + } > + if (area_alias) > + *alloc_area_alias = area_alias; > +} > + > +static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) > +{ > + if (!map_shared) > + return; > + > + *start = (unsigned long) area_dst_alias + offset; > +} > + > +static void shmem_release_pages(char *rel_area) > +{ > + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) > + err("madvise(MADV_REMOVE) failed"); > +} > + > +static void shmem_allocate_area(void **alloc_area, bool is_src) > +{ > + void *area_alias = NULL; > + size_t bytes = nr_pages * page_size; > + unsigned long offset = is_src ? 0 : bytes; > + char *p = NULL, *p_alias = NULL; > + > + if (test_collapse) { > + p = BASE_PMD_ADDR; > + if (!is_src) > + /* src map + alias + interleaved hpages */ > + p += 2 * (bytes + hpage_size); > + p_alias = p; > + p_alias += bytes; > + p_alias += hpage_size; /* Prevent src/dst VMA merge */ > + } > + > + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, > + mem_fd, offset); > + if (*alloc_area == MAP_FAILED) > + err("mmap of memfd failed"); > + if (test_collapse && *alloc_area != p) > + err("mmap of memfd failed at %p", p); > + > + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, > + mem_fd, offset); > + if (area_alias == MAP_FAILED) > + err("mmap of memfd alias failed"); > + if (test_collapse && area_alias != p_alias) > + err("mmap of anonymous memory failed at %p", p_alias); > + > + if (is_src) > + area_src_alias = area_alias; > + else > + area_dst_alias = area_alias; > +} > + > +static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) > +{ > + *start = (unsigned long)area_dst_alias + offset; > +} > + > +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) > +{ > + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) > + err("Did not find expected %d number of hugepages", > + expect_nr_hpages); > +} > + > +struct uffd_test_ops anon_uffd_test_ops = { > + .allocate_area = anon_allocate_area, > + .release_pages = anon_release_pages, > + .alias_mapping = noop_alias_mapping, > + .check_pmd_mapping = NULL, > +}; > + > +struct uffd_test_ops shmem_uffd_test_ops = { > + .allocate_area = shmem_allocate_area, > + .release_pages = shmem_release_pages, > + .alias_mapping = shmem_alias_mapping, > + .check_pmd_mapping = shmem_check_pmd_mapping, > +}; > + > +struct uffd_test_ops hugetlb_uffd_test_ops = { > + .allocate_area = hugetlb_allocate_area, > + .release_pages = hugetlb_release_pages, > + .alias_mapping = hugetlb_alias_mapping, > + .check_pmd_mapping = NULL, > +}; > + > +void uffd_stats_report(struct uffd_stats *stats, int n_cpus) > +{ > + int i; > + unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; > + > + for (i = 0; i < n_cpus; i++) { > + miss_total += stats[i].missing_faults; > + wp_total += stats[i].wp_faults; > + minor_total += stats[i].minor_faults; > + } > + > + printf("userfaults: "); > + if (miss_total) { > + printf("%llu missing (", miss_total); > + for (i = 0; i < n_cpus; i++) > + printf("%lu+", stats[i].missing_faults); > + printf("\b) "); > + } > + if (wp_total) { > + printf("%llu wp (", wp_total); > + for (i = 0; i < n_cpus; i++) > + printf("%lu+", stats[i].wp_faults); > + printf("\b) "); > + } > + if (minor_total) { > + printf("%llu minor (", minor_total); > + for (i = 0; i < n_cpus; i++) > + printf("%lu+", stats[i].minor_faults); > + printf("\b)"); > + } > + printf("\n"); > +} > + > +static int __userfaultfd_open_dev(void) > +{ > + int fd, _uffd; > + > + fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); > + if (fd < 0) > + errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); > + > + _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); > + if (_uffd < 0) > + errexit(errno == ENOTTY ? KSFT_SKIP : 1, > + "creating userfaultfd failed"); > + close(fd); > + return _uffd; > +} > + > +void userfaultfd_open(uint64_t *features) > +{ > + struct uffdio_api uffdio_api; > + > + if (test_dev_userfaultfd) > + uffd = __userfaultfd_open_dev(); > + else { > + uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); > + if (uffd < 0) > + errexit(errno == ENOSYS ? KSFT_SKIP : 1, > + "creating userfaultfd failed"); > + } > + uffd_flags = fcntl(uffd, F_GETFD, NULL); > + > + uffdio_api.api = UFFD_API; > + uffdio_api.features = *features; > + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) > + err("UFFDIO_API failed.\nPlease make sure to " > + "run with either root or ptrace capability."); > + if (uffdio_api.api != UFFD_API) > + err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); > + > + *features = uffdio_api.features; > +} > + > +static inline void munmap_area(void **area) > +{ > + if (*area) > + if (munmap(*area, nr_pages * page_size)) > + err("munmap"); > + > + *area = NULL; > +} > + > +static void uffd_test_ctx_clear(void) > +{ > + size_t i; > + > + if (pipefd) { > + for (i = 0; i < nr_cpus * 2; ++i) { > + if (close(pipefd[i])) > + err("close pipefd"); > + } > + free(pipefd); > + pipefd = NULL; > + } > + > + if (count_verify) { > + free(count_verify); > + count_verify = NULL; > + } > + > + if (uffd != -1) { > + if (close(uffd)) > + err("close uffd"); > + uffd = -1; > + } > + > + munmap_area((void **)&area_src); > + munmap_area((void **)&area_src_alias); > + munmap_area((void **)&area_dst); > + munmap_area((void **)&area_dst_alias); > + munmap_area((void **)&area_remap); > +} > + > +void uffd_test_ctx_init(uint64_t features) > +{ > + unsigned long nr, cpu; > + > + uffd_test_ctx_clear(); > + > + uffd_test_ops->allocate_area((void **)&area_src, true); > + uffd_test_ops->allocate_area((void **)&area_dst, false); > + > + userfaultfd_open(&features); > + > + count_verify = malloc(nr_pages * sizeof(unsigned long long)); > + if (!count_verify) > + err("count_verify"); > + > + for (nr = 0; nr < nr_pages; nr++) { > + *area_mutex(area_src, nr) = > + (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; > + count_verify[nr] = *area_count(area_src, nr) = 1; > + /* > + * In the transition between 255 to 256, powerpc will > + * read out of order in my_bcmp and see both bytes as > + * zero, so leave a placeholder below always non-zero > + * after the count, to avoid my_bcmp to trigger false > + * positives. > + */ > + *(area_count(area_src, nr) + 1) = 1; > + } > + > + /* > + * After initialization of area_src, we must explicitly release pages > + * for area_dst to make sure it's fully empty. Otherwise we could have > + * some area_dst pages be errornously initialized with zero pages, > + * hence we could hit memory corruption later in the test. > + * > + * One example is when THP is globally enabled, above allocate_area() > + * calls could have the two areas merged into a single VMA (as they > + * will have the same VMA flags so they're mergeable). When we > + * initialize the area_src above, it's possible that some part of > + * area_dst could have been faulted in via one huge THP that will be > + * shared between area_src and area_dst. It could cause some of the > + * area_dst won't be trapped by missing userfaults. > + * > + * This release_pages() will guarantee even if that happened, we'll > + * proactively split the thp and drop any accidentally initialized > + * pages within area_dst. > + */ > + uffd_test_ops->release_pages(area_dst); > + > + pipefd = malloc(sizeof(int) * nr_cpus * 2); > + if (!pipefd) > + err("pipefd"); > + for (cpu = 0; cpu < nr_cpus; cpu++) > + if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) > + err("pipe"); > +} > + > +uint64_t get_expected_ioctls(uint64_t mode) > +{ > + uint64_t ioctls = UFFD_API_RANGE_IOCTLS; > + > + if (test_type == TEST_HUGETLB) > + ioctls &= ~(1 << _UFFDIO_ZEROPAGE); > + > + if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) > + ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); > + > + if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) > + ioctls &= ~(1 << _UFFDIO_CONTINUE); > + > + return ioctls; > +} > + > +void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) > +{ > + uint64_t expected = get_expected_ioctls(mode); > + uint64_t actual = ioctls & expected; > + > + if (actual != expected) { > + err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, > + expected, actual); > + } > +} > + > +void wp_range(int ufd, __u64 start, __u64 len, bool wp) > +{ > + struct uffdio_writeprotect prms; > + > + /* Write protection page faults */ > + prms.range.start = start; > + prms.range.len = len; > + /* Undo write-protect, do wakeup after that */ > + prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; > + > + if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) > + err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); > +} > + > +static void continue_range(int ufd, __u64 start, __u64 len) > +{ > + struct uffdio_continue req; > + int ret; > + > + req.range.start = start; > + req.range.len = len; > + req.mode = 0; > + if (test_uffdio_wp) > + req.mode |= UFFDIO_CONTINUE_MODE_WP; > + > + if (ioctl(ufd, UFFDIO_CONTINUE, &req)) > + err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, > + (uint64_t)start); > + > + /* > + * Error handling within the kernel for continue is subtly different > + * from copy or zeropage, so it may be a source of bugs. Trigger an > + * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. > + */ > + req.mapped = 0; > + ret = ioctl(ufd, UFFDIO_CONTINUE, &req); > + if (ret >= 0 || req.mapped != -EEXIST) > + err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, > + ret, (int64_t) req.mapped); > +} > + > +int uffd_read_msg(int ufd, struct uffd_msg *msg) > +{ > + int ret = read(uffd, msg, sizeof(*msg)); > + > + if (ret != sizeof(*msg)) { > + if (ret < 0) { > + if (errno == EAGAIN || errno == EINTR) > + return 1; > + err("blocking read error"); > + } else { > + err("short read"); > + } > + } > + > + return 0; > +} > + > +void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats) > +{ > + unsigned long offset; > + > + if (msg->event != UFFD_EVENT_PAGEFAULT) > + err("unexpected msg event %u", msg->event); > + > + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { > + /* Write protect page faults */ > + wp_range(uffd, msg->arg.pagefault.address, page_size, false); > + stats->wp_faults++; > + } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { > + uint8_t *area; > + int b; > + > + /* > + * Minor page faults > + * > + * To prove we can modify the original range for testing > + * purposes, we're going to bit flip this range before > + * continuing. > + * > + * Note that this requires all minor page fault tests operate on > + * area_dst (non-UFFD-registered) and area_dst_alias > + * (UFFD-registered). > + */ > + > + area = (uint8_t *)(area_dst + > + ((char *)msg->arg.pagefault.address - > + area_dst_alias)); > + for (b = 0; b < page_size; ++b) > + area[b] = ~area[b]; > + continue_range(uffd, msg->arg.pagefault.address, page_size); > + stats->minor_faults++; > + } else { > + /* > + * Missing page faults. > + * > + * Here we force a write check for each of the missing mode > + * faults. It's guaranteed because the only threads that > + * will trigger uffd faults are the locking threads, and > + * their first instruction to touch the missing page will > + * always be pthread_mutex_lock(). > + * > + * Note that here we relied on an NPTL glibc impl detail to > + * always read the lock type at the entry of the lock op > + * (pthread_mutex_t.__data.__type, offset 0x10) before > + * doing any locking operations to guarantee that. It's > + * actually not good to rely on this impl detail because > + * logically a pthread-compatible lib can implement the > + * locks without types and we can fail when linking with > + * them. However since we used to find bugs with this > + * strict check we still keep it around. Hopefully this > + * could be a good hint when it fails again. If one day > + * it'll break on some other impl of glibc we'll revisit. > + */ > + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) > + err("unexpected write fault"); > + > + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; > + offset &= ~(page_size-1); > + > + if (copy_page(uffd, offset)) > + stats->missing_faults++; > + } > +} > + > +void *uffd_poll_thread(void *arg) > +{ > + struct uffd_stats *stats = (struct uffd_stats *)arg; > + unsigned long cpu = stats->cpu; > + struct pollfd pollfd[2]; > + struct uffd_msg msg; > + struct uffdio_register uffd_reg; > + int ret; > + char tmp_chr; > + > + pollfd[0].fd = uffd; > + pollfd[0].events = POLLIN; > + pollfd[1].fd = pipefd[cpu*2]; > + pollfd[1].events = POLLIN; > + > + for (;;) { > + ret = poll(pollfd, 2, -1); > + if (ret <= 0) { > + if (errno == EINTR || errno == EAGAIN) > + continue; > + err("poll error: %d", ret); > + } > + if (pollfd[1].revents) { > + if (!(pollfd[1].revents & POLLIN)) > + err("pollfd[1].revents %d", pollfd[1].revents); > + if (read(pollfd[1].fd, &tmp_chr, 1) != 1) > + err("read pipefd error"); > + break; > + } > + if (!(pollfd[0].revents & POLLIN)) > + err("pollfd[0].revents %d", pollfd[0].revents); > + if (uffd_read_msg(uffd, &msg)) > + continue; > + switch (msg.event) { > + default: > + err("unexpected msg event %u\n", msg.event); > + break; > + case UFFD_EVENT_PAGEFAULT: > + uffd_handle_page_fault(&msg, stats); > + break; > + case UFFD_EVENT_FORK: > + close(uffd); > + uffd = msg.arg.fork.ufd; > + pollfd[0].fd = uffd; > + break; > + case UFFD_EVENT_REMOVE: > + uffd_reg.range.start = msg.arg.remove.start; > + uffd_reg.range.len = msg.arg.remove.end - > + msg.arg.remove.start; > + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) > + err("remove failure"); > + break; > + case UFFD_EVENT_REMAP: > + area_remap = area_dst; /* save for later unmap */ > + area_dst = (char *)(unsigned long)msg.arg.remap.to; > + break; > + } > + } > + > + return NULL; > +} > + > +static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, > + unsigned long offset) > +{ > + uffd_test_ops->alias_mapping(&uffdio_copy->dst, > + uffdio_copy->len, > + offset); > + if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { > + /* real retval in ufdio_copy.copy */ > + if (uffdio_copy->copy != -EEXIST) > + err("UFFDIO_COPY retry error: %"PRId64, > + (int64_t)uffdio_copy->copy); > + } else { > + err("UFFDIO_COPY retry unexpected: %"PRId64, > + (int64_t)uffdio_copy->copy); > + } > +} > + > +static void wake_range(int ufd, unsigned long addr, unsigned long len) > +{ > + struct uffdio_range uffdio_wake; > + > + uffdio_wake.start = addr; > + uffdio_wake.len = len; > + > + if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) > + fprintf(stderr, "error waking %lu\n", > + addr), exit(1); > +} > + > +int __copy_page(int ufd, unsigned long offset, bool retry) > +{ > + struct uffdio_copy uffdio_copy; > + > + if (offset >= nr_pages * page_size) > + err("unexpected offset %lu\n", offset); > + uffdio_copy.dst = (unsigned long) area_dst + offset; > + uffdio_copy.src = (unsigned long) area_src + offset; > + uffdio_copy.len = page_size; > + if (test_uffdio_wp) > + uffdio_copy.mode = UFFDIO_COPY_MODE_WP; > + else > + uffdio_copy.mode = 0; > + uffdio_copy.copy = 0; > + if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { > + /* real retval in ufdio_copy.copy */ > + if (uffdio_copy.copy != -EEXIST) > + err("UFFDIO_COPY error: %"PRId64, > + (int64_t)uffdio_copy.copy); > + wake_range(ufd, uffdio_copy.dst, page_size); > + } else if (uffdio_copy.copy != page_size) { > + err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); > + } else { > + if (test_uffdio_copy_eexist && retry) { > + test_uffdio_copy_eexist = false; > + retry_copy_page(ufd, &uffdio_copy, offset); > + } > + return 1; > + } > + return 0; > +} > + > +int copy_page(int ufd, unsigned long offset) > +{ > + return __copy_page(ufd, offset, false); > +} > diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h > new file mode 100644 > index 000000000000..d9430cfdcb19 > --- /dev/null > +++ b/tools/testing/selftests/mm/uffd-common.h > @@ -0,0 +1,117 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Userfaultfd tests common header > + * > + * Copyright (C) 2015-2023 Red Hat, Inc. > + */ > +#ifndef __UFFD_COMMON_H__ > +#define __UFFD_COMMON_H__ > + > +#define _GNU_SOURCE > +#include <stdio.h> > +#include <errno.h> > +#include <unistd.h> > +#include <stdlib.h> > +#include <sys/types.h> > +#include <sys/stat.h> > +#include <fcntl.h> > +#include <time.h> > +#include <signal.h> > +#include <poll.h> > +#include <string.h> > +#include <linux/mman.h> > +#include <sys/mman.h> > +#include <sys/syscall.h> > +#include <sys/ioctl.h> > +#include <sys/wait.h> > +#include <pthread.h> > +#include <linux/userfaultfd.h> > +#include <setjmp.h> > +#include <stdbool.h> > +#include <assert.h> > +#include <inttypes.h> > +#include <stdint.h> > +#include <sys/random.h> > + > +#include "../kselftest.h" > +#include "vm_util.h" > + > +#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) > + > +#define _err(fmt, ...) \ > + do { \ > + int ret = errno; \ > + fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ > + fprintf(stderr, " (errno=%d, @%s:%d)\n", \ > + ret, __FILE__, __LINE__); \ > + } while (0) > + > +#define errexit(exitcode, fmt, ...) \ > + do { \ > + _err(fmt, ##__VA_ARGS__); \ > + exit(exitcode); \ > + } while (0) > + > +#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) > + > +/* pthread_mutex_t starts at page offset 0 */ > +#define area_mutex(___area, ___nr) \ > + ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) > +/* > + * count is placed in the page after pthread_mutex_t naturally aligned > + * to avoid non alignment faults on non-x86 archs. > + */ > +#define area_count(___area, ___nr) \ > + ((volatile unsigned long long *) ((unsigned long) \ > + ((___area) + (___nr)*page_size + \ > + sizeof(pthread_mutex_t) + \ > + sizeof(unsigned long long) - 1) & \ > + ~(unsigned long)(sizeof(unsigned long long) \ > + - 1))) > + > +/* Userfaultfd test statistics */ > +struct uffd_stats { > + int cpu; > + unsigned long missing_faults; > + unsigned long wp_faults; > + unsigned long minor_faults; > +}; > + > +struct uffd_test_ops { > + void (*allocate_area)(void **alloc_area, bool is_src); > + void (*release_pages)(char *rel_area); > + void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); > + void (*check_pmd_mapping)(void *p, int expect_nr_hpages); > +}; > +typedef struct uffd_test_ops uffd_test_ops_t; > + > +extern unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; > +extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; > +extern int mem_fd, uffd, uffd_flags, finished, *pipefd, test_type; > +extern bool map_shared, test_collapse, test_dev_userfaultfd; > +extern bool test_uffdio_wp, test_uffdio_minor; > +extern unsigned long long *count_verify; > +extern volatile bool test_uffdio_copy_eexist; > + > +extern uffd_test_ops_t anon_uffd_test_ops; > +extern uffd_test_ops_t shmem_uffd_test_ops; > +extern uffd_test_ops_t hugetlb_uffd_test_ops; > +extern uffd_test_ops_t *uffd_test_ops; > + > +void uffd_stats_report(struct uffd_stats *stats, int n_cpus); > +void uffd_test_ctx_init(uint64_t features); > +void userfaultfd_open(uint64_t *features); > +uint64_t get_expected_ioctls(uint64_t mode); > +void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls); > +int uffd_read_msg(int ufd, struct uffd_msg *msg); > +void wp_range(int ufd, __u64 start, __u64 len, bool wp); > +void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats); > +int __copy_page(int ufd, unsigned long offset, bool retry); > +int copy_page(int ufd, unsigned long offset); > +void *uffd_poll_thread(void *arg); > + > +#define TEST_ANON 1 > +#define TEST_HUGETLB 2 > +#define TEST_SHMEM 3 > + > +#endif > diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c > index 3487ec0bfcc8..c68a9aeefc41 100644 > --- a/tools/testing/selftests/mm/userfaultfd.c > +++ b/tools/testing/selftests/mm/userfaultfd.c > @@ -34,96 +34,20 @@ > * transfer (UFFDIO_COPY). > */ > > -#define _GNU_SOURCE > -#include <stdio.h> > -#include <errno.h> > -#include <unistd.h> > -#include <stdlib.h> > -#include <sys/types.h> > -#include <sys/stat.h> > -#include <fcntl.h> > -#include <time.h> > -#include <signal.h> > -#include <poll.h> > -#include <string.h> > -#include <linux/mman.h> > -#include <sys/mman.h> > -#include <sys/syscall.h> > -#include <sys/ioctl.h> > -#include <sys/wait.h> > -#include <pthread.h> > -#include <linux/userfaultfd.h> > -#include <setjmp.h> > -#include <stdbool.h> > -#include <assert.h> > -#include <inttypes.h> > -#include <stdint.h> > -#include <sys/random.h> > - > -#include "../kselftest.h" > -#include "vm_util.h" > +#include "uffd-common.h" > > #ifdef __NR_userfaultfd > > -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; > - > #define BOUNCE_RANDOM (1<<0) > #define BOUNCE_RACINGFAULTS (1<<1) > #define BOUNCE_VERIFY (1<<2) > #define BOUNCE_POLL (1<<3) > static int bounces; > > -#define TEST_ANON 1 > -#define TEST_HUGETLB 2 > -#define TEST_SHMEM 3 > -static int test_type; > - > -#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) > - > -#define BASE_PMD_ADDR ((void *)(1UL << 30)) > - > -/* test using /dev/userfaultfd, instead of userfaultfd(2) */ > -static bool test_dev_userfaultfd; > - > /* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ > #define ALARM_INTERVAL_SECS 10 > -static volatile bool test_uffdio_copy_eexist = true; > -/* Whether to test uffd write-protection */ > -static bool test_uffdio_wp = true; > -/* Whether to test uffd minor faults */ > -static bool test_uffdio_minor = false; > -static bool map_shared; > -static int mem_fd; > -static unsigned long long *count_verify; > -static int uffd = -1; > -static int uffd_flags, finished, *pipefd; > -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; > static char *zeropage; > pthread_attr_t attr; > -static bool test_collapse; > - > -/* Userfaultfd test statistics */ > -struct uffd_stats { > - int cpu; > - unsigned long missing_faults; > - unsigned long wp_faults; > - unsigned long minor_faults; > -}; > - > -/* pthread_mutex_t starts at page offset 0 */ > -#define area_mutex(___area, ___nr) \ > - ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) > -/* > - * count is placed in the page after pthread_mutex_t naturally aligned > - * to avoid non alignment faults on non-x86 archs. > - */ > -#define area_count(___area, ___nr) \ > - ((volatile unsigned long long *) ((unsigned long) \ > - ((___area) + (___nr)*page_size + \ > - sizeof(pthread_mutex_t) + \ > - sizeof(unsigned long long) - 1) & \ > - ~(unsigned long)(sizeof(unsigned long long) \ > - - 1))) > > #define swap(a, b) \ > do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) > @@ -166,22 +90,6 @@ static void usage(void) > exit(1); > } > > -#define _err(fmt, ...) \ > - do { \ > - int ret = errno; \ > - fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ > - fprintf(stderr, " (errno=%d, line=%d)\n", \ > - ret, __LINE__); \ > - } while (0) > - > -#define errexit(exitcode, fmt, ...) \ > - do { \ > - _err(fmt, ##__VA_ARGS__); \ > - exit(exitcode); \ > - } while (0) > - > -#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) > - > static void uffd_stats_reset(struct uffd_stats *uffd_stats, > unsigned long n_cpus) > { > @@ -195,189 +103,6 @@ static void uffd_stats_reset(struct uffd_stats *uffd_stats, > } > } > > -static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) > -{ > - int i; > - unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; > - > - for (i = 0; i < n_cpus; i++) { > - miss_total += stats[i].missing_faults; > - wp_total += stats[i].wp_faults; > - minor_total += stats[i].minor_faults; > - } > - > - printf("userfaults: "); > - if (miss_total) { > - printf("%llu missing (", miss_total); > - for (i = 0; i < n_cpus; i++) > - printf("%lu+", stats[i].missing_faults); > - printf("\b) "); > - } > - if (wp_total) { > - printf("%llu wp (", wp_total); > - for (i = 0; i < n_cpus; i++) > - printf("%lu+", stats[i].wp_faults); > - printf("\b) "); > - } > - if (minor_total) { > - printf("%llu minor (", minor_total); > - for (i = 0; i < n_cpus; i++) > - printf("%lu+", stats[i].minor_faults); > - printf("\b)"); > - } > - printf("\n"); > -} > - > -static void anon_release_pages(char *rel_area) > -{ > - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) > - err("madvise(MADV_DONTNEED) failed"); > -} > - > -static void anon_allocate_area(void **alloc_area, bool is_src) > -{ > - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, > - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); > -} > - > -static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) > -{ > -} > - > -static void hugetlb_release_pages(char *rel_area) > -{ > - if (!map_shared) { > - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) > - err("madvise(MADV_DONTNEED) failed"); > - } else { > - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) > - err("madvise(MADV_REMOVE) failed"); > - } > -} > - > -static void hugetlb_allocate_area(void **alloc_area, bool is_src) > -{ > - off_t size = nr_pages * page_size; > - off_t offset = is_src ? 0 : size; > - void *area_alias = NULL; > - char **alloc_area_alias; > - > - *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, > - (map_shared ? MAP_SHARED : MAP_PRIVATE) | > - (is_src ? 0 : MAP_NORESERVE), > - mem_fd, offset); > - if (*alloc_area == MAP_FAILED) > - err("mmap of hugetlbfs file failed"); > - > - if (map_shared) { > - area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, > - MAP_SHARED, mem_fd, offset); > - if (area_alias == MAP_FAILED) > - err("mmap of hugetlb file alias failed"); > - } > - > - if (is_src) { > - alloc_area_alias = &area_src_alias; > - } else { > - alloc_area_alias = &area_dst_alias; > - } > - if (area_alias) > - *alloc_area_alias = area_alias; > -} > - > -static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) > -{ > - if (!map_shared) > - return; > - > - *start = (unsigned long) area_dst_alias + offset; > -} > - > -static void shmem_release_pages(char *rel_area) > -{ > - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) > - err("madvise(MADV_REMOVE) failed"); > -} > - > -static void shmem_allocate_area(void **alloc_area, bool is_src) > -{ > - void *area_alias = NULL; > - size_t bytes = nr_pages * page_size; > - unsigned long offset = is_src ? 0 : bytes; > - char *p = NULL, *p_alias = NULL; > - > - if (test_collapse) { > - p = BASE_PMD_ADDR; > - if (!is_src) > - /* src map + alias + interleaved hpages */ > - p += 2 * (bytes + hpage_size); > - p_alias = p; > - p_alias += bytes; > - p_alias += hpage_size; /* Prevent src/dst VMA merge */ > - } > - > - *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, > - mem_fd, offset); > - if (*alloc_area == MAP_FAILED) > - err("mmap of memfd failed"); > - if (test_collapse && *alloc_area != p) > - err("mmap of memfd failed at %p", p); > - > - area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, > - mem_fd, offset); > - if (area_alias == MAP_FAILED) > - err("mmap of memfd alias failed"); > - if (test_collapse && area_alias != p_alias) > - err("mmap of anonymous memory failed at %p", p_alias); > - > - if (is_src) > - area_src_alias = area_alias; > - else > - area_dst_alias = area_alias; > -} > - > -static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) > -{ > - *start = (unsigned long)area_dst_alias + offset; > -} > - > -static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) > -{ > - if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) > - err("Did not find expected %d number of hugepages", > - expect_nr_hpages); > -} > - > -struct uffd_test_ops { > - void (*allocate_area)(void **alloc_area, bool is_src); > - void (*release_pages)(char *rel_area); > - void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); > - void (*check_pmd_mapping)(void *p, int expect_nr_hpages); > -}; > - > -static struct uffd_test_ops anon_uffd_test_ops = { > - .allocate_area = anon_allocate_area, > - .release_pages = anon_release_pages, > - .alias_mapping = noop_alias_mapping, > - .check_pmd_mapping = NULL, > -}; > - > -static struct uffd_test_ops shmem_uffd_test_ops = { > - .allocate_area = shmem_allocate_area, > - .release_pages = shmem_release_pages, > - .alias_mapping = shmem_alias_mapping, > - .check_pmd_mapping = shmem_check_pmd_mapping, > -}; > - > -static struct uffd_test_ops hugetlb_uffd_test_ops = { > - .allocate_area = hugetlb_allocate_area, > - .release_pages = hugetlb_release_pages, > - .alias_mapping = hugetlb_alias_mapping, > - .check_pmd_mapping = NULL, > -}; > - > -static struct uffd_test_ops *uffd_test_ops; > - > static inline uint64_t uffd_minor_feature(void) > { > if (test_type == TEST_HUGETLB && map_shared) > @@ -388,171 +113,6 @@ static inline uint64_t uffd_minor_feature(void) > return 0; > } > > -static uint64_t get_expected_ioctls(uint64_t mode) > -{ > - uint64_t ioctls = UFFD_API_RANGE_IOCTLS; > - > - if (test_type == TEST_HUGETLB) > - ioctls &= ~(1 << _UFFDIO_ZEROPAGE); > - > - if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) > - ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); > - > - if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) > - ioctls &= ~(1 << _UFFDIO_CONTINUE); > - > - return ioctls; > -} > - > -static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) > -{ > - uint64_t expected = get_expected_ioctls(mode); > - uint64_t actual = ioctls & expected; > - > - if (actual != expected) { > - err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, > - expected, actual); > - } > -} > - > -static int __userfaultfd_open_dev(void) > -{ > - int fd, _uffd; > - > - fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); > - if (fd < 0) > - errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); > - > - _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); > - if (_uffd < 0) > - errexit(errno == ENOTTY ? KSFT_SKIP : 1, > - "creating userfaultfd failed"); > - close(fd); > - return _uffd; > -} > - > -static void userfaultfd_open(uint64_t *features) > -{ > - struct uffdio_api uffdio_api; > - > - if (test_dev_userfaultfd) > - uffd = __userfaultfd_open_dev(); > - else { > - uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); > - if (uffd < 0) > - errexit(errno == ENOSYS ? KSFT_SKIP : 1, > - "creating userfaultfd failed"); > - } > - uffd_flags = fcntl(uffd, F_GETFD, NULL); > - > - uffdio_api.api = UFFD_API; > - uffdio_api.features = *features; > - if (ioctl(uffd, UFFDIO_API, &uffdio_api)) > - err("UFFDIO_API failed.\nPlease make sure to " > - "run with either root or ptrace capability."); > - if (uffdio_api.api != UFFD_API) > - err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); > - > - *features = uffdio_api.features; > -} > - > -static inline void munmap_area(void **area) > -{ > - if (*area) > - if (munmap(*area, nr_pages * page_size)) > - err("munmap"); > - > - *area = NULL; > -} > - > -static void uffd_test_ctx_clear(void) > -{ > - size_t i; > - > - if (pipefd) { > - for (i = 0; i < nr_cpus * 2; ++i) { > - if (close(pipefd[i])) > - err("close pipefd"); > - } > - free(pipefd); > - pipefd = NULL; > - } > - > - if (count_verify) { > - free(count_verify); > - count_verify = NULL; > - } > - > - if (uffd != -1) { > - if (close(uffd)) > - err("close uffd"); > - uffd = -1; > - } > - > - munmap_area((void **)&area_src); > - munmap_area((void **)&area_src_alias); > - munmap_area((void **)&area_dst); > - munmap_area((void **)&area_dst_alias); > - munmap_area((void **)&area_remap); > -} > - > -static void uffd_test_ctx_init(uint64_t features) > -{ > - unsigned long nr, cpu; > - > - uffd_test_ctx_clear(); > - > - uffd_test_ops->allocate_area((void **)&area_src, true); > - uffd_test_ops->allocate_area((void **)&area_dst, false); > - > - userfaultfd_open(&features); > - > - count_verify = malloc(nr_pages * sizeof(unsigned long long)); > - if (!count_verify) > - err("count_verify"); > - > - for (nr = 0; nr < nr_pages; nr++) { > - *area_mutex(area_src, nr) = > - (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; > - count_verify[nr] = *area_count(area_src, nr) = 1; > - /* > - * In the transition between 255 to 256, powerpc will > - * read out of order in my_bcmp and see both bytes as > - * zero, so leave a placeholder below always non-zero > - * after the count, to avoid my_bcmp to trigger false > - * positives. > - */ > - *(area_count(area_src, nr) + 1) = 1; > - } > - > - /* > - * After initialization of area_src, we must explicitly release pages > - * for area_dst to make sure it's fully empty. Otherwise we could have > - * some area_dst pages be errornously initialized with zero pages, > - * hence we could hit memory corruption later in the test. > - * > - * One example is when THP is globally enabled, above allocate_area() > - * calls could have the two areas merged into a single VMA (as they > - * will have the same VMA flags so they're mergeable). When we > - * initialize the area_src above, it's possible that some part of > - * area_dst could have been faulted in via one huge THP that will be > - * shared between area_src and area_dst. It could cause some of the > - * area_dst won't be trapped by missing userfaults. > - * > - * This release_pages() will guarantee even if that happened, we'll > - * proactively split the thp and drop any accidentally initialized > - * pages within area_dst. > - */ > - uffd_test_ops->release_pages(area_dst); > - > - pipefd = malloc(sizeof(int) * nr_cpus * 2); > - if (!pipefd) > - err("pipefd"); > - for (cpu = 0; cpu < nr_cpus; cpu++) > - if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) > - err("pipe"); > -} > - > static int my_bcmp(char *str1, char *str2, size_t n) > { > unsigned long i; > @@ -562,47 +122,6 @@ static int my_bcmp(char *str1, char *str2, size_t n) > return 0; > } > > -static void wp_range(int ufd, __u64 start, __u64 len, bool wp) > -{ > - struct uffdio_writeprotect prms; > - > - /* Write protection page faults */ > - prms.range.start = start; > - prms.range.len = len; > - /* Undo write-protect, do wakeup after that */ > - prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; > - > - if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) > - err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); > -} > - > -static void continue_range(int ufd, __u64 start, __u64 len) > -{ > - struct uffdio_continue req; > - int ret; > - > - req.range.start = start; > - req.range.len = len; > - req.mode = 0; > - if (test_uffdio_wp) > - req.mode |= UFFDIO_CONTINUE_MODE_WP; > - > - if (ioctl(ufd, UFFDIO_CONTINUE, &req)) > - err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, > - (uint64_t)start); > - > - /* > - * Error handling within the kernel for continue is subtly different > - * from copy or zeropage, so it may be a source of bugs. Trigger an > - * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. > - */ > - req.mapped = 0; > - ret = ioctl(ufd, UFFDIO_CONTINUE, &req); > - if (ret >= 0 || req.mapped != -EEXIST) > - err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, > - ret, (int64_t) req.mapped); > -} > - > static void *locking_thread(void *arg) > { > unsigned long cpu = (unsigned long) arg; > @@ -635,222 +154,11 @@ static void *locking_thread(void *arg) > return NULL; > } > > -static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, > - unsigned long offset) > -{ > - uffd_test_ops->alias_mapping(&uffdio_copy->dst, > - uffdio_copy->len, > - offset); > - if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { > - /* real retval in ufdio_copy.copy */ > - if (uffdio_copy->copy != -EEXIST) > - err("UFFDIO_COPY retry error: %"PRId64, > - (int64_t)uffdio_copy->copy); > - } else { > - err("UFFDIO_COPY retry unexpected: %"PRId64, > - (int64_t)uffdio_copy->copy); > - } > -} > - > -static void wake_range(int ufd, unsigned long addr, unsigned long len) > -{ > - struct uffdio_range uffdio_wake; > - > - uffdio_wake.start = addr; > - uffdio_wake.len = len; > - > - if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) > - fprintf(stderr, "error waking %lu\n", > - addr), exit(1); > -} > - > -static int __copy_page(int ufd, unsigned long offset, bool retry) > -{ > - struct uffdio_copy uffdio_copy; > - > - if (offset >= nr_pages * page_size) > - err("unexpected offset %lu\n", offset); > - uffdio_copy.dst = (unsigned long) area_dst + offset; > - uffdio_copy.src = (unsigned long) area_src + offset; > - uffdio_copy.len = page_size; > - if (test_uffdio_wp) > - uffdio_copy.mode = UFFDIO_COPY_MODE_WP; > - else > - uffdio_copy.mode = 0; > - uffdio_copy.copy = 0; > - if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { > - /* real retval in ufdio_copy.copy */ > - if (uffdio_copy.copy != -EEXIST) > - err("UFFDIO_COPY error: %"PRId64, > - (int64_t)uffdio_copy.copy); > - wake_range(ufd, uffdio_copy.dst, page_size); > - } else if (uffdio_copy.copy != page_size) { > - err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); > - } else { > - if (test_uffdio_copy_eexist && retry) { > - test_uffdio_copy_eexist = false; > - retry_copy_page(ufd, &uffdio_copy, offset); > - } > - return 1; > - } > - return 0; > -} > - > static int copy_page_retry(int ufd, unsigned long offset) > { > return __copy_page(ufd, offset, true); > } > > -static int copy_page(int ufd, unsigned long offset) > -{ > - return __copy_page(ufd, offset, false); > -} > - > -static int uffd_read_msg(int ufd, struct uffd_msg *msg) > -{ > - int ret = read(uffd, msg, sizeof(*msg)); > - > - if (ret != sizeof(*msg)) { > - if (ret < 0) { > - if (errno == EAGAIN || errno == EINTR) > - return 1; > - err("blocking read error"); > - } else { > - err("short read"); > - } > - } > - > - return 0; > -} > - > -static void uffd_handle_page_fault(struct uffd_msg *msg, > - struct uffd_stats *stats) > -{ > - unsigned long offset; > - > - if (msg->event != UFFD_EVENT_PAGEFAULT) > - err("unexpected msg event %u", msg->event); > - > - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { > - /* Write protect page faults */ > - wp_range(uffd, msg->arg.pagefault.address, page_size, false); > - stats->wp_faults++; > - } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { > - uint8_t *area; > - int b; > - > - /* > - * Minor page faults > - * > - * To prove we can modify the original range for testing > - * purposes, we're going to bit flip this range before > - * continuing. > - * > - * Note that this requires all minor page fault tests operate on > - * area_dst (non-UFFD-registered) and area_dst_alias > - * (UFFD-registered). > - */ > - > - area = (uint8_t *)(area_dst + > - ((char *)msg->arg.pagefault.address - > - area_dst_alias)); > - for (b = 0; b < page_size; ++b) > - area[b] = ~area[b]; > - continue_range(uffd, msg->arg.pagefault.address, page_size); > - stats->minor_faults++; > - } else { > - /* > - * Missing page faults. > - * > - * Here we force a write check for each of the missing mode > - * faults. It's guaranteed because the only threads that > - * will trigger uffd faults are the locking threads, and > - * their first instruction to touch the missing page will > - * always be pthread_mutex_lock(). > - * > - * Note that here we relied on an NPTL glibc impl detail to > - * always read the lock type at the entry of the lock op > - * (pthread_mutex_t.__data.__type, offset 0x10) before > - * doing any locking operations to guarantee that. It's > - * actually not good to rely on this impl detail because > - * logically a pthread-compatible lib can implement the > - * locks without types and we can fail when linking with > - * them. However since we used to find bugs with this > - * strict check we still keep it around. Hopefully this > - * could be a good hint when it fails again. If one day > - * it'll break on some other impl of glibc we'll revisit. > - */ > - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) > - err("unexpected write fault"); > - > - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; > - offset &= ~(page_size-1); > - > - if (copy_page(uffd, offset)) > - stats->missing_faults++; > - } > -} > - > -static void *uffd_poll_thread(void *arg) > -{ > - struct uffd_stats *stats = (struct uffd_stats *)arg; > - unsigned long cpu = stats->cpu; > - struct pollfd pollfd[2]; > - struct uffd_msg msg; > - struct uffdio_register uffd_reg; > - int ret; > - char tmp_chr; > - > - pollfd[0].fd = uffd; > - pollfd[0].events = POLLIN; > - pollfd[1].fd = pipefd[cpu*2]; > - pollfd[1].events = POLLIN; > - > - for (;;) { > - ret = poll(pollfd, 2, -1); > - if (ret <= 0) { > - if (errno == EINTR || errno == EAGAIN) > - continue; > - err("poll error: %d", ret); > - } > - if (pollfd[1].revents & POLLIN) { > - if (read(pollfd[1].fd, &tmp_chr, 1) != 1) > - err("read pipefd error"); > - break; > - } > - if (!(pollfd[0].revents & POLLIN)) > - err("pollfd[0].revents %d", pollfd[0].revents); > - if (uffd_read_msg(uffd, &msg)) > - continue; > - switch (msg.event) { > - default: > - err("unexpected msg event %u\n", msg.event); > - break; > - case UFFD_EVENT_PAGEFAULT: > - uffd_handle_page_fault(&msg, stats); > - break; > - case UFFD_EVENT_FORK: > - close(uffd); > - uffd = msg.arg.fork.ufd; > - pollfd[0].fd = uffd; > - break; > - case UFFD_EVENT_REMOVE: > - uffd_reg.range.start = msg.arg.remove.start; > - uffd_reg.range.len = msg.arg.remove.end - > - msg.arg.remove.start; > - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) > - err("remove failure"); > - break; > - case UFFD_EVENT_REMAP: > - area_remap = area_dst; /* save for later unmap */ > - area_dst = (char *)(unsigned long)msg.arg.remap.to; > - break; > - } > - } > - > - return NULL; > -} > - > pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; > > static void *uffd_read_thread(void *arg) > -- > 2.39.1 >
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 5f7626550e5f..36467c15ca00 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -108,6 +108,8 @@ include ../lib.mk $(TEST_GEN_PROGS): vm_util.c +$(OUTPUT)/userfaultfd: uffd-common.c + ifeq ($(MACHINE),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c new file mode 100644 index 000000000000..c57757c2a36f --- /dev/null +++ b/tools/testing/selftests/mm/uffd-common.c @@ -0,0 +1,611 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Userfaultfd tests util functions + * + * Copyright (C) 2015-2023 Red Hat, Inc. + */ + +#include "uffd-common.h" + +#define BASE_PMD_ADDR ((void *)(1UL << 30)) + +volatile bool test_uffdio_copy_eexist = true; +unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; +char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; +int mem_fd, uffd = -1, uffd_flags, finished, *pipefd, test_type; +bool map_shared, test_collapse, test_dev_userfaultfd; +bool test_uffdio_wp = true, test_uffdio_minor = false; +unsigned long long *count_verify; +uffd_test_ops_t *uffd_test_ops; + +static void anon_release_pages(char *rel_area) +{ + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); +} + +static void anon_allocate_area(void **alloc_area, bool is_src) +{ + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); +} + +static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ +} + +static void hugetlb_release_pages(char *rel_area) +{ + if (!map_shared) { + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + } else { + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); + } +} + +static void hugetlb_allocate_area(void **alloc_area, bool is_src) +{ + off_t size = nr_pages * page_size; + off_t offset = is_src ? 0 : size; + void *area_alias = NULL; + char **alloc_area_alias; + + *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, + (map_shared ? MAP_SHARED : MAP_PRIVATE) | + (is_src ? 0 : MAP_NORESERVE), + mem_fd, offset); + if (*alloc_area == MAP_FAILED) + err("mmap of hugetlbfs file failed"); + + if (map_shared) { + area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED, mem_fd, offset); + if (area_alias == MAP_FAILED) + err("mmap of hugetlb file alias failed"); + } + + if (is_src) { + alloc_area_alias = &area_src_alias; + } else { + alloc_area_alias = &area_dst_alias; + } + if (area_alias) + *alloc_area_alias = area_alias; +} + +static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + if (!map_shared) + return; + + *start = (unsigned long) area_dst_alias + offset; +} + +static void shmem_release_pages(char *rel_area) +{ + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); +} + +static void shmem_allocate_area(void **alloc_area, bool is_src) +{ + void *area_alias = NULL; + size_t bytes = nr_pages * page_size; + unsigned long offset = is_src ? 0 : bytes; + char *p = NULL, *p_alias = NULL; + + if (test_collapse) { + p = BASE_PMD_ADDR; + if (!is_src) + /* src map + alias + interleaved hpages */ + p += 2 * (bytes + hpage_size); + p_alias = p; + p_alias += bytes; + p_alias += hpage_size; /* Prevent src/dst VMA merge */ + } + + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + mem_fd, offset); + if (*alloc_area == MAP_FAILED) + err("mmap of memfd failed"); + if (test_collapse && *alloc_area != p) + err("mmap of memfd failed at %p", p); + + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + mem_fd, offset); + if (area_alias == MAP_FAILED) + err("mmap of memfd alias failed"); + if (test_collapse && area_alias != p_alias) + err("mmap of anonymous memory failed at %p", p_alias); + + if (is_src) + area_src_alias = area_alias; + else + area_dst_alias = area_alias; +} + +static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + *start = (unsigned long)area_dst_alias + offset; +} + +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +{ + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) + err("Did not find expected %d number of hugepages", + expect_nr_hpages); +} + +struct uffd_test_ops anon_uffd_test_ops = { + .allocate_area = anon_allocate_area, + .release_pages = anon_release_pages, + .alias_mapping = noop_alias_mapping, + .check_pmd_mapping = NULL, +}; + +struct uffd_test_ops shmem_uffd_test_ops = { + .allocate_area = shmem_allocate_area, + .release_pages = shmem_release_pages, + .alias_mapping = shmem_alias_mapping, + .check_pmd_mapping = shmem_check_pmd_mapping, +}; + +struct uffd_test_ops hugetlb_uffd_test_ops = { + .allocate_area = hugetlb_allocate_area, + .release_pages = hugetlb_release_pages, + .alias_mapping = hugetlb_alias_mapping, + .check_pmd_mapping = NULL, +}; + +void uffd_stats_report(struct uffd_stats *stats, int n_cpus) +{ + int i; + unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; + + for (i = 0; i < n_cpus; i++) { + miss_total += stats[i].missing_faults; + wp_total += stats[i].wp_faults; + minor_total += stats[i].minor_faults; + } + + printf("userfaults: "); + if (miss_total) { + printf("%llu missing (", miss_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].missing_faults); + printf("\b) "); + } + if (wp_total) { + printf("%llu wp (", wp_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].wp_faults); + printf("\b) "); + } + if (minor_total) { + printf("%llu minor (", minor_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].minor_faults); + printf("\b)"); + } + printf("\n"); +} + +static int __userfaultfd_open_dev(void) +{ + int fd, _uffd; + + fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); + if (fd < 0) + errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); + + _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); + if (_uffd < 0) + errexit(errno == ENOTTY ? KSFT_SKIP : 1, + "creating userfaultfd failed"); + close(fd); + return _uffd; +} + +void userfaultfd_open(uint64_t *features) +{ + struct uffdio_api uffdio_api; + + if (test_dev_userfaultfd) + uffd = __userfaultfd_open_dev(); + else { + uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); + if (uffd < 0) + errexit(errno == ENOSYS ? KSFT_SKIP : 1, + "creating userfaultfd failed"); + } + uffd_flags = fcntl(uffd, F_GETFD, NULL); + + uffdio_api.api = UFFD_API; + uffdio_api.features = *features; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) + err("UFFDIO_API failed.\nPlease make sure to " + "run with either root or ptrace capability."); + if (uffdio_api.api != UFFD_API) + err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); + + *features = uffdio_api.features; +} + +static inline void munmap_area(void **area) +{ + if (*area) + if (munmap(*area, nr_pages * page_size)) + err("munmap"); + + *area = NULL; +} + +static void uffd_test_ctx_clear(void) +{ + size_t i; + + if (pipefd) { + for (i = 0; i < nr_cpus * 2; ++i) { + if (close(pipefd[i])) + err("close pipefd"); + } + free(pipefd); + pipefd = NULL; + } + + if (count_verify) { + free(count_verify); + count_verify = NULL; + } + + if (uffd != -1) { + if (close(uffd)) + err("close uffd"); + uffd = -1; + } + + munmap_area((void **)&area_src); + munmap_area((void **)&area_src_alias); + munmap_area((void **)&area_dst); + munmap_area((void **)&area_dst_alias); + munmap_area((void **)&area_remap); +} + +void uffd_test_ctx_init(uint64_t features) +{ + unsigned long nr, cpu; + + uffd_test_ctx_clear(); + + uffd_test_ops->allocate_area((void **)&area_src, true); + uffd_test_ops->allocate_area((void **)&area_dst, false); + + userfaultfd_open(&features); + + count_verify = malloc(nr_pages * sizeof(unsigned long long)); + if (!count_verify) + err("count_verify"); + + for (nr = 0; nr < nr_pages; nr++) { + *area_mutex(area_src, nr) = + (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; + count_verify[nr] = *area_count(area_src, nr) = 1; + /* + * In the transition between 255 to 256, powerpc will + * read out of order in my_bcmp and see both bytes as + * zero, so leave a placeholder below always non-zero + * after the count, to avoid my_bcmp to trigger false + * positives. + */ + *(area_count(area_src, nr) + 1) = 1; + } + + /* + * After initialization of area_src, we must explicitly release pages + * for area_dst to make sure it's fully empty. Otherwise we could have + * some area_dst pages be errornously initialized with zero pages, + * hence we could hit memory corruption later in the test. + * + * One example is when THP is globally enabled, above allocate_area() + * calls could have the two areas merged into a single VMA (as they + * will have the same VMA flags so they're mergeable). When we + * initialize the area_src above, it's possible that some part of + * area_dst could have been faulted in via one huge THP that will be + * shared between area_src and area_dst. It could cause some of the + * area_dst won't be trapped by missing userfaults. + * + * This release_pages() will guarantee even if that happened, we'll + * proactively split the thp and drop any accidentally initialized + * pages within area_dst. + */ + uffd_test_ops->release_pages(area_dst); + + pipefd = malloc(sizeof(int) * nr_cpus * 2); + if (!pipefd) + err("pipefd"); + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) + err("pipe"); +} + +uint64_t get_expected_ioctls(uint64_t mode) +{ + uint64_t ioctls = UFFD_API_RANGE_IOCTLS; + + if (test_type == TEST_HUGETLB) + ioctls &= ~(1 << _UFFDIO_ZEROPAGE); + + if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) + ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); + + if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) + ioctls &= ~(1 << _UFFDIO_CONTINUE); + + return ioctls; +} + +void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) +{ + uint64_t expected = get_expected_ioctls(mode); + uint64_t actual = ioctls & expected; + + if (actual != expected) { + err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, + expected, actual); + } +} + +void wp_range(int ufd, __u64 start, __u64 len, bool wp) +{ + struct uffdio_writeprotect prms; + + /* Write protection page faults */ + prms.range.start = start; + prms.range.len = len; + /* Undo write-protect, do wakeup after that */ + prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; + + if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) + err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); +} + +static void continue_range(int ufd, __u64 start, __u64 len) +{ + struct uffdio_continue req; + int ret; + + req.range.start = start; + req.range.len = len; + req.mode = 0; + if (test_uffdio_wp) + req.mode |= UFFDIO_CONTINUE_MODE_WP; + + if (ioctl(ufd, UFFDIO_CONTINUE, &req)) + err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, + (uint64_t)start); + + /* + * Error handling within the kernel for continue is subtly different + * from copy or zeropage, so it may be a source of bugs. Trigger an + * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. + */ + req.mapped = 0; + ret = ioctl(ufd, UFFDIO_CONTINUE, &req); + if (ret >= 0 || req.mapped != -EEXIST) + err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, + ret, (int64_t) req.mapped); +} + +int uffd_read_msg(int ufd, struct uffd_msg *msg) +{ + int ret = read(uffd, msg, sizeof(*msg)); + + if (ret != sizeof(*msg)) { + if (ret < 0) { + if (errno == EAGAIN || errno == EINTR) + return 1; + err("blocking read error"); + } else { + err("short read"); + } + } + + return 0; +} + +void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats) +{ + unsigned long offset; + + if (msg->event != UFFD_EVENT_PAGEFAULT) + err("unexpected msg event %u", msg->event); + + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { + /* Write protect page faults */ + wp_range(uffd, msg->arg.pagefault.address, page_size, false); + stats->wp_faults++; + } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { + uint8_t *area; + int b; + + /* + * Minor page faults + * + * To prove we can modify the original range for testing + * purposes, we're going to bit flip this range before + * continuing. + * + * Note that this requires all minor page fault tests operate on + * area_dst (non-UFFD-registered) and area_dst_alias + * (UFFD-registered). + */ + + area = (uint8_t *)(area_dst + + ((char *)msg->arg.pagefault.address - + area_dst_alias)); + for (b = 0; b < page_size; ++b) + area[b] = ~area[b]; + continue_range(uffd, msg->arg.pagefault.address, page_size); + stats->minor_faults++; + } else { + /* + * Missing page faults. + * + * Here we force a write check for each of the missing mode + * faults. It's guaranteed because the only threads that + * will trigger uffd faults are the locking threads, and + * their first instruction to touch the missing page will + * always be pthread_mutex_lock(). + * + * Note that here we relied on an NPTL glibc impl detail to + * always read the lock type at the entry of the lock op + * (pthread_mutex_t.__data.__type, offset 0x10) before + * doing any locking operations to guarantee that. It's + * actually not good to rely on this impl detail because + * logically a pthread-compatible lib can implement the + * locks without types and we can fail when linking with + * them. However since we used to find bugs with this + * strict check we still keep it around. Hopefully this + * could be a good hint when it fails again. If one day + * it'll break on some other impl of glibc we'll revisit. + */ + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + err("unexpected write fault"); + + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + + if (copy_page(uffd, offset)) + stats->missing_faults++; + } +} + +void *uffd_poll_thread(void *arg) +{ + struct uffd_stats *stats = (struct uffd_stats *)arg; + unsigned long cpu = stats->cpu; + struct pollfd pollfd[2]; + struct uffd_msg msg; + struct uffdio_register uffd_reg; + int ret; + char tmp_chr; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd[cpu*2]; + pollfd[1].events = POLLIN; + + for (;;) { + ret = poll(pollfd, 2, -1); + if (ret <= 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + err("poll error: %d", ret); + } + if (pollfd[1].revents) { + if (!(pollfd[1].revents & POLLIN)) + err("pollfd[1].revents %d", pollfd[1].revents); + if (read(pollfd[1].fd, &tmp_chr, 1) != 1) + err("read pipefd error"); + break; + } + if (!(pollfd[0].revents & POLLIN)) + err("pollfd[0].revents %d", pollfd[0].revents); + if (uffd_read_msg(uffd, &msg)) + continue; + switch (msg.event) { + default: + err("unexpected msg event %u\n", msg.event); + break; + case UFFD_EVENT_PAGEFAULT: + uffd_handle_page_fault(&msg, stats); + break; + case UFFD_EVENT_FORK: + close(uffd); + uffd = msg.arg.fork.ufd; + pollfd[0].fd = uffd; + break; + case UFFD_EVENT_REMOVE: + uffd_reg.range.start = msg.arg.remove.start; + uffd_reg.range.len = msg.arg.remove.end - + msg.arg.remove.start; + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) + err("remove failure"); + break; + case UFFD_EVENT_REMAP: + area_remap = area_dst; /* save for later unmap */ + area_dst = (char *)(unsigned long)msg.arg.remap.to; + break; + } + } + + return NULL; +} + +static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, + unsigned long offset) +{ + uffd_test_ops->alias_mapping(&uffdio_copy->dst, + uffdio_copy->len, + offset); + if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy->copy != -EEXIST) + err("UFFDIO_COPY retry error: %"PRId64, + (int64_t)uffdio_copy->copy); + } else { + err("UFFDIO_COPY retry unexpected: %"PRId64, + (int64_t)uffdio_copy->copy); + } +} + +static void wake_range(int ufd, unsigned long addr, unsigned long len) +{ + struct uffdio_range uffdio_wake; + + uffdio_wake.start = addr; + uffdio_wake.len = len; + + if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) + fprintf(stderr, "error waking %lu\n", + addr), exit(1); +} + +int __copy_page(int ufd, unsigned long offset, bool retry) +{ + struct uffdio_copy uffdio_copy; + + if (offset >= nr_pages * page_size) + err("unexpected offset %lu\n", offset); + uffdio_copy.dst = (unsigned long) area_dst + offset; + uffdio_copy.src = (unsigned long) area_src + offset; + uffdio_copy.len = page_size; + if (test_uffdio_wp) + uffdio_copy.mode = UFFDIO_COPY_MODE_WP; + else + uffdio_copy.mode = 0; + uffdio_copy.copy = 0; + if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy.copy != -EEXIST) + err("UFFDIO_COPY error: %"PRId64, + (int64_t)uffdio_copy.copy); + wake_range(ufd, uffdio_copy.dst, page_size); + } else if (uffdio_copy.copy != page_size) { + err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); + } else { + if (test_uffdio_copy_eexist && retry) { + test_uffdio_copy_eexist = false; + retry_copy_page(ufd, &uffdio_copy, offset); + } + return 1; + } + return 0; +} + +int copy_page(int ufd, unsigned long offset) +{ + return __copy_page(ufd, offset, false); +} diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h new file mode 100644 index 000000000000..d9430cfdcb19 --- /dev/null +++ b/tools/testing/selftests/mm/uffd-common.h @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Userfaultfd tests common header + * + * Copyright (C) 2015-2023 Red Hat, Inc. + */ +#ifndef __UFFD_COMMON_H__ +#define __UFFD_COMMON_H__ + +#define _GNU_SOURCE +#include <stdio.h> +#include <errno.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <time.h> +#include <signal.h> +#include <poll.h> +#include <string.h> +#include <linux/mman.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <sys/wait.h> +#include <pthread.h> +#include <linux/userfaultfd.h> +#include <setjmp.h> +#include <stdbool.h> +#include <assert.h> +#include <inttypes.h> +#include <stdint.h> +#include <sys/random.h> + +#include "../kselftest.h" +#include "vm_util.h" + +#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) + +#define _err(fmt, ...) \ + do { \ + int ret = errno; \ + fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ + fprintf(stderr, " (errno=%d, @%s:%d)\n", \ + ret, __FILE__, __LINE__); \ + } while (0) + +#define errexit(exitcode, fmt, ...) \ + do { \ + _err(fmt, ##__VA_ARGS__); \ + exit(exitcode); \ + } while (0) + +#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) + +/* pthread_mutex_t starts at page offset 0 */ +#define area_mutex(___area, ___nr) \ + ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) +/* + * count is placed in the page after pthread_mutex_t naturally aligned + * to avoid non alignment faults on non-x86 archs. + */ +#define area_count(___area, ___nr) \ + ((volatile unsigned long long *) ((unsigned long) \ + ((___area) + (___nr)*page_size + \ + sizeof(pthread_mutex_t) + \ + sizeof(unsigned long long) - 1) & \ + ~(unsigned long)(sizeof(unsigned long long) \ + - 1))) + +/* Userfaultfd test statistics */ +struct uffd_stats { + int cpu; + unsigned long missing_faults; + unsigned long wp_faults; + unsigned long minor_faults; +}; + +struct uffd_test_ops { + void (*allocate_area)(void **alloc_area, bool is_src); + void (*release_pages)(char *rel_area); + void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); + void (*check_pmd_mapping)(void *p, int expect_nr_hpages); +}; +typedef struct uffd_test_ops uffd_test_ops_t; + +extern unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; +extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; +extern int mem_fd, uffd, uffd_flags, finished, *pipefd, test_type; +extern bool map_shared, test_collapse, test_dev_userfaultfd; +extern bool test_uffdio_wp, test_uffdio_minor; +extern unsigned long long *count_verify; +extern volatile bool test_uffdio_copy_eexist; + +extern uffd_test_ops_t anon_uffd_test_ops; +extern uffd_test_ops_t shmem_uffd_test_ops; +extern uffd_test_ops_t hugetlb_uffd_test_ops; +extern uffd_test_ops_t *uffd_test_ops; + +void uffd_stats_report(struct uffd_stats *stats, int n_cpus); +void uffd_test_ctx_init(uint64_t features); +void userfaultfd_open(uint64_t *features); +uint64_t get_expected_ioctls(uint64_t mode); +void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls); +int uffd_read_msg(int ufd, struct uffd_msg *msg); +void wp_range(int ufd, __u64 start, __u64 len, bool wp); +void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats); +int __copy_page(int ufd, unsigned long offset, bool retry); +int copy_page(int ufd, unsigned long offset); +void *uffd_poll_thread(void *arg); + +#define TEST_ANON 1 +#define TEST_HUGETLB 2 +#define TEST_SHMEM 3 + +#endif diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c index 3487ec0bfcc8..c68a9aeefc41 100644 --- a/tools/testing/selftests/mm/userfaultfd.c +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -34,96 +34,20 @@ * transfer (UFFDIO_COPY). */ -#define _GNU_SOURCE -#include <stdio.h> -#include <errno.h> -#include <unistd.h> -#include <stdlib.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <time.h> -#include <signal.h> -#include <poll.h> -#include <string.h> -#include <linux/mman.h> -#include <sys/mman.h> -#include <sys/syscall.h> -#include <sys/ioctl.h> -#include <sys/wait.h> -#include <pthread.h> -#include <linux/userfaultfd.h> -#include <setjmp.h> -#include <stdbool.h> -#include <assert.h> -#include <inttypes.h> -#include <stdint.h> -#include <sys/random.h> - -#include "../kselftest.h" -#include "vm_util.h" +#include "uffd-common.h" #ifdef __NR_userfaultfd -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; - #define BOUNCE_RANDOM (1<<0) #define BOUNCE_RACINGFAULTS (1<<1) #define BOUNCE_VERIFY (1<<2) #define BOUNCE_POLL (1<<3) static int bounces; -#define TEST_ANON 1 -#define TEST_HUGETLB 2 -#define TEST_SHMEM 3 -static int test_type; - -#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) - -#define BASE_PMD_ADDR ((void *)(1UL << 30)) - -/* test using /dev/userfaultfd, instead of userfaultfd(2) */ -static bool test_dev_userfaultfd; - /* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ #define ALARM_INTERVAL_SECS 10 -static volatile bool test_uffdio_copy_eexist = true; -/* Whether to test uffd write-protection */ -static bool test_uffdio_wp = true; -/* Whether to test uffd minor faults */ -static bool test_uffdio_minor = false; -static bool map_shared; -static int mem_fd; -static unsigned long long *count_verify; -static int uffd = -1; -static int uffd_flags, finished, *pipefd; -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; static char *zeropage; pthread_attr_t attr; -static bool test_collapse; - -/* Userfaultfd test statistics */ -struct uffd_stats { - int cpu; - unsigned long missing_faults; - unsigned long wp_faults; - unsigned long minor_faults; -}; - -/* pthread_mutex_t starts at page offset 0 */ -#define area_mutex(___area, ___nr) \ - ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) -/* - * count is placed in the page after pthread_mutex_t naturally aligned - * to avoid non alignment faults on non-x86 archs. - */ -#define area_count(___area, ___nr) \ - ((volatile unsigned long long *) ((unsigned long) \ - ((___area) + (___nr)*page_size + \ - sizeof(pthread_mutex_t) + \ - sizeof(unsigned long long) - 1) & \ - ~(unsigned long)(sizeof(unsigned long long) \ - - 1))) #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) @@ -166,22 +90,6 @@ static void usage(void) exit(1); } -#define _err(fmt, ...) \ - do { \ - int ret = errno; \ - fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ - fprintf(stderr, " (errno=%d, line=%d)\n", \ - ret, __LINE__); \ - } while (0) - -#define errexit(exitcode, fmt, ...) \ - do { \ - _err(fmt, ##__VA_ARGS__); \ - exit(exitcode); \ - } while (0) - -#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) - static void uffd_stats_reset(struct uffd_stats *uffd_stats, unsigned long n_cpus) { @@ -195,189 +103,6 @@ static void uffd_stats_reset(struct uffd_stats *uffd_stats, } } -static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) -{ - int i; - unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; - - for (i = 0; i < n_cpus; i++) { - miss_total += stats[i].missing_faults; - wp_total += stats[i].wp_faults; - minor_total += stats[i].minor_faults; - } - - printf("userfaults: "); - if (miss_total) { - printf("%llu missing (", miss_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].missing_faults); - printf("\b) "); - } - if (wp_total) { - printf("%llu wp (", wp_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].wp_faults); - printf("\b) "); - } - if (minor_total) { - printf("%llu minor (", minor_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].minor_faults); - printf("\b)"); - } - printf("\n"); -} - -static void anon_release_pages(char *rel_area) -{ - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); -} - -static void anon_allocate_area(void **alloc_area, bool is_src) -{ - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); -} - -static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ -} - -static void hugetlb_release_pages(char *rel_area) -{ - if (!map_shared) { - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); - } else { - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) - err("madvise(MADV_REMOVE) failed"); - } -} - -static void hugetlb_allocate_area(void **alloc_area, bool is_src) -{ - off_t size = nr_pages * page_size; - off_t offset = is_src ? 0 : size; - void *area_alias = NULL; - char **alloc_area_alias; - - *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, - (map_shared ? MAP_SHARED : MAP_PRIVATE) | - (is_src ? 0 : MAP_NORESERVE), - mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of hugetlbfs file failed"); - - if (map_shared) { - area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED, mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of hugetlb file alias failed"); - } - - if (is_src) { - alloc_area_alias = &area_src_alias; - } else { - alloc_area_alias = &area_dst_alias; - } - if (area_alias) - *alloc_area_alias = area_alias; -} - -static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ - if (!map_shared) - return; - - *start = (unsigned long) area_dst_alias + offset; -} - -static void shmem_release_pages(char *rel_area) -{ - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) - err("madvise(MADV_REMOVE) failed"); -} - -static void shmem_allocate_area(void **alloc_area, bool is_src) -{ - void *area_alias = NULL; - size_t bytes = nr_pages * page_size; - unsigned long offset = is_src ? 0 : bytes; - char *p = NULL, *p_alias = NULL; - - if (test_collapse) { - p = BASE_PMD_ADDR; - if (!is_src) - /* src map + alias + interleaved hpages */ - p += 2 * (bytes + hpage_size); - p_alias = p; - p_alias += bytes; - p_alias += hpage_size; /* Prevent src/dst VMA merge */ - } - - *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of memfd failed"); - if (test_collapse && *alloc_area != p) - err("mmap of memfd failed at %p", p); - - area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of memfd alias failed"); - if (test_collapse && area_alias != p_alias) - err("mmap of anonymous memory failed at %p", p_alias); - - if (is_src) - area_src_alias = area_alias; - else - area_dst_alias = area_alias; -} - -static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ - *start = (unsigned long)area_dst_alias + offset; -} - -static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) -{ - if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) - err("Did not find expected %d number of hugepages", - expect_nr_hpages); -} - -struct uffd_test_ops { - void (*allocate_area)(void **alloc_area, bool is_src); - void (*release_pages)(char *rel_area); - void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); - void (*check_pmd_mapping)(void *p, int expect_nr_hpages); -}; - -static struct uffd_test_ops anon_uffd_test_ops = { - .allocate_area = anon_allocate_area, - .release_pages = anon_release_pages, - .alias_mapping = noop_alias_mapping, - .check_pmd_mapping = NULL, -}; - -static struct uffd_test_ops shmem_uffd_test_ops = { - .allocate_area = shmem_allocate_area, - .release_pages = shmem_release_pages, - .alias_mapping = shmem_alias_mapping, - .check_pmd_mapping = shmem_check_pmd_mapping, -}; - -static struct uffd_test_ops hugetlb_uffd_test_ops = { - .allocate_area = hugetlb_allocate_area, - .release_pages = hugetlb_release_pages, - .alias_mapping = hugetlb_alias_mapping, - .check_pmd_mapping = NULL, -}; - -static struct uffd_test_ops *uffd_test_ops; - static inline uint64_t uffd_minor_feature(void) { if (test_type == TEST_HUGETLB && map_shared) @@ -388,171 +113,6 @@ static inline uint64_t uffd_minor_feature(void) return 0; } -static uint64_t get_expected_ioctls(uint64_t mode) -{ - uint64_t ioctls = UFFD_API_RANGE_IOCTLS; - - if (test_type == TEST_HUGETLB) - ioctls &= ~(1 << _UFFDIO_ZEROPAGE); - - if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) - ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); - - if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) - ioctls &= ~(1 << _UFFDIO_CONTINUE); - - return ioctls; -} - -static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) -{ - uint64_t expected = get_expected_ioctls(mode); - uint64_t actual = ioctls & expected; - - if (actual != expected) { - err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, - expected, actual); - } -} - -static int __userfaultfd_open_dev(void) -{ - int fd, _uffd; - - fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); - if (fd < 0) - errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); - - _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); - if (_uffd < 0) - errexit(errno == ENOTTY ? KSFT_SKIP : 1, - "creating userfaultfd failed"); - close(fd); - return _uffd; -} - -static void userfaultfd_open(uint64_t *features) -{ - struct uffdio_api uffdio_api; - - if (test_dev_userfaultfd) - uffd = __userfaultfd_open_dev(); - else { - uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); - if (uffd < 0) - errexit(errno == ENOSYS ? KSFT_SKIP : 1, - "creating userfaultfd failed"); - } - uffd_flags = fcntl(uffd, F_GETFD, NULL); - - uffdio_api.api = UFFD_API; - uffdio_api.features = *features; - if (ioctl(uffd, UFFDIO_API, &uffdio_api)) - err("UFFDIO_API failed.\nPlease make sure to " - "run with either root or ptrace capability."); - if (uffdio_api.api != UFFD_API) - err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); - - *features = uffdio_api.features; -} - -static inline void munmap_area(void **area) -{ - if (*area) - if (munmap(*area, nr_pages * page_size)) - err("munmap"); - - *area = NULL; -} - -static void uffd_test_ctx_clear(void) -{ - size_t i; - - if (pipefd) { - for (i = 0; i < nr_cpus * 2; ++i) { - if (close(pipefd[i])) - err("close pipefd"); - } - free(pipefd); - pipefd = NULL; - } - - if (count_verify) { - free(count_verify); - count_verify = NULL; - } - - if (uffd != -1) { - if (close(uffd)) - err("close uffd"); - uffd = -1; - } - - munmap_area((void **)&area_src); - munmap_area((void **)&area_src_alias); - munmap_area((void **)&area_dst); - munmap_area((void **)&area_dst_alias); - munmap_area((void **)&area_remap); -} - -static void uffd_test_ctx_init(uint64_t features) -{ - unsigned long nr, cpu; - - uffd_test_ctx_clear(); - - uffd_test_ops->allocate_area((void **)&area_src, true); - uffd_test_ops->allocate_area((void **)&area_dst, false); - - userfaultfd_open(&features); - - count_verify = malloc(nr_pages * sizeof(unsigned long long)); - if (!count_verify) - err("count_verify"); - - for (nr = 0; nr < nr_pages; nr++) { - *area_mutex(area_src, nr) = - (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; - count_verify[nr] = *area_count(area_src, nr) = 1; - /* - * In the transition between 255 to 256, powerpc will - * read out of order in my_bcmp and see both bytes as - * zero, so leave a placeholder below always non-zero - * after the count, to avoid my_bcmp to trigger false - * positives. - */ - *(area_count(area_src, nr) + 1) = 1; - } - - /* - * After initialization of area_src, we must explicitly release pages - * for area_dst to make sure it's fully empty. Otherwise we could have - * some area_dst pages be errornously initialized with zero pages, - * hence we could hit memory corruption later in the test. - * - * One example is when THP is globally enabled, above allocate_area() - * calls could have the two areas merged into a single VMA (as they - * will have the same VMA flags so they're mergeable). When we - * initialize the area_src above, it's possible that some part of - * area_dst could have been faulted in via one huge THP that will be - * shared between area_src and area_dst. It could cause some of the - * area_dst won't be trapped by missing userfaults. - * - * This release_pages() will guarantee even if that happened, we'll - * proactively split the thp and drop any accidentally initialized - * pages within area_dst. - */ - uffd_test_ops->release_pages(area_dst); - - pipefd = malloc(sizeof(int) * nr_cpus * 2); - if (!pipefd) - err("pipefd"); - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) - err("pipe"); -} - static int my_bcmp(char *str1, char *str2, size_t n) { unsigned long i; @@ -562,47 +122,6 @@ static int my_bcmp(char *str1, char *str2, size_t n) return 0; } -static void wp_range(int ufd, __u64 start, __u64 len, bool wp) -{ - struct uffdio_writeprotect prms; - - /* Write protection page faults */ - prms.range.start = start; - prms.range.len = len; - /* Undo write-protect, do wakeup after that */ - prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; - - if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) - err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); -} - -static void continue_range(int ufd, __u64 start, __u64 len) -{ - struct uffdio_continue req; - int ret; - - req.range.start = start; - req.range.len = len; - req.mode = 0; - if (test_uffdio_wp) - req.mode |= UFFDIO_CONTINUE_MODE_WP; - - if (ioctl(ufd, UFFDIO_CONTINUE, &req)) - err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, - (uint64_t)start); - - /* - * Error handling within the kernel for continue is subtly different - * from copy or zeropage, so it may be a source of bugs. Trigger an - * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. - */ - req.mapped = 0; - ret = ioctl(ufd, UFFDIO_CONTINUE, &req); - if (ret >= 0 || req.mapped != -EEXIST) - err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, - ret, (int64_t) req.mapped); -} - static void *locking_thread(void *arg) { unsigned long cpu = (unsigned long) arg; @@ -635,222 +154,11 @@ static void *locking_thread(void *arg) return NULL; } -static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, - unsigned long offset) -{ - uffd_test_ops->alias_mapping(&uffdio_copy->dst, - uffdio_copy->len, - offset); - if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { - /* real retval in ufdio_copy.copy */ - if (uffdio_copy->copy != -EEXIST) - err("UFFDIO_COPY retry error: %"PRId64, - (int64_t)uffdio_copy->copy); - } else { - err("UFFDIO_COPY retry unexpected: %"PRId64, - (int64_t)uffdio_copy->copy); - } -} - -static void wake_range(int ufd, unsigned long addr, unsigned long len) -{ - struct uffdio_range uffdio_wake; - - uffdio_wake.start = addr; - uffdio_wake.len = len; - - if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) - fprintf(stderr, "error waking %lu\n", - addr), exit(1); -} - -static int __copy_page(int ufd, unsigned long offset, bool retry) -{ - struct uffdio_copy uffdio_copy; - - if (offset >= nr_pages * page_size) - err("unexpected offset %lu\n", offset); - uffdio_copy.dst = (unsigned long) area_dst + offset; - uffdio_copy.src = (unsigned long) area_src + offset; - uffdio_copy.len = page_size; - if (test_uffdio_wp) - uffdio_copy.mode = UFFDIO_COPY_MODE_WP; - else - uffdio_copy.mode = 0; - uffdio_copy.copy = 0; - if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { - /* real retval in ufdio_copy.copy */ - if (uffdio_copy.copy != -EEXIST) - err("UFFDIO_COPY error: %"PRId64, - (int64_t)uffdio_copy.copy); - wake_range(ufd, uffdio_copy.dst, page_size); - } else if (uffdio_copy.copy != page_size) { - err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); - } else { - if (test_uffdio_copy_eexist && retry) { - test_uffdio_copy_eexist = false; - retry_copy_page(ufd, &uffdio_copy, offset); - } - return 1; - } - return 0; -} - static int copy_page_retry(int ufd, unsigned long offset) { return __copy_page(ufd, offset, true); } -static int copy_page(int ufd, unsigned long offset) -{ - return __copy_page(ufd, offset, false); -} - -static int uffd_read_msg(int ufd, struct uffd_msg *msg) -{ - int ret = read(uffd, msg, sizeof(*msg)); - - if (ret != sizeof(*msg)) { - if (ret < 0) { - if (errno == EAGAIN || errno == EINTR) - return 1; - err("blocking read error"); - } else { - err("short read"); - } - } - - return 0; -} - -static void uffd_handle_page_fault(struct uffd_msg *msg, - struct uffd_stats *stats) -{ - unsigned long offset; - - if (msg->event != UFFD_EVENT_PAGEFAULT) - err("unexpected msg event %u", msg->event); - - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { - /* Write protect page faults */ - wp_range(uffd, msg->arg.pagefault.address, page_size, false); - stats->wp_faults++; - } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { - uint8_t *area; - int b; - - /* - * Minor page faults - * - * To prove we can modify the original range for testing - * purposes, we're going to bit flip this range before - * continuing. - * - * Note that this requires all minor page fault tests operate on - * area_dst (non-UFFD-registered) and area_dst_alias - * (UFFD-registered). - */ - - area = (uint8_t *)(area_dst + - ((char *)msg->arg.pagefault.address - - area_dst_alias)); - for (b = 0; b < page_size; ++b) - area[b] = ~area[b]; - continue_range(uffd, msg->arg.pagefault.address, page_size); - stats->minor_faults++; - } else { - /* - * Missing page faults. - * - * Here we force a write check for each of the missing mode - * faults. It's guaranteed because the only threads that - * will trigger uffd faults are the locking threads, and - * their first instruction to touch the missing page will - * always be pthread_mutex_lock(). - * - * Note that here we relied on an NPTL glibc impl detail to - * always read the lock type at the entry of the lock op - * (pthread_mutex_t.__data.__type, offset 0x10) before - * doing any locking operations to guarantee that. It's - * actually not good to rely on this impl detail because - * logically a pthread-compatible lib can implement the - * locks without types and we can fail when linking with - * them. However since we used to find bugs with this - * strict check we still keep it around. Hopefully this - * could be a good hint when it fails again. If one day - * it'll break on some other impl of glibc we'll revisit. - */ - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - err("unexpected write fault"); - - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; - offset &= ~(page_size-1); - - if (copy_page(uffd, offset)) - stats->missing_faults++; - } -} - -static void *uffd_poll_thread(void *arg) -{ - struct uffd_stats *stats = (struct uffd_stats *)arg; - unsigned long cpu = stats->cpu; - struct pollfd pollfd[2]; - struct uffd_msg msg; - struct uffdio_register uffd_reg; - int ret; - char tmp_chr; - - pollfd[0].fd = uffd; - pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd[cpu*2]; - pollfd[1].events = POLLIN; - - for (;;) { - ret = poll(pollfd, 2, -1); - if (ret <= 0) { - if (errno == EINTR || errno == EAGAIN) - continue; - err("poll error: %d", ret); - } - if (pollfd[1].revents & POLLIN) { - if (read(pollfd[1].fd, &tmp_chr, 1) != 1) - err("read pipefd error"); - break; - } - if (!(pollfd[0].revents & POLLIN)) - err("pollfd[0].revents %d", pollfd[0].revents); - if (uffd_read_msg(uffd, &msg)) - continue; - switch (msg.event) { - default: - err("unexpected msg event %u\n", msg.event); - break; - case UFFD_EVENT_PAGEFAULT: - uffd_handle_page_fault(&msg, stats); - break; - case UFFD_EVENT_FORK: - close(uffd); - uffd = msg.arg.fork.ufd; - pollfd[0].fd = uffd; - break; - case UFFD_EVENT_REMOVE: - uffd_reg.range.start = msg.arg.remove.start; - uffd_reg.range.len = msg.arg.remove.end - - msg.arg.remove.start; - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) - err("remove failure"); - break; - case UFFD_EVENT_REMAP: - area_remap = area_dst; /* save for later unmap */ - area_dst = (char *)(unsigned long)msg.arg.remap.to; - break; - } - } - - return NULL; -} - pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; static void *uffd_read_thread(void *arg)