Message ID | 1442944034-2148-2-git-send-email-ross.zwisler@linux.intel.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
On Tue, Sep 22, 2015 at 10:47 AM, Ross Zwisler <ross.zwisler@linux.intel.com> wrote: > The purpose of this test is to validate that the DAX hugepage fault > handler is working correctly. The DAX PMD fault handler in v4.3-rc1 and > -rc2 has an issue where it tries to zero at an undefined address, > causing a BUG(). Without the zeroing code in place at all this test > will find data corruption as the newly allocated huge page will be > filled with random garbage. > > This test is being added to the "destructive" group, and is currently > only run as part of 'make check-TESTS'. We also specifically call out a > device named "/dev/pmem0" to avoid running into a known bug with PMD > page faults on struct page backed devices (/dev/pmem0m). This will be > broadened when that bug is addressed. > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> > --- > Makefile.am | 11 +++-- > lib/test-dax-pmd.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 134 insertions(+), 3 deletions(-) > create mode 100644 lib/test-dax-pmd.c > > diff --git a/Makefile.am b/Makefile.am > index e5b4b49..329cce6 100644 > --- a/Makefile.am > +++ b/Makefile.am > @@ -78,7 +78,8 @@ endif > if ENABLE_DESTRUCTIVE > ndctl_SOURCES += lib/blk_namespaces.c \ > lib/pmem_namespaces.c \ > - lib/test-pcommit.c > + lib/test-pcommit.c \ > + lib/test-dax-pmd.c > ndctl_SOURCES += builtin-bat.c > endif > > @@ -116,13 +117,17 @@ TESTS = lib/test-libndctl lib/test-dpa-alloc lib/test-parent-uuid > check_PROGRAMS = lib/test-libndctl lib/test-dpa-alloc lib/test-parent-uuid > > if ENABLE_DESTRUCTIVE > -TESTS += lib/test-blk-ns lib/test-pmem-ns lib/test-pcommit > -check_PROGRAMS += lib/test-blk-ns lib/test-pmem-ns lib/test-pcommit > +TESTS += lib/test-blk-ns lib/test-pmem-ns lib/test-pcommit lib/test-dax-pmd > +check_PROGRAMS += lib/test-blk-ns lib/test-pmem-ns lib/test-pcommit \ > + lib/test-dax-pmd > endif > > lib_test_libndctl_SOURCES = lib/test-libndctl.c lib/test-core.c > 
lib_test_libndctl_LDADD = lib/libndctl.la $(UUID_LIBS) $(KMOD_LIBS) > > +lib_test_dax_pmd_SOURCES = lib/test-dax-pmd.c > +lib_test_dax_pmd_LDADD = lib/libndctl.la $(KMOD_LIBS) > + > lib_test_pcommit_SOURCES = lib/test-pcommit.c > lib_test_pcommit_LDADD = lib/libndctl.la $(KMOD_LIBS) > > diff --git a/lib/test-dax-pmd.c b/lib/test-dax-pmd.c > new file mode 100644 > index 0000000..ec35312 > --- /dev/null > +++ b/lib/test-dax-pmd.c > @@ -0,0 +1,126 @@ > +/* > + * test-dax-pmd: Exercise the DAX PMD page fault path > + * > + * Copyright (c) 2015, Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU Lesser General Public License, > + * version 2.1, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT ANY > + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS > + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for > + * more details. > + */ > +#include <sys/types.h> > +#include <sys/mman.h> > +#include <sys/stat.h> > +#include <stdlib.h> > +#include <unistd.h> > +#include <errno.h> > +#include <fcntl.h> > +#include <stdio.h> > +#include <test.h> > + > +/* > + * This will give us a 50 MiB partition. This does take a while to fill in > + * with random data, but we really a partition this large so that the ext4 > + * block allocator will give us 2MiB aligned blocks. > + */ > +#define PART_OFFSET_MB 2 > +#define PART_END_MB 52 > +#define MB(a) ((a) * 1024UL * 1024UL) > +#define PAGE(a) ((a)*0x1000) > +#define PART_SIZE MB(PART_END_MB - PART_OFFSET_MB) > +#define DEV "/dev/pmem0" > +#define PART "/dev/pmem0p1" Which pmem device is this expecting to find? If this is the "e820" one then use libndctl to find the namespace name by bus, region, etc... ...bonus points to find the device and auto-convert it from the default pmem0m to pmem0. 
> +#define MNT "/mnt/dax" > +#define MMAP_SIZE MB(4) > + > +static void sys(char *command) > +{ > + int rc = 0; > + > + rc = system(command); > + if (rc) { > + rc = WEXITSTATUS(rc); > + exit(rc); > + } > +} > + > +/* > + * The purpose of this test is to validate that the DAX hugepage fault handler > + * is working correctly. The DAX PMD fault handler in v4.3-rc1 and -rc2 has > + * an issue where it tries to zero at an undefined address, causing a BUG(). > + * Without the zeroing code in place at all this test will find data > + * corruption as the newly allocated huge page will be filled with random > + * garbage. > + */ > +static int test_dax_pmd(void) > +{ > + char *data_array = (char*) 0x10200000; /* request a 2MiB aligned address with mmap() */ > + char command[128]; > + int rc = 0; > + int fd; > + > + if (access(DEV, F_OK) < 0) > + return TEST_SKIP; > + > + /* > + * Set up a configuration that will give us a huge page fault. > + * Getting PMD faults is actually pretty tricky - I ended up being > + * able to get them by having a 2 MiB aligned partition, making ext4 > + * with a 4096 block size and a 2 MiB stride, and by explicitly asking > + * mmap() to give me a 2 MiB aligned address. > + */ > + sys("parted -s " DEV " mktable msdos"); > + snprintf(command, sizeof(command), "parted -s -a optimal " DEV " mkpart Primary %uMiB %uMiB", > + PART_OFFSET_MB, PART_END_MB); > + sys(command); > + snprintf(command, sizeof(command), "dd if=/dev/urandom of=" PART " bs=%d count=%lu", > + PAGE(1), PART_SIZE/PAGE(1)); Why does the test need to partition the pmem device vs just placing the filesystem on the raw device directly, and why does it need to fill it with random data? 
> + sys(command); > + sys("mkfs.ext4 -E stride=512 -b 4096 " PART); > + sys("mkdir -p " MNT); > + sys("mount -o dax " PART " " MNT); > + > + fd = open(MNT "/data", O_RDWR|O_CREAT, S_IRUSR|S_IWUSR); > + if (fd < 0) { > + perror("fd"); > + return 1; > + } > + > + /* > + * Write to a 10 MiB offset to increase the file size. The entire > + * mmap() we set up next will be over a hole. > + */ > + pwrite(fd, "a", 1, MB(10)); fallocate? > + > + data_array = mmap(data_array, MMAP_SIZE, PROT_READ|PROT_WRITE, > + MAP_SHARED, fd, 0); > + > + if ((long unsigned)data_array & (MB(2)-1)) { > + rc = EINVAL; > + goto out; > + } > + > + /* > + * Write to the first byte of the hole, causing a DAX PMD page fault. > + * If everything works correctly the second byte should be cleared by > + * the fault handler, and should read as zero. > + */ > + data_array[0] = 0xff; > + if (data_array[1] != 0) > + rc = EIO; I wonder if there is a way to verify we actually got a 2MB mapping from userspace vs falling back to 4K? > + out: > + munmap(data_array, MMAP_SIZE); > + close(fd); > + > + sys("umount " MNT); > + return rc; > +} > + > +int __attribute__((weak)) main(int argc, char *argv[]) > +{ This needs a "ndctl_test_attempt(test, KERNEL_VERSION(4, 3, 0))" somewhere since the pmd faulting for DAX is new in 4.3. This is useful for documenting when features arrived for backports that claim to be "4.3" feature compatible.
On Tue, Sep 22, 2015 at 11:19:00AM -0700, Dan Williams wrote: > On Tue, Sep 22, 2015 at 10:47 AM, Ross Zwisler > <ross.zwisler@linux.intel.com> wrote: > > The purpose of this test is to validate that the DAX hugepage fault > > handler is working correctly. The DAX PMD fault handler in v4.3-rc1 and > > -rc2 has an issue where it tries to zero at an undefined address, > > causing a BUG(). Without the zeroing code in place at all this test > > will find data corruption as the newly allocated huge page will be > > filled with random garbage. > > > > This test is being added to the "destructive" group, and is currently > > only run as part of 'make check-TESTS'. We also specifically call out a > > device named "/dev/pmem0" to avoid running into a known bug with PMD > > page faults on struct page backed devices (/dev/pmem0m). This will be > > broadened when that bug is addressed. > > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> > > +/* > > + * This will give us a 50 MiB partition. This does take a while to fill in > > + * with random data, but we really a partition this large so that the ext4 > > + * block allocator will give us 2MiB aligned blocks. > > + */ > > +#define PART_OFFSET_MB 2 > > +#define PART_END_MB 52 > > +#define MB(a) ((a) * 1024UL * 1024UL) > > +#define PAGE(a) ((a)*0x1000) > > +#define PART_SIZE MB(PART_END_MB - PART_OFFSET_MB) > > +#define DEV "/dev/pmem0" > > +#define PART "/dev/pmem0p1" > > Which pmem device is this expecting to find? If this is the "e820" > one then use libndctl to find the namespace name by bus, region, > etc... > > ...bonus points to find the device and auto-convert it from the > default pmem0m to pmem0. Sure, will do. > > + /* > > + * Set up a configuration that will give us a huge page fault. 
> > + * Getting PMD faults is actually pretty tricky - I ended up being > > + * able to get them by having a 2 MiB aligned partition, making ext4 > > + * with a 4096 block size and a 2 MiB stride, and by explicitly asking > > + * mmap() to give me a 2 MiB aligned address. > > + */ > > + sys("parted -s " DEV " mktable msdos"); > > + snprintf(command, sizeof(command), "parted -s -a optimal " DEV " mkpart Primary %uMiB %uMiB", > > + PART_OFFSET_MB, PART_END_MB); > > + sys(command); > > + snprintf(command, sizeof(command), "dd if=/dev/urandom of=" PART " bs=%d count=%lu", > > + PAGE(1), PART_SIZE/PAGE(1)); > > Why does the test need to partition the pmem device vs just placing > the filesystem on the raw device directly, and why does it need to > fill it with random data? I use a partition vs using the raw device because I want to fill it with the minimum amount of random data. I need it filled with random data to make sure that the zeroing of newly allocated huge pages is working properly - that is the code that was broken in the PMD fault handler. Without the random data write I can't easily detect corruption. > > + /* > > + * Write to a 10 MiB offset to increase the file size. The entire > > + * mmap() we set up next will be over a hole. > > + */ > > + pwrite(fd, "a", 1, MB(10)); > > fallocate? I agree that it is cleaner to use fallocate(), but for some reason this seems to defeat the ext4 block allocator. With the distant write I end up getting a PFN that is 2MiB aligned, but with fallocate() I end up with a PFN that fails the (pfn & PG_PMD_COLOUR) test, so we end up falling back to 4k faults. We probably need to figure out why this is happening so PMD faults are more widely supported, but for this test I was just trying to get things working. > > + /* > > + * Write to the first byte of the hole, causing a DAX PMD page fault. > > + * If everything works correctly the second byte should be cleared by > > + * the fault handler, and should read as zero. 
> > + */ > > + data_array[0] = 0xff; > > + if (data_array[1] != 0) > > + rc = EIO; > > I wonder if there is a way to verify we actually got a 2MB mapping > from userspace vs falling back to 4K? I don't know of a way yet, but I agree it would be awesome. I was hoping that some of the HugePages_* counters in /proc/meminfo would be affected, but they don't seem to be. If anyone knows of a way to verify from userspace that you've actually gotten a 2MiB page, please let me know. :) > > + out: > > + munmap(data_array, MMAP_SIZE); > > + close(fd); > > + > > + sys("umount " MNT); > > + return rc; > > +} > > + > > +int __attribute__((weak)) main(int argc, char *argv[]) > > +{ > > This needs a "ndctl_test_attempt(test, KERNEL_VERSION(4, 3, 0))" > somewhere since the pmd faulting for DAX is new in 4.3. This is > useful for documenting when features arrived for backports that claim > to be "4.3" feature compatible. Sure, I'll add it.
On Tue, Sep 22, 2015 at 1:54 PM, Ross Zwisler <ross.zwisler@linux.intel.com> wrote: > On Tue, Sep 22, 2015 at 11:19:00AM -0700, Dan Williams wrote: >> On Tue, Sep 22, 2015 at 10:47 AM, Ross Zwisler >> <ross.zwisler@linux.intel.com> wrote: >> > The purpose of this test is to validate that the DAX hugepage fault >> > handler is working correctly. The DAX PMD fault handler in v4.3-rc1 and >> > -rc2 has an issue where it tries to zero at an undefined address, >> > causing a BUG(). Without the zeroing code in place at all this test >> > will find data corruption as the newly allocated huge page will be >> > filled with random garbage. >> > >> > This test is being added to the "destructive" group, and is currently >> > only run as part of 'make check-TESTS'. We also specifically call out a >> > device named "/dev/pmem0" to avoid running into a known bug with PMD >> > page faults on struct page backed devices (/dev/pmem0m). This will be >> > broadened when that bug is addressed. >> > >> > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> > >> > +/* >> > + * This will give us a 50 MiB partition. This does take a while to fill in >> > + * with random data, but we really a partition this large so that the ext4 >> > + * block allocator will give us 2MiB aligned blocks. >> > + */ >> > +#define PART_OFFSET_MB 2 >> > +#define PART_END_MB 52 >> > +#define MB(a) ((a) * 1024UL * 1024UL) >> > +#define PAGE(a) ((a)*0x1000) >> > +#define PART_SIZE MB(PART_END_MB - PART_OFFSET_MB) >> > +#define DEV "/dev/pmem0" >> > +#define PART "/dev/pmem0p1" >> >> Which pmem device is this expecting to find? If this is the "e820" >> one then use libndctl to find the namespace name by bus, region, >> etc... >> >> ...bonus points to find the device and auto-convert it from the >> default pmem0m to pmem0. > > Sure, will do. > >> > + /* >> > + * Set up a configuration that will give us a huge page fault. 
>> > + * Getting PMD faults is actually pretty tricky - I ended up being >> > + * able to get them by having a 2 MiB aligned partition, making ext4 >> > + * with a 4096 block size and a 2 MiB stride, and by explicitly asking >> > + * mmap() to give me a 2 MiB aligned address. >> > + */ >> > + sys("parted -s " DEV " mktable msdos"); >> > + snprintf(command, sizeof(command), "parted -s -a optimal " DEV " mkpart Primary %uMiB %uMiB", >> > + PART_OFFSET_MB, PART_END_MB); >> > + sys(command); >> > + snprintf(command, sizeof(command), "dd if=/dev/urandom of=" PART " bs=%d count=%lu", >> > + PAGE(1), PART_SIZE/PAGE(1)); >> >> Why does the test need to partition the pmem device vs just placing >> the filesystem on the raw device directly, and why does it need to >> fill it with random data? > > I use a partition vs using the raw device because I want to fill it with the > minimum amount of random data. I need it filled with random data to make sure > that the zeroing of newly allocated huge pages is working properly - that is > the code that was broken in the PMD fault handler. Without the random data > write I can't easily detect corruption. ...but it's only slow because it's reading from urandom, right? Doing: char buf[SZ_1M]; memset(buf, 0xff, SZ_1M); for (...) write(..., buf, SZ_1M); ...to the raw device before creating the filesystem should be fast enough. > >> > + /* >> > + * Write to a 10 MiB offset to increase the file size. The entire >> > + * mmap() we set up next will be over a hole. >> > + */ >> > + pwrite(fd, "a", 1, MB(10)); >> >> fallocate? > > I agree that it is cleaner use fallocate(), but for some reason this seems to > defeat the ext4 block allocator. With the distant write I end up getting a > PFN that is 2MiB aligned, but with fallocate() I end up with a PFN that fails > the (pfn & PG_PMD_COLOUR) test, so we end up falling back to 4k faults. 
> > We probably need to figure out why this is happening so PMD faults are more > widely supported, but for this test I was just trying to get things working. Ok. > >> > + /* >> > + * Write to the first byte of the hole, causing a DAX PMD page fault. >> > + * If everything works correctly the second byte should be cleared by >> > + * the fault handler, and should read as zero. >> > + */ >> > + data_array[0] = 0xff; >> > + if (data_array[1] != 0) >> > + rc = EIO; >> >> I wonder if there is a way to verify we actually got a 2MB mapping >> from userspace vs falling back to 4K? > > I don't know of a way yet, but I agree it would be awesome. I was hoping that > some of the HugePages_* counters in /proc/meminfo would be effected, but they > don't seem to be. If anyone knows of a way to verify from userspace that > you've actually gotten a 2MiB page, please let me know. :) The VM has no idea about these "huge" DAX mappings given they didn't come out of the allocator. Seems like some diagnostics that we would need to add. Perhaps a tracepoint for now.
diff --git a/Makefile.am b/Makefile.am index e5b4b49..329cce6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -78,7 +78,8 @@ endif if ENABLE_DESTRUCTIVE ndctl_SOURCES += lib/blk_namespaces.c \ lib/pmem_namespaces.c \ - lib/test-pcommit.c + lib/test-pcommit.c \ + lib/test-dax-pmd.c ndctl_SOURCES += builtin-bat.c endif @@ -116,13 +117,17 @@ TESTS = lib/test-libndctl lib/test-dpa-alloc lib/test-parent-uuid check_PROGRAMS = lib/test-libndctl lib/test-dpa-alloc lib/test-parent-uuid if ENABLE_DESTRUCTIVE -TESTS += lib/test-blk-ns lib/test-pmem-ns lib/test-pcommit -check_PROGRAMS += lib/test-blk-ns lib/test-pmem-ns lib/test-pcommit +TESTS += lib/test-blk-ns lib/test-pmem-ns lib/test-pcommit lib/test-dax-pmd +check_PROGRAMS += lib/test-blk-ns lib/test-pmem-ns lib/test-pcommit \ + lib/test-dax-pmd endif lib_test_libndctl_SOURCES = lib/test-libndctl.c lib/test-core.c lib_test_libndctl_LDADD = lib/libndctl.la $(UUID_LIBS) $(KMOD_LIBS) +lib_test_dax_pmd_SOURCES = lib/test-dax-pmd.c +lib_test_dax_pmd_LDADD = lib/libndctl.la $(KMOD_LIBS) + lib_test_pcommit_SOURCES = lib/test-pcommit.c lib_test_pcommit_LDADD = lib/libndctl.la $(KMOD_LIBS) diff --git a/lib/test-dax-pmd.c b/lib/test-dax-pmd.c new file mode 100644 index 0000000..ec35312 --- /dev/null +++ b/lib/test-dax-pmd.c @@ -0,0 +1,126 @@ +/* + * test-dax-pmd: Exercise the DAX PMD page fault path + * + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU Lesser General Public License, + * version 2.1, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. 
+ */ +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <test.h> + +/* + * This will give us a 50 MiB partition. This does take a while to fill in + * with random data, but we really need a partition this large so that the ext4 + * block allocator will give us 2MiB aligned blocks. + */ +#define PART_OFFSET_MB 2 +#define PART_END_MB 52 +#define MB(a) ((a) * 1024UL * 1024UL) +#define PAGE(a) ((a)*0x1000) +#define PART_SIZE MB(PART_END_MB - PART_OFFSET_MB) +#define DEV "/dev/pmem0" +#define PART "/dev/pmem0p1" +#define MNT "/mnt/dax" +#define MMAP_SIZE MB(4) + +static void sys(char *command) +{ + int rc = 0; + + rc = system(command); + if (rc) { + rc = WEXITSTATUS(rc); + exit(rc); + } +} + +/* + * The purpose of this test is to validate that the DAX hugepage fault handler + * is working correctly. The DAX PMD fault handler in v4.3-rc1 and -rc2 has + * an issue where it tries to zero at an undefined address, causing a BUG(). + * Without the zeroing code in place at all this test will find data + * corruption as the newly allocated huge page will be filled with random + * garbage. + */ +static int test_dax_pmd(void) +{ + char *data_array = (char*) 0x10200000; /* request a 2MiB aligned address with mmap() */ + char command[128]; + int rc = 0; + int fd; + + if (access(DEV, F_OK) < 0) + return TEST_SKIP; + + /* + * Set up a configuration that will give us a huge page fault. + * Getting PMD faults is actually pretty tricky - I ended up being + * able to get them by having a 2 MiB aligned partition, making ext4 + * with a 4096 block size and a 2 MiB stride, and by explicitly asking + * mmap() to give me a 2 MiB aligned address. 
+ */ + sys("parted -s " DEV " mktable msdos"); + snprintf(command, sizeof(command), "parted -s -a optimal " DEV " mkpart Primary %uMiB %uMiB", + PART_OFFSET_MB, PART_END_MB); + sys(command); + snprintf(command, sizeof(command), "dd if=/dev/urandom of=" PART " bs=%d count=%lu", + PAGE(1), PART_SIZE/PAGE(1)); + sys(command); + sys("mkfs.ext4 -E stride=512 -b 4096 " PART); + sys("mkdir -p " MNT); + sys("mount -o dax " PART " " MNT); + + fd = open(MNT "/data", O_RDWR|O_CREAT, S_IRUSR|S_IWUSR); + if (fd < 0) { + perror("fd"); + return 1; + } + + /* + * Write to a 10 MiB offset to increase the file size. The entire + * mmap() we set up next will be over a hole. + */ + pwrite(fd, "a", 1, MB(10)); + + data_array = mmap(data_array, MMAP_SIZE, PROT_READ|PROT_WRITE, + MAP_SHARED, fd, 0); + + if ((long unsigned)data_array & (MB(2)-1)) { + rc = EINVAL; + goto out; + } + + /* + * Write to the first byte of the hole, causing a DAX PMD page fault. + * If everything works correctly the second byte should be cleared by + * the fault handler, and should read as zero. + */ + data_array[0] = 0xff; + if (data_array[1] != 0) + rc = EIO; + out: + munmap(data_array, MMAP_SIZE); + close(fd); + + sys("umount " MNT); + return rc; +} + +int __attribute__((weak)) main(int argc, char *argv[]) +{ + return test_dax_pmd(); +}
The purpose of this test is to validate that the DAX hugepage fault handler is working correctly. The DAX PMD fault handler in v4.3-rc1 and -rc2 has an issue where it tries to zero at an undefined address, causing a BUG(). Without the zeroing code in place at all this test will find data corruption as the newly allocated huge page will be filled with random garbage. This test is being added to the "destructive" group, and is currently only run as part of 'make check-TESTS'. We also specifically call out a device named "/dev/pmem0" to avoid running into a known bug with PMD page faults on struct page backed devices (/dev/pmem0m). This will be broadened when that bug is addressed. Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> --- Makefile.am | 11 +++-- lib/test-dax-pmd.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+), 3 deletions(-) create mode 100644 lib/test-dax-pmd.c