Message ID | 20171227065620.20889-1-haozhong.zhang@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, Dec 27, 2017 at 02:56:20PM +0800, Haozhong Zhang wrote: > When a file supporting DAX is used as vNVDIMM backend, mmap it with > MAP_SYNC flag in addition can guarantee the persistence of guest write > to the backend file without other QEMU actions (e.g., periodic fsync() > by QEMU). > > By using MAP_SHARED_VALIDATE flag with MAP_SYNC, we can ensure mmap > with MAP_SYNC fails if MAP_SYNC is not supported by the kernel or the > backend file. On such failures, QEMU retries mmap without MAP_SYNC and > MAP_SHARED_VALIDATE. > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> If users rely on MAP_SYNC then don't you need to fail allocation if you can't use it? > --- > util/mmap-alloc.c | 36 +++++++++++++++++++++++++++++++++++- > 1 file changed, 35 insertions(+), 1 deletion(-) > > diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c > index 2fd8cbcc6f..37b302f057 100644 > --- a/util/mmap-alloc.c > +++ b/util/mmap-alloc.c > @@ -18,7 +18,18 @@ > > #ifdef CONFIG_LINUX > #include <sys/vfs.h> > + > +/* > + * MAP_SHARED_VALIDATE and MAP_SYNC were introduced in 4.15 kernel, so > + * they may not be defined when compiling on older kernels. > + */ > +#ifndef MAP_SHARED_VALIDATE > +#define MAP_SHARED_VALIDATE 0x3 > #endif > +#ifndef MAP_SYNC > +#define MAP_SYNC 0x80000 > +#endif > +#endif /* CONFIG_LINUX */ > > size_t qemu_fd_getpagesize(int fd) > { This stuff is arch-specific. I think you want to import this into standard-headers rather than duplicate things from Linux. > @@ -97,6 +108,7 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) > #endif > size_t offset; > void *ptr1; > + int xflags = 0; > > if (ptr == MAP_FAILED) { > return MAP_FAILED; > @@ -107,12 +119,34 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) > assert(align >= getpagesize()); > > offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr; > + > +#if defined(__linux__) > + /* > + * If 'fd' refers to a file supporting DAX, mmap it with MAP_SYNC > + * will guarantee the guest write persistence without other > + * actions in QEMU (e.g., fsync() in QEMU). > + * > + * MAP_SHARED_VALIDATE ensures mmap with MAP_SYNC fails if > + * MAP_SYNC is not supported by the kernel or the file. > + * > + * On failures of mmap with xflags, QEMU will retry mmap without > + * xflags. If all you are doing is retrying without, then I don't think you even need VALIDATE. > + */ > + xflags = shared ? (MAP_SHARED_VALIDATE | MAP_SYNC) : 0; > +#endif > + > + retry_mmap_fd: > ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE, > MAP_FIXED | > (fd == -1 ? MAP_ANONYMOUS : 0) | > - (shared ? MAP_SHARED : MAP_PRIVATE), > + (shared ? MAP_SHARED : MAP_PRIVATE) | xflags, > fd, 0); > if (ptr1 == MAP_FAILED) { > + if (xflags) { > + xflags = 0; > + goto retry_mmap_fd; > + } > + > munmap(ptr, total); > return MAP_FAILED; > } > -- > 2.14.1
On 01/02/18 18:02 +0200, Michael S. Tsirkin wrote: > On Wed, Dec 27, 2017 at 02:56:20PM +0800, Haozhong Zhang wrote: > > When a file supporting DAX is used as vNVDIMM backend, mmap it with > > MAP_SYNC flag in addition can guarantee the persistence of guest write > > to the backend file without other QEMU actions (e.g., periodic fsync() > > by QEMU). > > > > By using MAP_SHARED_VALIDATE flag with MAP_SYNC, we can ensure mmap > > with MAP_SYNC fails if MAP_SYNC is not supported by the kernel or the > > backend file. On such failures, QEMU retries mmap without MAP_SYNC and > > MAP_SHARED_VALIDATE. > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > > If users rely on MAP_SYNC then don't you need to fail allocation > if you can't use it? MAP_SYNC is supported since Linux kernel 4.15 and only needed for mmap files on nvdimm. qemu_ram_mmap() has no way to check whether its parameter 'fd' points to files on nvdimm, except by looking up sysfs. However, accessing sysfs may be denied by certain SELinux policies. The missing of MAP_SYNC should not affect the primary functionality of vNVDIMM when using files on host nvdimm as backend, except the guarantee of write persistence in case of qemu/host crash. We may check the kernel support of MAP_SYNC and the type of vNVDIMM backend in some management utility (e.g., libvirt?), and deny to launch QEMU if MAP_SYNC is not supported while files on host NVDIMM are in use. > > > --- > > util/mmap-alloc.c | 36 +++++++++++++++++++++++++++++++++++- > > 1 file changed, 35 insertions(+), 1 deletion(-) > > > > diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c > > index 2fd8cbcc6f..37b302f057 100644 > > --- a/util/mmap-alloc.c > > +++ b/util/mmap-alloc.c > > @@ -18,7 +18,18 @@ > > > > #ifdef CONFIG_LINUX > > #include <sys/vfs.h> > > + > > +/* > > + * MAP_SHARED_VALIDATE and MAP_SYNC were introduced in 4.15 kernel, so > > + * they may not be defined when compiling on older kernels. > > + */ > > +#ifndef MAP_SHARED_VALIDATE > > +#define MAP_SHARED_VALIDATE 0x3 > > #endif > > +#ifndef MAP_SYNC > > +#define MAP_SYNC 0x80000 > > +#endif > > +#endif /* CONFIG_LINUX */ > > > > size_t qemu_fd_getpagesize(int fd) > > { > > This stuff is arch-specific. I think you want to import this into > standard-headers rather than duplicate things from Linux. I'll move them to osdep.h. > > > > @@ -97,6 +108,7 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) > > #endif > > size_t offset; > > void *ptr1; > > + int xflags = 0; > > > > if (ptr == MAP_FAILED) { > > return MAP_FAILED; > > @@ -107,12 +119,34 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) > > assert(align >= getpagesize()); > > > > offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr; > > + > > +#if defined(__linux__) > > + /* > > + * If 'fd' refers to a file supporting DAX, mmap it with MAP_SYNC > > + * will guarantee the guest write persistence without other > > + * actions in QEMU (e.g., fsync() in QEMU). > > + * > > + * MAP_SHARED_VALIDATE ensures mmap with MAP_SYNC fails if > > + * MAP_SYNC is not supported by the kernel or the file. > > + * > > + * On failures of mmap with xflags, QEMU will retry mmap without > > + * xflags. > > If all you are doing is retrying without, then I don't think you > even need VALIDATE. Yes, MAP_SHARED_VALIDATE is not necessary. I'll remove it from xflags. Thanks, Haozhong > > > + */ > > + xflags = shared ? (MAP_SHARED_VALIDATE | MAP_SYNC) : 0; > > +#endif > > + > > + retry_mmap_fd: > > ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE, > > MAP_FIXED | > > (fd == -1 ? MAP_ANONYMOUS : 0) | > > - (shared ? MAP_SHARED : MAP_PRIVATE), > > + (shared ? MAP_SHARED : MAP_PRIVATE) | xflags, > > fd, 0); > > if (ptr1 == MAP_FAILED) { > > + if (xflags) { > > + xflags = 0; > > + goto retry_mmap_fd; > > + } > > + > > munmap(ptr, total); > > return MAP_FAILED; > > } > > -- > > 2.14.1
On Wed, Jan 03, 2018 at 11:16:39AM +0800, Haozhong Zhang wrote: > On 01/02/18 18:02 +0200, Michael S. Tsirkin wrote: > > On Wed, Dec 27, 2017 at 02:56:20PM +0800, Haozhong Zhang wrote: > > > When a file supporting DAX is used as vNVDIMM backend, mmap it with > > > MAP_SYNC flag in addition can guarantee the persistence of guest write > > > to the backend file without other QEMU actions (e.g., periodic fsync() > > > by QEMU). > > > > > > By using MAP_SHARED_VALIDATE flag with MAP_SYNC, we can ensure mmap > > > with MAP_SYNC fails if MAP_SYNC is not supported by the kernel or the > > > backend file. On such failures, QEMU retries mmap without MAP_SYNC and > > > MAP_SHARED_VALIDATE. > > > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > > > > If users rely on MAP_SYNC then don't you need to fail allocation > > if you can't use it? > > MAP_SYNC is supported since Linux kernel 4.15 and only needed for mmap > files on nvdimm. qemu_ram_mmap() has no way to check whether its > parameter 'fd' points to files on nvdimm, except by looking up > sysfs. However, accessing sysfs may be denied by certain SELinux > policies. > > The missing of MAP_SYNC should not affect the primary functionality of > vNVDIMM when using files on host nvdimm as backend, except the > guarantee of write persistence in case of qemu/host crash. > > We may check the kernel support of MAP_SYNC and the type of vNVDIMM > backend in some management utility (e.g., libvirt?), and deny to > launch QEMU if MAP_SYNC is not supported while files on host NVDIMM > are in use. Instead of making libvirt check if MAP_SYNC is supported and just hope it won't fail, it would be safer to let libvirt tell QEMU that MAP_SYNC must never fail. However, it looks like kernel 4.14 won't even fail if MAP_SYNC is specified. How exactly can userspace detect if MAP_SYNC is really supported?
On 01/03/18 11:45 -0200, Eduardo Habkost wrote: > On Wed, Jan 03, 2018 at 11:16:39AM +0800, Haozhong Zhang wrote: > > On 01/02/18 18:02 +0200, Michael S. Tsirkin wrote: > > > On Wed, Dec 27, 2017 at 02:56:20PM +0800, Haozhong Zhang wrote: > > > > When a file supporting DAX is used as vNVDIMM backend, mmap it with > > > > MAP_SYNC flag in addition can guarantee the persistence of guest write > > > > to the backend file without other QEMU actions (e.g., periodic fsync() > > > > by QEMU). > > > > > > > > By using MAP_SHARED_VALIDATE flag with MAP_SYNC, we can ensure mmap > > > > with MAP_SYNC fails if MAP_SYNC is not supported by the kernel or the > > > > backend file. On such failures, QEMU retries mmap without MAP_SYNC and > > > > MAP_SHARED_VALIDATE. > > > > > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > > > > > > If users rely on MAP_SYNC then don't you need to fail allocation > > > if you can't use it? > > > > MAP_SYNC is supported since Linux kernel 4.15 and only needed for mmap > > files on nvdimm. qemu_ram_mmap() has no way to check whether its > > parameter 'fd' points to files on nvdimm, except by looking up > > sysfs. However, accessing sysfs may be denied by certain SELinux > > policies. > > > > The missing of MAP_SYNC should not affect the primary functionality of > > vNVDIMM when using files on host nvdimm as backend, except the > > guarantee of write persistence in case of qemu/host crash. > > > > We may check the kernel support of MAP_SYNC and the type of vNVDIMM > > backend in some management utility (e.g., libvirt?), and deny to > > launch QEMU if MAP_SYNC is not supported while files on host NVDIMM > > are in use. > > Instead of making libvirt check if MAP_SYNC is supported and just > hope it won't fail, it would be safer to let libvirt tell QEMU > that MAP_SYNC must never fail. For example, add an option "sync" to memory-backend-file, and pass the it to qemu_ram_mmap()? > > However, it looks like kernel 4.14 won't even fail if MAP_SYNC is > specified. How exactly can userspace detect if MAP_SYNC is > really supported? Use MAP_SYNC with MAP_SHARED_VALIDATE (both introduced in 4.15 kernel). Linux kernel 4.15 and later validate whether the MAP_SYNC is supported. Because MAP_SHARED_VALIDATE is defined equally to (MAP_SHARED | MAP_PRIVATE), it always fails on older kernels which do not support MAP_SYNC as well. If we agree to introduce an option "sync" or likelihood, we can do the above check in qemu_ram_mmap(). Haozhong
On Thu, Jan 04, 2018 at 09:23:28AM +0800, Haozhong Zhang wrote: > On 01/03/18 11:45 -0200, Eduardo Habkost wrote: > > On Wed, Jan 03, 2018 at 11:16:39AM +0800, Haozhong Zhang wrote: > > > On 01/02/18 18:02 +0200, Michael S. Tsirkin wrote: > > > > On Wed, Dec 27, 2017 at 02:56:20PM +0800, Haozhong Zhang wrote: > > > > > When a file supporting DAX is used as vNVDIMM backend, mmap it with > > > > > MAP_SYNC flag in addition can guarantee the persistence of guest write > > > > > to the backend file without other QEMU actions (e.g., periodic fsync() > > > > > by QEMU). > > > > > > > > > > By using MAP_SHARED_VALIDATE flag with MAP_SYNC, we can ensure mmap > > > > > with MAP_SYNC fails if MAP_SYNC is not supported by the kernel or the > > > > > backend file. On such failures, QEMU retries mmap without MAP_SYNC and > > > > > MAP_SHARED_VALIDATE. > > > > > > > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> > > > > > > > > If users rely on MAP_SYNC then don't you need to fail allocation > > > > if you can't use it? > > > > > > MAP_SYNC is supported since Linux kernel 4.15 and only needed for mmap > > > files on nvdimm. qemu_ram_mmap() has no way to check whether its > > > parameter 'fd' points to files on nvdimm, except by looking up > > > sysfs. However, accessing sysfs may be denied by certain SELinux > > > policies. > > > > > > The missing of MAP_SYNC should not affect the primary functionality of > > > vNVDIMM when using files on host nvdimm as backend, except the > > > guarantee of write persistence in case of qemu/host crash. > > > > > > We may check the kernel support of MAP_SYNC and the type of vNVDIMM > > > backend in some management utility (e.g., libvirt?), and deny to > > > launch QEMU if MAP_SYNC is not supported while files on host NVDIMM > > > are in use. > > > > Instead of making libvirt check if MAP_SYNC is supported and just > > hope it won't fail, it would be safer to let libvirt tell QEMU > > that MAP_SYNC must never fail. > > For example, add an option "sync" to memory-backend-file, and pass the > it to qemu_ram_mmap()? Yes. It could be a OnOffAuto option, "auto" would make QEMU try to use MAP_SYNC but not fail if it's unavailable. "on" would make QEMU ensure MAP_SYNC is really enabled. > > > > > However, it looks like kernel 4.14 won't even fail if MAP_SYNC is > > specified. How exactly can userspace detect if MAP_SYNC is > > really supported? > > Use MAP_SYNC with MAP_SHARED_VALIDATE (both introduced in 4.15 > kernel). Linux kernel 4.15 and later validate whether the MAP_SYNC is > supported. Because MAP_SHARED_VALIDATE is defined equally to > (MAP_SHARED | MAP_PRIVATE), it always fails on older kernels which do > not support MAP_SYNC as well. Nice. > > If we agree to introduce an option "sync" or likelihood, we can do the > above check in qemu_ram_mmap(). Sounds good to me. Thanks!
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c index 2fd8cbcc6f..37b302f057 100644 --- a/util/mmap-alloc.c +++ b/util/mmap-alloc.c @@ -18,7 +18,18 @@ #ifdef CONFIG_LINUX #include <sys/vfs.h> + +/* + * MAP_SHARED_VALIDATE and MAP_SYNC were introduced in 4.15 kernel, so + * they may not be defined when compiling on older kernels. + */ +#ifndef MAP_SHARED_VALIDATE +#define MAP_SHARED_VALIDATE 0x3 #endif +#ifndef MAP_SYNC +#define MAP_SYNC 0x80000 +#endif +#endif /* CONFIG_LINUX */ size_t qemu_fd_getpagesize(int fd) { @@ -97,6 +108,7 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) #endif size_t offset; void *ptr1; + int xflags = 0; if (ptr == MAP_FAILED) { return MAP_FAILED; @@ -107,12 +119,34 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) assert(align >= getpagesize()); offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr; + +#if defined(__linux__) + /* + * If 'fd' refers to a file supporting DAX, mmap it with MAP_SYNC + * will guarantee the guest write persistence without other + * actions in QEMU (e.g., fsync() in QEMU). + * + * MAP_SHARED_VALIDATE ensures mmap with MAP_SYNC fails if + * MAP_SYNC is not supported by the kernel or the file. + * + * On failures of mmap with xflags, QEMU will retry mmap without + * xflags. + */ + xflags = shared ? (MAP_SHARED_VALIDATE | MAP_SYNC) : 0; +#endif + + retry_mmap_fd: ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE, MAP_FIXED | (fd == -1 ? MAP_ANONYMOUS : 0) | - (shared ? MAP_SHARED : MAP_PRIVATE), + (shared ? MAP_SHARED : MAP_PRIVATE) | xflags, fd, 0); if (ptr1 == MAP_FAILED) { + if (xflags) { + xflags = 0; + goto retry_mmap_fd; + } + munmap(ptr, total); return MAP_FAILED; }
When a file supporting DAX is used as vNVDIMM backend, mmap it with MAP_SYNC flag in addition can guarantee the persistence of guest write to the backend file without other QEMU actions (e.g., periodic fsync() by QEMU). By using MAP_SHARED_VALIDATE flag with MAP_SYNC, we can ensure mmap with MAP_SYNC fails if MAP_SYNC is not supported by the kernel or the backend file. On such failures, QEMU retries mmap without MAP_SYNC and MAP_SHARED_VALIDATE. Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com> --- util/mmap-alloc.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-)