diff mbox

util/mmap-alloc: support MAP_SYNC in qemu_ram_mmap()

Message ID 20171227065620.20889-1-haozhong.zhang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Haozhong Zhang Dec. 27, 2017, 6:56 a.m. UTC
When a file supporting DAX is used as vNVDIMM backend, mmap it with
MAP_SYNC flag in addition can guarantee the persistence of guest write
to the backend file without other QEMU actions (e.g., periodic fsync()
by QEMU).

By using MAP_SHARED_VALIDATE flag with MAP_SYNC, we can ensure mmap
with MAP_SYNC fails if MAP_SYNC is not supported by the kernel or the
backend file. On such failures, QEMU retries mmap without MAP_SYNC and
MAP_SHARED_VALIDATE.

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
---
 util/mmap-alloc.c | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

Comments

Michael S. Tsirkin Jan. 2, 2018, 4:02 p.m. UTC | #1
On Wed, Dec 27, 2017 at 02:56:20PM +0800, Haozhong Zhang wrote:
> When a file supporting DAX is used as vNVDIMM backend, mmap it with
> MAP_SYNC flag in addition can guarantee the persistence of guest write
> to the backend file without other QEMU actions (e.g., periodic fsync()
> by QEMU).
> 
> By using MAP_SHARED_VALIDATE flag with MAP_SYNC, we can ensure mmap
> with MAP_SYNC fails if MAP_SYNC is not supported by the kernel or the
> backend file. On such failures, QEMU retries mmap without MAP_SYNC and
> MAP_SHARED_VALIDATE.
> 
> Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>

If users rely on MAP_SYNC then don't you need to fail allocation
if you can't use it?

> ---
>  util/mmap-alloc.c | 36 +++++++++++++++++++++++++++++++++++-
>  1 file changed, 35 insertions(+), 1 deletion(-)
> 
> diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
> index 2fd8cbcc6f..37b302f057 100644
> --- a/util/mmap-alloc.c
> +++ b/util/mmap-alloc.c
> @@ -18,7 +18,18 @@
>  
>  #ifdef CONFIG_LINUX
>  #include <sys/vfs.h>
> +
> +/*
> + * MAP_SHARED_VALIDATE and MAP_SYNC were introduced in 4.15 kernel, so
> + * they may not be defined when compiling on older kernels.
> + */
> +#ifndef MAP_SHARED_VALIDATE
> +#define MAP_SHARED_VALIDATE   0x3
>  #endif
> +#ifndef MAP_SYNC
> +#define MAP_SYNC              0x80000
> +#endif
> +#endif /* CONFIG_LINUX */
>  
>  size_t qemu_fd_getpagesize(int fd)
>  {

This stuff is arch-specific. I think you want to import this into
standard-headers rather than duplicate things from Linux.


> @@ -97,6 +108,7 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
>  #endif
>      size_t offset;
>      void *ptr1;
> +    int xflags = 0;
>  
>      if (ptr == MAP_FAILED) {
>          return MAP_FAILED;
> @@ -107,12 +119,34 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
>      assert(align >= getpagesize());
>  
>      offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr;
> +
> +#if defined(__linux__)
> +    /*
> +     * If 'fd' refers to a file supporting DAX, mmap it with MAP_SYNC
> +     * will guarantee the guest write persistence without other
> +     * actions in QEMU (e.g., fsync() in QEMU).
> +     *
> +     * MAP_SHARED_VALIDATE ensures mmap with MAP_SYNC fails if
> +     * MAP_SYNC is not supported by the kernel or the file.
> +     *
> +     * On failures of mmap with xflags, QEMU will retry mmap without
> +     * xflags.

If all you are doing is retrying without, then I don't think you
even need VALIDATE.

> +     */
> +    xflags = shared ? (MAP_SHARED_VALIDATE | MAP_SYNC) : 0;
> +#endif
> +
> + retry_mmap_fd:
>      ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE,
>                  MAP_FIXED |
>                  (fd == -1 ? MAP_ANONYMOUS : 0) |
> -                (shared ? MAP_SHARED : MAP_PRIVATE),
> +                (shared ? MAP_SHARED : MAP_PRIVATE) | xflags,
>                  fd, 0);
>      if (ptr1 == MAP_FAILED) {
> +        if (xflags) {
> +            xflags = 0;
> +            goto retry_mmap_fd;
> +        }
> +
>          munmap(ptr, total);
>          return MAP_FAILED;
>      }
> -- 
> 2.14.1
Haozhong Zhang Jan. 3, 2018, 3:16 a.m. UTC | #2
On 01/02/18 18:02 +0200, Michael S. Tsirkin wrote:
> On Wed, Dec 27, 2017 at 02:56:20PM +0800, Haozhong Zhang wrote:
> > When a file supporting DAX is used as vNVDIMM backend, mmap it with
> > MAP_SYNC flag in addition can guarantee the persistence of guest write
> > to the backend file without other QEMU actions (e.g., periodic fsync()
> > by QEMU).
> > 
> > By using MAP_SHARED_VALIDATE flag with MAP_SYNC, we can ensure mmap
> > with MAP_SYNC fails if MAP_SYNC is not supported by the kernel or the
> > backend file. On such failures, QEMU retries mmap without MAP_SYNC and
> > MAP_SHARED_VALIDATE.
> > 
> > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> 
> If users rely on MAP_SYNC then don't you need to fail allocation
> if you can't use it?

MAP_SYNC is supported since Linux kernel 4.15 and only needed for mmap
files on nvdimm. qemu_ram_mmap() has no way to check whether its
parameter 'fd' points to files on nvdimm, except by looking up
sysfs. However, accessing sysfs may be denied by certain SELinux
policies.

The missing of MAP_SYNC should not affect the primary functionality of
vNVDIMM when using files on host nvdimm as backend, except the
guarantee of write persistence in case of qemu/host crash.

We may check the kernel support of MAP_SYNC and the type of vNVDIMM
backend in some management utility (e.g., libvirt?), and deny to
launch QEMU if MAP_SYNC is not supported while files on host NVDIMM
are in use.

> 
> > ---
> >  util/mmap-alloc.c | 36 +++++++++++++++++++++++++++++++++++-
> >  1 file changed, 35 insertions(+), 1 deletion(-)
> > 
> > diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
> > index 2fd8cbcc6f..37b302f057 100644
> > --- a/util/mmap-alloc.c
> > +++ b/util/mmap-alloc.c
> > @@ -18,7 +18,18 @@
> >  
> >  #ifdef CONFIG_LINUX
> >  #include <sys/vfs.h>
> > +
> > +/*
> > + * MAP_SHARED_VALIDATE and MAP_SYNC were introduced in 4.15 kernel, so
> > + * they may not be defined when compiling on older kernels.
> > + */
> > +#ifndef MAP_SHARED_VALIDATE
> > +#define MAP_SHARED_VALIDATE   0x3
> >  #endif
> > +#ifndef MAP_SYNC
> > +#define MAP_SYNC              0x80000
> > +#endif
> > +#endif /* CONFIG_LINUX */
> >  
> >  size_t qemu_fd_getpagesize(int fd)
> >  {
> 
> This stuff is arch-specific. I think you want to import this into
> standard-headers rather than duplicate things from Linux.

I'll move them to osdep.h.

> 
> 
> > @@ -97,6 +108,7 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
> >  #endif
> >      size_t offset;
> >      void *ptr1;
> > +    int xflags = 0;
> >  
> >      if (ptr == MAP_FAILED) {
> >          return MAP_FAILED;
> > @@ -107,12 +119,34 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
> >      assert(align >= getpagesize());
> >  
> >      offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr;
> > +
> > +#if defined(__linux__)
> > +    /*
> > +     * If 'fd' refers to a file supporting DAX, mmap it with MAP_SYNC
> > +     * will guarantee the guest write persistence without other
> > +     * actions in QEMU (e.g., fsync() in QEMU).
> > +     *
> > +     * MAP_SHARED_VALIDATE ensures mmap with MAP_SYNC fails if
> > +     * MAP_SYNC is not supported by the kernel or the file.
> > +     *
> > +     * On failures of mmap with xflags, QEMU will retry mmap without
> > +     * xflags.
> 
> If all you are doing is retrying without, then I don't think you
> even need VALIDATE.

Yes, MAP_SHARED_VALIDATE is not necessary. I'll remove it from xflags.

Thanks,
Haozhong

> 
> > +     */
> > +    xflags = shared ? (MAP_SHARED_VALIDATE | MAP_SYNC) : 0;
> > +#endif
> > +
> > + retry_mmap_fd:
> >      ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE,
> >                  MAP_FIXED |
> >                  (fd == -1 ? MAP_ANONYMOUS : 0) |
> > -                (shared ? MAP_SHARED : MAP_PRIVATE),
> > +                (shared ? MAP_SHARED : MAP_PRIVATE) | xflags,
> >                  fd, 0);
> >      if (ptr1 == MAP_FAILED) {
> > +        if (xflags) {
> > +            xflags = 0;
> > +            goto retry_mmap_fd;
> > +        }
> > +
> >          munmap(ptr, total);
> >          return MAP_FAILED;
> >      }
> > -- 
> > 2.14.1
Eduardo Habkost Jan. 3, 2018, 1:45 p.m. UTC | #3
On Wed, Jan 03, 2018 at 11:16:39AM +0800, Haozhong Zhang wrote:
> On 01/02/18 18:02 +0200, Michael S. Tsirkin wrote:
> > On Wed, Dec 27, 2017 at 02:56:20PM +0800, Haozhong Zhang wrote:
> > > When a file supporting DAX is used as vNVDIMM backend, mmap it with
> > > MAP_SYNC flag in addition can guarantee the persistence of guest write
> > > to the backend file without other QEMU actions (e.g., periodic fsync()
> > > by QEMU).
> > > 
> > > By using MAP_SHARED_VALIDATE flag with MAP_SYNC, we can ensure mmap
> > > with MAP_SYNC fails if MAP_SYNC is not supported by the kernel or the
> > > backend file. On such failures, QEMU retries mmap without MAP_SYNC and
> > > MAP_SHARED_VALIDATE.
> > > 
> > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > 
> > If users rely on MAP_SYNC then don't you need to fail allocation
> > if you can't use it?
> 
> MAP_SYNC is supported since Linux kernel 4.15 and only needed for mmap
> files on nvdimm. qemu_ram_mmap() has no way to check whether its
> parameter 'fd' points to files on nvdimm, except by looking up
> sysfs. However, accessing sysfs may be denied by certain SELinux
> policies.
> 
> The missing of MAP_SYNC should not affect the primary functionality of
> vNVDIMM when using files on host nvdimm as backend, except the
> guarantee of write persistence in case of qemu/host crash.
> 
> We may check the kernel support of MAP_SYNC and the type of vNVDIMM
> backend in some management utility (e.g., libvirt?), and deny to
> launch QEMU if MAP_SYNC is not supported while files on host NVDIMM
> are in use.

Instead of making libvirt check if MAP_SYNC is supported and just
hope it won't fail, it would be safer to let libvirt tell QEMU
that MAP_SYNC must never fail.

However, it looks like kernel 4.14 won't even fail if MAP_SYNC is
specified.  How exactly can userspace detect if MAP_SYNC is
really supported?
Haozhong Zhang Jan. 4, 2018, 1:23 a.m. UTC | #4
On 01/03/18 11:45 -0200, Eduardo Habkost wrote:
> On Wed, Jan 03, 2018 at 11:16:39AM +0800, Haozhong Zhang wrote:
> > On 01/02/18 18:02 +0200, Michael S. Tsirkin wrote:
> > > On Wed, Dec 27, 2017 at 02:56:20PM +0800, Haozhong Zhang wrote:
> > > > When a file supporting DAX is used as vNVDIMM backend, mmap it with
> > > > MAP_SYNC flag in addition can guarantee the persistence of guest write
> > > > to the backend file without other QEMU actions (e.g., periodic fsync()
> > > > by QEMU).
> > > > 
> > > > By using MAP_SHARED_VALIDATE flag with MAP_SYNC, we can ensure mmap
> > > > with MAP_SYNC fails if MAP_SYNC is not supported by the kernel or the
> > > > backend file. On such failures, QEMU retries mmap without MAP_SYNC and
> > > > MAP_SHARED_VALIDATE.
> > > > 
> > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > > 
> > > If users rely on MAP_SYNC then don't you need to fail allocation
> > > if you can't use it?
> > 
> > MAP_SYNC is supported since Linux kernel 4.15 and only needed for mmap
> > files on nvdimm. qemu_ram_mmap() has no way to check whether its
> > parameter 'fd' points to files on nvdimm, except by looking up
> > sysfs. However, accessing sysfs may be denied by certain SELinux
> > policies.
> > 
> > The missing of MAP_SYNC should not affect the primary functionality of
> > vNVDIMM when using files on host nvdimm as backend, except the
> > guarantee of write persistence in case of qemu/host crash.
> > 
> > We may check the kernel support of MAP_SYNC and the type of vNVDIMM
> > backend in some management utility (e.g., libvirt?), and deny to
> > launch QEMU if MAP_SYNC is not supported while files on host NVDIMM
> > are in use.
> 
> Instead of making libvirt check if MAP_SYNC is supported and just
> hope it won't fail, it would be safer to let libvirt tell QEMU
> that MAP_SYNC must never fail.

For example, add an option "sync" to memory-backend-file, and pass the
it to qemu_ram_mmap()?

> 
> However, it looks like kernel 4.14 won't even fail if MAP_SYNC is
> specified.  How exactly can userspace detect if MAP_SYNC is
> really supported?

Use MAP_SYNC with MAP_SHARED_VALIDATE (both introduced in 4.15
kernel). Linux kernel 4.15 and later validate whether the MAP_SYNC is
supported. Because MAP_SHARED_VALIDATE is defined equally to
(MAP_SHARED | MAP_PRIVATE), it always fails on older kernels which do
not support MAP_SYNC as well.

If we agree to introduce an option "sync" or likelihood, we can do the
above check in qemu_ram_mmap().


Haozhong
Eduardo Habkost Jan. 4, 2018, 11:57 a.m. UTC | #5
On Thu, Jan 04, 2018 at 09:23:28AM +0800, Haozhong Zhang wrote:
> On 01/03/18 11:45 -0200, Eduardo Habkost wrote:
> > On Wed, Jan 03, 2018 at 11:16:39AM +0800, Haozhong Zhang wrote:
> > > On 01/02/18 18:02 +0200, Michael S. Tsirkin wrote:
> > > > On Wed, Dec 27, 2017 at 02:56:20PM +0800, Haozhong Zhang wrote:
> > > > > When a file supporting DAX is used as vNVDIMM backend, mmap it with
> > > > > MAP_SYNC flag in addition can guarantee the persistence of guest write
> > > > > to the backend file without other QEMU actions (e.g., periodic fsync()
> > > > > by QEMU).
> > > > > 
> > > > > By using MAP_SHARED_VALIDATE flag with MAP_SYNC, we can ensure mmap
> > > > > with MAP_SYNC fails if MAP_SYNC is not supported by the kernel or the
> > > > > backend file. On such failures, QEMU retries mmap without MAP_SYNC and
> > > > > MAP_SHARED_VALIDATE.
> > > > > 
> > > > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > > > 
> > > > If users rely on MAP_SYNC then don't you need to fail allocation
> > > > if you can't use it?
> > > 
> > > MAP_SYNC is supported since Linux kernel 4.15 and only needed for mmap
> > > files on nvdimm. qemu_ram_mmap() has no way to check whether its
> > > parameter 'fd' points to files on nvdimm, except by looking up
> > > sysfs. However, accessing sysfs may be denied by certain SELinux
> > > policies.
> > > 
> > > The missing of MAP_SYNC should not affect the primary functionality of
> > > vNVDIMM when using files on host nvdimm as backend, except the
> > > guarantee of write persistence in case of qemu/host crash.
> > > 
> > > We may check the kernel support of MAP_SYNC and the type of vNVDIMM
> > > backend in some management utility (e.g., libvirt?), and deny to
> > > launch QEMU if MAP_SYNC is not supported while files on host NVDIMM
> > > are in use.
> > 
> > Instead of making libvirt check if MAP_SYNC is supported and just
> > hope it won't fail, it would be safer to let libvirt tell QEMU
> > that MAP_SYNC must never fail.
> 
> For example, add an option "sync" to memory-backend-file, and pass the
> it to qemu_ram_mmap()?

Yes.  It could be a OnOffAuto option, "auto" would make QEMU try
to use MAP_SYNC but not fail if it's unavailable. "on" would
make QEMU ensure MAP_SYNC is really enabled.

> 
> > 
> > However, it looks like kernel 4.14 won't even fail if MAP_SYNC is
> > specified.  How exactly can userspace detect if MAP_SYNC is
> > really supported?
> 
> Use MAP_SYNC with MAP_SHARED_VALIDATE (both introduced in 4.15
> kernel). Linux kernel 4.15 and later validate whether the MAP_SYNC is
> supported. Because MAP_SHARED_VALIDATE is defined equally to
> (MAP_SHARED | MAP_PRIVATE), it always fails on older kernels which do
> not support MAP_SYNC as well.

Nice.

> 
> If we agree to introduce an option "sync" or likelihood, we can do the
> above check in qemu_ram_mmap().

Sounds good to me.  Thanks!
diff mbox

Patch

diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
index 2fd8cbcc6f..37b302f057 100644
--- a/util/mmap-alloc.c
+++ b/util/mmap-alloc.c
@@ -18,7 +18,18 @@ 
 
 #ifdef CONFIG_LINUX
 #include <sys/vfs.h>
+
+/*
+ * MAP_SHARED_VALIDATE and MAP_SYNC were introduced in 4.15 kernel, so
+ * they may not be defined when compiling on older kernels.
+ */
+#ifndef MAP_SHARED_VALIDATE
+#define MAP_SHARED_VALIDATE   0x3
 #endif
+#ifndef MAP_SYNC
+#define MAP_SYNC              0x80000
+#endif
+#endif /* CONFIG_LINUX */
 
 size_t qemu_fd_getpagesize(int fd)
 {
@@ -97,6 +108,7 @@  void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
 #endif
     size_t offset;
     void *ptr1;
+    int xflags = 0;
 
     if (ptr == MAP_FAILED) {
         return MAP_FAILED;
@@ -107,12 +119,34 @@  void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared)
     assert(align >= getpagesize());
 
     offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr;
+
+#if defined(__linux__)
+    /*
+     * If 'fd' refers to a file supporting DAX, mmap it with MAP_SYNC
+     * will guarantee the guest write persistence without other
+     * actions in QEMU (e.g., fsync() in QEMU).
+     *
+     * MAP_SHARED_VALIDATE ensures mmap with MAP_SYNC fails if
+     * MAP_SYNC is not supported by the kernel or the file.
+     *
+     * On failures of mmap with xflags, QEMU will retry mmap without
+     * xflags.
+     */
+    xflags = shared ? (MAP_SHARED_VALIDATE | MAP_SYNC) : 0;
+#endif
+
+ retry_mmap_fd:
     ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE,
                 MAP_FIXED |
                 (fd == -1 ? MAP_ANONYMOUS : 0) |
-                (shared ? MAP_SHARED : MAP_PRIVATE),
+                (shared ? MAP_SHARED : MAP_PRIVATE) | xflags,
                 fd, 0);
     if (ptr1 == MAP_FAILED) {
+        if (xflags) {
+            xflags = 0;
+            goto retry_mmap_fd;
+        }
+
         munmap(ptr, total);
         return MAP_FAILED;
     }