diff mbox series

[V1,17/26] machine: memfd-alloc option

Message ID 1714406135-451286-18-git-send-email-steven.sistare@oracle.com (mailing list archive)
State New, archived
Headers show
Series Live update: cpr-exec | expand

Commit Message

Steven Sistare April 29, 2024, 3:55 p.m. UTC
Allocate anonymous memory using memfd_create if the memfd-alloc machine
option is set.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 hw/core/machine.c   | 22 ++++++++++++++++++++++
 include/hw/boards.h |  1 +
 qemu-options.hx     |  6 ++++++
 system/memory.c     |  9 ++++++---
 system/physmem.c    | 18 +++++++++++++++++-
 system/trace-events |  1 +
 6 files changed, 53 insertions(+), 4 deletions(-)

Comments

Peter Xu May 28, 2024, 9:12 p.m. UTC | #1
On Mon, Apr 29, 2024 at 08:55:26AM -0700, Steve Sistare wrote:
> Allocate anonymous memory using memfd_create if the memfd-alloc machine
> option is set.
> 
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
>  hw/core/machine.c   | 22 ++++++++++++++++++++++
>  include/hw/boards.h |  1 +
>  qemu-options.hx     |  6 ++++++
>  system/memory.c     |  9 ++++++---
>  system/physmem.c    | 18 +++++++++++++++++-
>  system/trace-events |  1 +
>  6 files changed, 53 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index 582c2df..9567b97 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -443,6 +443,20 @@ static void machine_set_mem_merge(Object *obj, bool value, Error **errp)
>      ms->mem_merge = value;
>  }
>  
> +static bool machine_get_memfd_alloc(Object *obj, Error **errp)
> +{
> +    MachineState *ms = MACHINE(obj);
> +
> +    return ms->memfd_alloc;
> +}
> +
> +static void machine_set_memfd_alloc(Object *obj, bool value, Error **errp)
> +{
> +    MachineState *ms = MACHINE(obj);
> +
> +    ms->memfd_alloc = value;
> +}
> +
>  static bool machine_get_usb(Object *obj, Error **errp)
>  {
>      MachineState *ms = MACHINE(obj);
> @@ -1044,6 +1058,11 @@ static void machine_class_init(ObjectClass *oc, void *data)
>      object_class_property_set_description(oc, "mem-merge",
>          "Enable/disable memory merge support");
>  
> +    object_class_property_add_bool(oc, "memfd-alloc",
> +        machine_get_memfd_alloc, machine_set_memfd_alloc);
> +    object_class_property_set_description(oc, "memfd-alloc",
> +        "Enable/disable allocating anonymous memory using memfd_create");
> +
>      object_class_property_add_bool(oc, "usb",
>          machine_get_usb, machine_set_usb);
>      object_class_property_set_description(oc, "usb",
> @@ -1387,6 +1406,9 @@ static bool create_default_memdev(MachineState *ms, const char *path, Error **er
>      if (!object_property_set_int(obj, "size", ms->ram_size, errp)) {
>          goto out;
>      }
> +    if (!object_property_set_bool(obj, "share", ms->memfd_alloc, errp)) {
> +        goto out;
> +    }
>      object_property_add_child(object_get_objects_root(), mc->default_ram_id,
>                                obj);
>      /* Ensure backend's memory region name is equal to mc->default_ram_id */
> diff --git a/include/hw/boards.h b/include/hw/boards.h
> index 69c1ba4..96259c3 100644
> --- a/include/hw/boards.h
> +++ b/include/hw/boards.h
> @@ -372,6 +372,7 @@ struct MachineState {
>      bool dump_guest_core;
>      bool mem_merge;
>      bool require_guest_memfd;
> +    bool memfd_alloc;
>      bool usb;
>      bool usb_disabled;
>      char *firmware;
> diff --git a/qemu-options.hx b/qemu-options.hx
> index cf61f6b..f0dfda5 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -32,6 +32,7 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
>      "                vmport=on|off|auto controls emulation of vmport (default: auto)\n"
>      "                dump-guest-core=on|off include guest memory in a core dump (default=on)\n"
>      "                mem-merge=on|off controls memory merge support (default: on)\n"
> +    "                memfd-alloc=on|off controls allocating anonymous guest RAM using memfd_create (default: off)\n"
>      "                aes-key-wrap=on|off controls support for AES key wrapping (default=on)\n"
>      "                dea-key-wrap=on|off controls support for DEA key wrapping (default=on)\n"
>      "                suppress-vmdesc=on|off disables self-describing migration (default=off)\n"
> @@ -79,6 +80,11 @@ SRST
>          supported by the host, de-duplicates identical memory pages
>          among VMs instances (enabled by default).
>  
> +    ``memfd-alloc=on|off``
> +        Enables or disables allocation of anonymous guest RAM using
> +        memfd_create.  Any associated memory-backend objects are created with
> +        share=on.  The memfd-alloc default is off.
> +
>      ``aes-key-wrap=on|off``
>          Enables or disables AES key wrapping support on s390-ccw hosts.
>          This feature controls whether AES wrapping keys will be created
> diff --git a/system/memory.c b/system/memory.c
> index 49f1cb2..ca04a0e 100644
> --- a/system/memory.c
> +++ b/system/memory.c
> @@ -1552,8 +1552,9 @@ bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
>                                        uint64_t size,
>                                        Error **errp)
>  {
> +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;

If there's a machine option to "use memfd for allocations", then it's
shared mem... Hmm..

It is a bit confusing to me in quite a few levels:

  - Why memory allocation method will be defined by a machine property,
    even if we have memory-backend-* which should cover everything?

  - Even if we have such a machine property, why setting "memfd" will
    always imply shared?  why not private?  After all it's not called
    "memfd-shared-alloc", and we can create private mappings using
    e.g. memory-backend-memfd,share=off.

>      return memory_region_init_ram_flags_nomigrate(mr, owner, name,
> -                                                  size, 0, errp);
> +                                                  size, flags, errp);
>  }
>  
>  bool memory_region_init_ram_flags_nomigrate(MemoryRegion *mr,
> @@ -1713,8 +1714,9 @@ bool memory_region_init_rom_nomigrate(MemoryRegion *mr,
>                                        uint64_t size,
>                                        Error **errp)
>  {
> +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
>      if (!memory_region_init_ram_flags_nomigrate(mr, owner, name,
> -                                                size, 0, errp)) {
> +                                                size, flags, errp)) {
>           return false;
>      }
>      mr->readonly = true;
> @@ -1731,6 +1733,7 @@ bool memory_region_init_rom_device_nomigrate(MemoryRegion *mr,
>                                               Error **errp)
>  {
>      Error *err = NULL;
> +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
>      assert(ops);
>      memory_region_init(mr, owner, name, size);
>      mr->ops = ops;
> @@ -1738,7 +1741,7 @@ bool memory_region_init_rom_device_nomigrate(MemoryRegion *mr,
>      mr->terminates = true;
>      mr->rom_device = true;
>      mr->destructor = memory_region_destructor_ram;
> -    mr->ram_block = qemu_ram_alloc(size, 0, mr, &err);
> +    mr->ram_block = qemu_ram_alloc(size, flags, mr, &err);
>      if (err) {
>          mr->size = int128_zero();
>          object_unparent(OBJECT(mr));
> diff --git a/system/physmem.c b/system/physmem.c
> index c736af5..36d97ec 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -45,6 +45,7 @@
>  #include "qemu/qemu-print.h"
>  #include "qemu/log.h"
>  #include "qemu/memalign.h"
> +#include "qemu/memfd.h"
>  #include "exec/memory.h"
>  #include "exec/ioport.h"
>  #include "sysemu/dma.h"
> @@ -1825,6 +1826,19 @@ static void *ram_block_alloc_host(RAMBlock *rb, Error **errp)
>      if (xen_enabled()) {
>          xen_ram_alloc(rb->offset, rb->max_length, mr, errp);
>  
> +    } else if (rb->flags & RAM_SHARED) {
> +        if (rb->fd == -1) {
> +            mr->align = QEMU_VMALLOC_ALIGN;
> +            rb->fd = qemu_memfd_create(rb->idstr, rb->max_length + mr->align,
> +                                       0, 0, 0, errp);
> +        }

We used to have case where RAM_SHARED && rb->fd==-1, I think.

We have some code that checks explicitly on rb->fd against -1 to know
whether it's a fd based.  I'm not sure whether there'll be implications to
affect those codes.

Maybe it's mostly fine, OTOH I worry more on the whole idea.  I'm not sure
whether this is relevant to "we want to be able to share the mem with the
new process", in this case can we simply require the user to use file based
memory backends, rather than such change?

Thanks,

> +        if (rb->fd >= 0) {
> +            int mfd = rb->fd;
> +            qemu_set_cloexec(mfd);
> +            host = file_ram_alloc(rb, rb->max_length, mfd, false, 0, errp);
> +            trace_qemu_anon_memfd_alloc(rb->idstr, rb->max_length, mfd, host);
> +        }
> +
>      } else {
>          host = qemu_anon_ram_alloc(rb->max_length, &mr->align,
>                                     qemu_ram_is_shared(rb),
> @@ -2106,8 +2120,10 @@ RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
>                                                       void *host),
>                                       MemoryRegion *mr, Error **errp)
>  {
> +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
> +    flags |= RAM_RESIZEABLE;
>      return qemu_ram_alloc_internal(size, maxsz, resized, NULL,
> -                                   RAM_RESIZEABLE, mr, errp);
> +                                   flags, mr, errp);
>  }
>  
>  static void reclaim_ramblock(RAMBlock *block)
> diff --git a/system/trace-events b/system/trace-events
> index f0a80ba..0092734 100644
> --- a/system/trace-events
> +++ b/system/trace-events
> @@ -41,3 +41,4 @@ dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"P
>  
>  # physmem.c
>  ram_block_create(const char *name, uint32_t flags, int fd, size_t used_length, size_t max_length, size_t align) "%s, flags %u, fd %d, len %lu, maxlen %lu, align %lu"
> +qemu_anon_memfd_alloc(const char *name, size_t size, int fd, void *ptr) "%s size %zu fd %d -> %p"
> -- 
> 1.8.3.1
>
Steven Sistare May 29, 2024, 5:31 p.m. UTC | #2
On 5/28/2024 5:12 PM, Peter Xu wrote:
> On Mon, Apr 29, 2024 at 08:55:26AM -0700, Steve Sistare wrote:
>> Allocate anonymous memory using memfd_create if the memfd-alloc machine
>> option is set.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>>   hw/core/machine.c   | 22 ++++++++++++++++++++++
>>   include/hw/boards.h |  1 +
>>   qemu-options.hx     |  6 ++++++
>>   system/memory.c     |  9 ++++++---
>>   system/physmem.c    | 18 +++++++++++++++++-
>>   system/trace-events |  1 +
>>   6 files changed, 53 insertions(+), 4 deletions(-)
>>
>> diff --git a/hw/core/machine.c b/hw/core/machine.c
>> index 582c2df..9567b97 100644
>> --- a/hw/core/machine.c
>> +++ b/hw/core/machine.c
>> @@ -443,6 +443,20 @@ static void machine_set_mem_merge(Object *obj, bool value, Error **errp)
>>       ms->mem_merge = value;
>>   }
>>   
>> +static bool machine_get_memfd_alloc(Object *obj, Error **errp)
>> +{
>> +    MachineState *ms = MACHINE(obj);
>> +
>> +    return ms->memfd_alloc;
>> +}
>> +
>> +static void machine_set_memfd_alloc(Object *obj, bool value, Error **errp)
>> +{
>> +    MachineState *ms = MACHINE(obj);
>> +
>> +    ms->memfd_alloc = value;
>> +}
>> +
>>   static bool machine_get_usb(Object *obj, Error **errp)
>>   {
>>       MachineState *ms = MACHINE(obj);
>> @@ -1044,6 +1058,11 @@ static void machine_class_init(ObjectClass *oc, void *data)
>>       object_class_property_set_description(oc, "mem-merge",
>>           "Enable/disable memory merge support");
>>   
>> +    object_class_property_add_bool(oc, "memfd-alloc",
>> +        machine_get_memfd_alloc, machine_set_memfd_alloc);
>> +    object_class_property_set_description(oc, "memfd-alloc",
>> +        "Enable/disable allocating anonymous memory using memfd_create");
>> +
>>       object_class_property_add_bool(oc, "usb",
>>           machine_get_usb, machine_set_usb);
>>       object_class_property_set_description(oc, "usb",
>> @@ -1387,6 +1406,9 @@ static bool create_default_memdev(MachineState *ms, const char *path, Error **er
>>       if (!object_property_set_int(obj, "size", ms->ram_size, errp)) {
>>           goto out;
>>       }
>> +    if (!object_property_set_bool(obj, "share", ms->memfd_alloc, errp)) {
>> +        goto out;
>> +    }
>>       object_property_add_child(object_get_objects_root(), mc->default_ram_id,
>>                                 obj);
>>       /* Ensure backend's memory region name is equal to mc->default_ram_id */
>> diff --git a/include/hw/boards.h b/include/hw/boards.h
>> index 69c1ba4..96259c3 100644
>> --- a/include/hw/boards.h
>> +++ b/include/hw/boards.h
>> @@ -372,6 +372,7 @@ struct MachineState {
>>       bool dump_guest_core;
>>       bool mem_merge;
>>       bool require_guest_memfd;
>> +    bool memfd_alloc;
>>       bool usb;
>>       bool usb_disabled;
>>       char *firmware;
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index cf61f6b..f0dfda5 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -32,6 +32,7 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
>>       "                vmport=on|off|auto controls emulation of vmport (default: auto)\n"
>>       "                dump-guest-core=on|off include guest memory in a core dump (default=on)\n"
>>       "                mem-merge=on|off controls memory merge support (default: on)\n"
>> +    "                memfd-alloc=on|off controls allocating anonymous guest RAM using memfd_create (default: off)\n"
>>       "                aes-key-wrap=on|off controls support for AES key wrapping (default=on)\n"
>>       "                dea-key-wrap=on|off controls support for DEA key wrapping (default=on)\n"
>>       "                suppress-vmdesc=on|off disables self-describing migration (default=off)\n"
>> @@ -79,6 +80,11 @@ SRST
>>           supported by the host, de-duplicates identical memory pages
>>           among VMs instances (enabled by default).
>>   
>> +    ``memfd-alloc=on|off``
>> +        Enables or disables allocation of anonymous guest RAM using
>> +        memfd_create.  Any associated memory-backend objects are created with
>> +        share=on.  The memfd-alloc default is off.
>> +
>>       ``aes-key-wrap=on|off``
>>           Enables or disables AES key wrapping support on s390-ccw hosts.
>>           This feature controls whether AES wrapping keys will be created
>> diff --git a/system/memory.c b/system/memory.c
>> index 49f1cb2..ca04a0e 100644
>> --- a/system/memory.c
>> +++ b/system/memory.c
>> @@ -1552,8 +1552,9 @@ bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
>>                                         uint64_t size,
>>                                         Error **errp)
>>   {
>> +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
> 
> If there's a machine option to "use memfd for allocations", then it's
> shared mem... Hmm..
>
> It is a bit confusing to me in quite a few levels:
> 
>    - Why memory allocation method will be defined by a machine property,
>      even if we have memory-backend-* which should cover everything?

Some memory regions are implicitly created, and have no explicit representation
on the qemu command line.  memfd-alloc affects those.

More generally, memfd-alloc affects all ramblock allocations that are
not explicitly represented by memory-backend object.  Thus the simple
command line "qemu -m 1G" does not explicitly describe an object, so it
goes through the anonymous allocation path, and is affected by memfd-alloc.

Internally, create_default_memdev does create a memory-backend object.
That is what my doc comment above refers to:
   Any associated memory-backend objects are created with share=on

An explicit "qemu -object memory-backend-*" is not affected by memfd-alloc.

The qapi comments in patch "migration: cpr-exec mode" attempt to say all that:

+#     Memory backend objects must have the share=on attribute, and
+#     must be mmap'able in the new QEMU process.  For example,
+#     memory-backend-file is acceptable, but memory-backend-ram is
+#     not.
+#
+#     The VM must be started with the '-machine memfd-alloc=on'
+#     option.  This causes implicit ram blocks -- those not explicitly
+#     described by a memory-backend object -- to be allocated by
+#     mmap'ing a memfd.  Examples include VGA, ROM, and even guest
+#     RAM when it is specified without a memory-backend object.

>    - Even if we have such a machine property, why setting "memfd" will
>      always imply shared?  why not private?  After all it's not called
>      "memfd-shared-alloc", and we can create private mappings using
>      e.g. memory-backend-memfd,share=off.

There is no use case for memfd-alloc with share=off, so no point IMO in
making the option more verbose.  For cpr, the mapping with all its modifications
must be visible to new qemu when qemu mmaps it.

>>       return memory_region_init_ram_flags_nomigrate(mr, owner, name,
>> -                                                  size, 0, errp);
>> +                                                  size, flags, errp);
>>   }
>>   
>>   bool memory_region_init_ram_flags_nomigrate(MemoryRegion *mr,
>> @@ -1713,8 +1714,9 @@ bool memory_region_init_rom_nomigrate(MemoryRegion *mr,
>>                                         uint64_t size,
>>                                         Error **errp)
>>   {
>> +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
>>       if (!memory_region_init_ram_flags_nomigrate(mr, owner, name,
>> -                                                size, 0, errp)) {
>> +                                                size, flags, errp)) {
>>            return false;
>>       }
>>       mr->readonly = true;
>> @@ -1731,6 +1733,7 @@ bool memory_region_init_rom_device_nomigrate(MemoryRegion *mr,
>>                                                Error **errp)
>>   {
>>       Error *err = NULL;
>> +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
>>       assert(ops);
>>       memory_region_init(mr, owner, name, size);
>>       mr->ops = ops;
>> @@ -1738,7 +1741,7 @@ bool memory_region_init_rom_device_nomigrate(MemoryRegion *mr,
>>       mr->terminates = true;
>>       mr->rom_device = true;
>>       mr->destructor = memory_region_destructor_ram;
>> -    mr->ram_block = qemu_ram_alloc(size, 0, mr, &err);
>> +    mr->ram_block = qemu_ram_alloc(size, flags, mr, &err);
>>       if (err) {
>>           mr->size = int128_zero();
>>           object_unparent(OBJECT(mr));
>> diff --git a/system/physmem.c b/system/physmem.c
>> index c736af5..36d97ec 100644
>> --- a/system/physmem.c
>> +++ b/system/physmem.c
>> @@ -45,6 +45,7 @@
>>   #include "qemu/qemu-print.h"
>>   #include "qemu/log.h"
>>   #include "qemu/memalign.h"
>> +#include "qemu/memfd.h"
>>   #include "exec/memory.h"
>>   #include "exec/ioport.h"
>>   #include "sysemu/dma.h"
>> @@ -1825,6 +1826,19 @@ static void *ram_block_alloc_host(RAMBlock *rb, Error **errp)
>>       if (xen_enabled()) {
>>           xen_ram_alloc(rb->offset, rb->max_length, mr, errp);
>>   
>> +    } else if (rb->flags & RAM_SHARED) {
>> +        if (rb->fd == -1) {
>> +            mr->align = QEMU_VMALLOC_ALIGN;
>> +            rb->fd = qemu_memfd_create(rb->idstr, rb->max_length + mr->align,
>> +                                       0, 0, 0, errp);
>> +        }
> 
> We used to have case where RAM_SHARED && rb->fd==-1, I think.

file_ram_alloc is the only other function that assigns to rb->fd, and
hence the only function that cares about fd > 0.  It does not
interfere with memfd_alloc.  All calls to create ram blocks funnel
through these two functions:

qemu_ram_alloc_from_fd()
   ram_block_create()
   file_ram_alloc()

qemu_ram_alloc_internal()
   ram_block_create()
   ram_block_alloc_host()
     if (rb->flags & RAM_SHARED) {
         if (rb->fd == -1) {
             rb->fd = qemu_memfd_create()
         }
         if (rb->fd >= 0) {
             file_ram_alloc(rb->fd)

> We have some code that checks explicitly on rb->fd against -1 to know
> whether it's a fd based.  I'm not sure whether there'll be implications to
> affect those codes.

That's OK, the memfd allocation completely acts like an fd based ramblock.
   rb->fd = qemu_memfd_create()

> Maybe it's mostly fine, OTOH I worry more on the whole idea.  I'm not sure
> whether this is relevant to "we want to be able to share the mem with the
> new process", in this case can we simply require the user to use file based
> memory backends, rather than such change?

That does not work for implicitly created memory regions.

- Steve

>> +        if (rb->fd >= 0) {
>> +            int mfd = rb->fd;
>> +            qemu_set_cloexec(mfd);
>> +            host = file_ram_alloc(rb, rb->max_length, mfd, false, 0, errp);
>> +            trace_qemu_anon_memfd_alloc(rb->idstr, rb->max_length, mfd, host);
>> +        }
>> +
>>       } else {
>>           host = qemu_anon_ram_alloc(rb->max_length, &mr->align,
>>                                      qemu_ram_is_shared(rb),
>> @@ -2106,8 +2120,10 @@ RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
>>                                                        void *host),
>>                                        MemoryRegion *mr, Error **errp)
>>   {
>> +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
>> +    flags |= RAM_RESIZEABLE;
>>       return qemu_ram_alloc_internal(size, maxsz, resized, NULL,
>> -                                   RAM_RESIZEABLE, mr, errp);
>> +                                   flags, mr, errp);
>>   }
>>   
>>   static void reclaim_ramblock(RAMBlock *block)
>> diff --git a/system/trace-events b/system/trace-events
>> index f0a80ba..0092734 100644
>> --- a/system/trace-events
>> +++ b/system/trace-events
>> @@ -41,3 +41,4 @@ dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"P
>>   
>>   # physmem.c
>>   ram_block_create(const char *name, uint32_t flags, int fd, size_t used_length, size_t max_length, size_t align) "%s, flags %u, fd %d, len %lu, maxlen %lu, align %lu"
>> +qemu_anon_memfd_alloc(const char *name, size_t size, int fd, void *ptr) "%s size %zu fd %d -> %p"
>> -- 
>> 1.8.3.1
>>
>
Peter Xu May 29, 2024, 7:14 p.m. UTC | #3
On Wed, May 29, 2024 at 01:31:38PM -0400, Steven Sistare wrote:
> > > diff --git a/system/memory.c b/system/memory.c
> > > index 49f1cb2..ca04a0e 100644
> > > --- a/system/memory.c
> > > +++ b/system/memory.c
> > > @@ -1552,8 +1552,9 @@ bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
> > >                                         uint64_t size,
> > >                                         Error **errp)
> > >   {
> > > +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
> > 
> > If there's a machine option to "use memfd for allocations", then it's
> > shared mem... Hmm..
> > 
> > It is a bit confusing to me in quite a few levels:
> > 
> >    - Why memory allocation method will be defined by a machine property,
> >      even if we have memory-backend-* which should cover everything?
> 
> Some memory regions are implicitly created, and have no explicit representation
> on the qemu command line.  memfd-alloc affects those.
> 
> More generally, memfd-alloc affects all ramblock allocations that are
> not explicitly represented by memory-backend object.  Thus the simple
> command line "qemu -m 1G" does not explicitly describe an object, so it
> goes through the anonymous allocation path, and is affected by memfd-alloc.

Can we simply now allow "qemu -m 1G" to work for cpr-exec?  AFAIU that's
what we do with cpr-reboot: we ask the user to specify the right things to
make other thing work.  Otherwise it won't.

> 
> Internally, create_default_memdev does create a memory-backend object.
> That is what my doc comment above refers to:
>   Any associated memory-backend objects are created with share=on
> 
> An explicit "qemu -object memory-backend-*" is not affected by memfd-alloc.
> 
> The qapi comments in patch "migration: cpr-exec mode" attempt to say all that:
> 
> +#     Memory backend objects must have the share=on attribute, and
> +#     must be mmap'able in the new QEMU process.  For example,
> +#     memory-backend-file is acceptable, but memory-backend-ram is
> +#     not.
> +#
> +#     The VM must be started with the '-machine memfd-alloc=on'
> +#     option.  This causes implicit ram blocks -- those not explicitly
> +#     described by a memory-backend object -- to be allocated by
> +#     mmap'ing a memfd.  Examples include VGA, ROM, and even guest
> +#     RAM when it is specified without a memory-backend object.

VGA is IIRC 16MB chunk, ROM is even smaller.  If the user specifies -object
memory-backend-file,share=on propertly, these should be the only outliers?

Are these important enough for the downtime?  Can we put them into the
migrated image alongside with the rest device states?

> 
> >    - Even if we have such a machine property, why setting "memfd" will
> >      always imply shared?  why not private?  After all it's not called
> >      "memfd-shared-alloc", and we can create private mappings using
> >      e.g. memory-backend-memfd,share=off.
> 
> There is no use case for memfd-alloc with share=off, so no point IMO in
> making the option more verbose.

Unfortunately this fact doesn't make the property easier to understand. :-(

> For cpr, the mapping with all its modifications must be visible to new
> qemu when qemu mmaps it.

So this might be the important part - do you mean migrating
VGA/ROM/... small ramblocks won't work (besides any performance concerns)?
Could you elaborate?

Cpr-reboot already introduced lots of tricky knobs to QEMU.  We may need to
restrict that specialty to minimal, making the interfacing as clear as
possible, or (at least migration) maintainers will start to be soon scared
and running away, if such proposal was not shot down.

In short, I hope when we introduce new knobs for cpr, we shouldn't always
keep cpr-* modes in mind, but consider whenever the user can use it without
cpr-*.  I'm not sure whether it'll be always possible, but we should try.

Thanks,
Steven Sistare May 30, 2024, 5:11 p.m. UTC | #4
On 5/29/2024 3:14 PM, Peter Xu wrote:
> On Wed, May 29, 2024 at 01:31:38PM -0400, Steven Sistare wrote:
>>>> diff --git a/system/memory.c b/system/memory.c
>>>> index 49f1cb2..ca04a0e 100644
>>>> --- a/system/memory.c
>>>> +++ b/system/memory.c
>>>> @@ -1552,8 +1552,9 @@ bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
>>>>                                          uint64_t size,
>>>>                                          Error **errp)
>>>>    {
>>>> +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
>>>
>>> If there's a machine option to "use memfd for allocations", then it's
>>> shared mem... Hmm..
>>>
>>> It is a bit confusing to me in quite a few levels:
>>>
>>>     - Why memory allocation method will be defined by a machine property,
>>>       even if we have memory-backend-* which should cover everything?
>>
>> Some memory regions are implicitly created, and have no explicit representation
>> on the qemu command line.  memfd-alloc affects those.
>>
>> More generally, memfd-alloc affects all ramblock allocations that are
>> not explicitly represented by memory-backend object.  Thus the simple
>> command line "qemu -m 1G" does not explicitly describe an object, so it
>> goes through the anonymous allocation path, and is affected by memfd-alloc.
> 
> Can we simply now allow "qemu -m 1G" to work for cpr-exec?  

I assume you meant "simply not allow".

Yes, I could do that, but I would need to explicitly add code to exclude this
case, and add a blocker.  Right now it "just works" for all paths that lead to
ram_block_alloc_host, without any special logic at the memory-backend level.
And, I'm not convinced that simplifies the docs, as now I would need to tell
the user that "-m 1G" and similar constructions do not work with cpr.

I can try to clarify the doc for -memfd-alloc as currently defined.

> AFAIU that's
> what we do with cpr-reboot: we ask the user to specify the right things to
> make other thing work.  Otherwise it won't.
> 
>>
>> Internally, create_default_memdev does create a memory-backend object.
>> That is what my doc comment above refers to:
>>    Any associated memory-backend objects are created with share=on
>>
>> An explicit "qemu -object memory-backend-*" is not affected by memfd-alloc.
>>
>> The qapi comments in patch "migration: cpr-exec mode" attempt to say all that:
>>
>> +#     Memory backend objects must have the share=on attribute, and
>> +#     must be mmap'able in the new QEMU process.  For example,
>> +#     memory-backend-file is acceptable, but memory-backend-ram is
>> +#     not.
>> +#
>> +#     The VM must be started with the '-machine memfd-alloc=on'
>> +#     option.  This causes implicit ram blocks -- those not explicitly
>> +#     described by a memory-backend object -- to be allocated by
>> +#     mmap'ing a memfd.  Examples include VGA, ROM, and even guest
>> +#     RAM when it is specified without a memory-backend object.
> 
> VGA is IIRC 16MB chunk, ROM is even smaller.  If the user specifies -object
> memory-backend-file,share=on propertly, these should be the only outliers?
> 
> Are these important enough for the downtime?  Can we put them into the
> migrated image alongside with the rest device states?

It's not about downtime.  vfio, vdpa, and iommufd pin all guest pages.
The pages must remain pinned during CPR to support ongoing DMA activity
which could target those pages (which we do not quiesce), and the same
physical pages must be used for the ramblocks in the new qemu process.

>>>     - Even if we have such a machine property, why setting "memfd" will
>>>       always imply shared?  why not private?  After all it's not called
>>>       "memfd-shared-alloc", and we can create private mappings using
>>>       e.g. memory-backend-memfd,share=off.
>>
>> There is no use case for memfd-alloc with share=off, so no point IMO in
>> making the option more verbose.
> 
> Unfortunately this fact doesn't make the property easier to understand. :-( >
>> For cpr, the mapping with all its modifications must be visible to new
>> qemu when qemu mmaps it.
> 
> So this might be the important part - do you mean migrating
> VGA/ROM/... small ramblocks won't work (besides any performance concerns)?
> Could you elaborate?

Pinning.

> Cpr-reboot already introduced lots of tricky knobs to QEMU.  We may need to
> restrict that specialty to minimal, making the interfacing as clear as
> possible, or (at least migration) maintainers will start to be soon scared
> and running away, if such proposal was not shot down.
> 
> In short, I hope when we introduce new knobs for cpr, we shouldn't always
> keep cpr-* modes in mind, but consider whenever the user can use it without
> cpr-*.  I'm not sure whether it'll be always possible, but we should try.

I agree in principle.  FWIW, I have tried to generalize the functionality needed
by cpr so it can be used in other ways: per-mode blockers, per-mode notifiers,
precreate vmstate, factory objects; to base it on migration internals with
minimal change (vmstate); and to make minimal changes in the migration control
paths.

- Steve
Peter Xu May 30, 2024, 6:14 p.m. UTC | #5
On Thu, May 30, 2024 at 01:11:09PM -0400, Steven Sistare wrote:
> On 5/29/2024 3:14 PM, Peter Xu wrote:
> > On Wed, May 29, 2024 at 01:31:38PM -0400, Steven Sistare wrote:
> > > > > diff --git a/system/memory.c b/system/memory.c
> > > > > index 49f1cb2..ca04a0e 100644
> > > > > --- a/system/memory.c
> > > > > +++ b/system/memory.c
> > > > > @@ -1552,8 +1552,9 @@ bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
> > > > >                                          uint64_t size,
> > > > >                                          Error **errp)
> > > > >    {
> > > > > +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
> > > > 
> > > > If there's a machine option to "use memfd for allocations", then it's
> > > > shared mem... Hmm..
> > > > 
> > > > It is a bit confusing to me in quite a few levels:
> > > > 
> > > >     - Why memory allocation method will be defined by a machine property,
> > > >       even if we have memory-backend-* which should cover everything?
> > > 
> > > Some memory regions are implicitly created, and have no explicit representation
> > > on the qemu command line.  memfd-alloc affects those.
> > > 
> > > More generally, memfd-alloc affects all ramblock allocations that are
> > > not explicitly represented by memory-backend object.  Thus the simple
> > > command line "qemu -m 1G" does not explicitly describe an object, so it
> > > goes through the anonymous allocation path, and is affected by memfd-alloc.
> > 
> > Can we simply now allow "qemu -m 1G" to work for cpr-exec?
> 
> I assume you meant "simply not allow".
> 
> Yes, I could do that, but I would need to explicitly add code to exclude this
> case, and add a blocker.  Right now it "just works" for all paths that lead to
> ram_block_alloc_host, without any special logic at the memory-backend level.
> And, I'm not convinced that simplifies the docs, as now I would need to tell
> the user that "-m 1G" and similar constructions do not work with cpr.
> 
> I can try to clarify the doc for -memfd-alloc as currently defined.

Why do we need to keep cpr working for existing qemu cmdlines?  We'll
already need to add more new cmdline options already anyway, right?

cpr-reboot wasn't doing it, and that made sense to me, so that new features
will require the user to opt-in for it, starting with changing its
cmdlines.

> 
> > AFAIU that's
> > what we do with cpr-reboot: we ask the user to specify the right things to
> > make other thing work.  Otherwise it won't.
> > 
> > > 
> > > Internally, create_default_memdev does create a memory-backend object.
> > > That is what my doc comment above refers to:
> > >    Any associated memory-backend objects are created with share=on
> > > 
> > > An explicit "qemu -object memory-backend-*" is not affected by memfd-alloc.
> > > 
> > > The qapi comments in patch "migration: cpr-exec mode" attempt to say all that:
> > > 
> > > +#     Memory backend objects must have the share=on attribute, and
> > > +#     must be mmap'able in the new QEMU process.  For example,
> > > +#     memory-backend-file is acceptable, but memory-backend-ram is
> > > +#     not.
> > > +#
> > > +#     The VM must be started with the '-machine memfd-alloc=on'
> > > +#     option.  This causes implicit ram blocks -- those not explicitly
> > > +#     described by a memory-backend object -- to be allocated by
> > > +#     mmap'ing a memfd.  Examples include VGA, ROM, and even guest
> > > +#     RAM when it is specified without a memory-backend object.
> > 
> > VGA is IIRC 16MB chunk, ROM is even smaller.  If the user specifies -object
> > memory-backend-file,share=on propertly, these should be the only outliers?
> > 
> > Are these important enough for the downtime?  Can we put them into the
> > migrated image alongside with the rest device states?
> 
> It's not about downtime.  vfio, vdpa, and iommufd pin all guest pages.
> The pages must remain pinned during CPR to support ongoing DMA activity
> which could target those pages (which we do not quiesce), and the same
> physical pages must be used for the ramblocks in the new qemu process.

Ah ok, yes DMA can happen on the fly.

Guest mem is definitely the major DMA target and that can be covered by
-object memory-backend-*,shared=on cmdlines.

ROM is definitely not a DMA target.  So is VGA ram a target for, perhaps,
an assigned vGPU device?  Do we have a list of things that will need that?
Can we make them work somehow by sharing them like guest mem?

It'll be a complete tragedy if we introduced this whole thing only because
of some minority.  I want to understand whether there's any generic way to
solve this problem rather than this magical machine property.  IMHO it's
very not trivial to maintain.

> 
> > > >     - Even if we have such a machine property, why setting "memfd" will
> > > >       always imply shared?  why not private?  After all it's not called
> > > >       "memfd-shared-alloc", and we can create private mappings using
> > > >       e.g. memory-backend-memfd,share=off.
> > > 
> > > There is no use case for memfd-alloc with share=off, so no point IMO in
> > > making the option more verbose.
> > 
> > Unfortunately this fact doesn't make the property easier to understand. :-( >
> > > For cpr, the mapping with all its modifications must be visible to new
> > > qemu when qemu mmaps it.
> > 
> > So this might be the important part - do you mean migrating
> > VGA/ROM/... small ramblocks won't work (besides any performance concerns)?
> > Could you elaborate?
> 
> Pinning.
> 
> > Cpr-reboot already introduced lots of tricky knobs to QEMU.  We may need to
> > restrict that specialty to minimal, making the interfacing as clear as
> > possible, or (at least migration) maintainers will start to be soon scared
> > and running away, if such proposal was not shot down.
> > 
> > In short, I hope when we introduce new knobs for cpr, we shouldn't always
> > keep cpr-* modes in mind, but consider whenever the user can use it without
> > cpr-*.  I'm not sure whether it'll be always possible, but we should try.
> 
> I agree in principle.  FWIW, I have tried to generalize the functionality needed
> by cpr so it can be used in other ways: per-mode blockers, per-mode notifiers,
> precreate vmstate, factory objects; to base it on migration internals with
> minimal change (vmstate); and to make minimal changes in the migration control
> paths.

Thanks.

For this one I think reusing -object interface (hopefully without
introducing a knob) would be a great step if that can fully describe what
cpr-exec is looking for.  E.g., when cpr-exec mode enabled it can sanity
check the memory backends making sure all things satisfy its need, and fail
migration otherwise upfront.
Steven Sistare May 31, 2024, 7:32 p.m. UTC | #6
On 5/30/2024 2:14 PM, Peter Xu wrote:
> On Thu, May 30, 2024 at 01:11:09PM -0400, Steven Sistare wrote:
>> On 5/29/2024 3:14 PM, Peter Xu wrote:
>>> On Wed, May 29, 2024 at 01:31:38PM -0400, Steven Sistare wrote:
>>>>>> diff --git a/system/memory.c b/system/memory.c
>>>>>> index 49f1cb2..ca04a0e 100644
>>>>>> --- a/system/memory.c
>>>>>> +++ b/system/memory.c
>>>>>> @@ -1552,8 +1552,9 @@ bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
>>>>>>                                           uint64_t size,
>>>>>>                                           Error **errp)
>>>>>>     {
>>>>>> +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
>>>>>
>>>>> If there's a machine option to "use memfd for allocations", then it's
>>>>> shared mem... Hmm..
>>>>>
>>>>> It is a bit confusing to me in quite a few levels:
>>>>>
>>>>>      - Why memory allocation method will be defined by a machine property,
>>>>>        even if we have memory-backend-* which should cover everything?
>>>>
>>>> Some memory regions are implicitly created, and have no explicit representation
>>>> on the qemu command line.  memfd-alloc affects those.
>>>>
>>>> More generally, memfd-alloc affects all ramblock allocations that are
>>>> not explicitly represented by memory-backend object.  Thus the simple
>>>> command line "qemu -m 1G" does not explicitly describe an object, so it
>>>> goes through the anonymous allocation path, and is affected by memfd-alloc.
>>>
>>> Can we simply now allow "qemu -m 1G" to work for cpr-exec?
>>
>> I assume you meant "simply not allow".
>>
>> Yes, I could do that, but I would need to explicitly add code to exclude this
>> case, and add a blocker.  Right now it "just works" for all paths that lead to
>> ram_block_alloc_host, without any special logic at the memory-backend level.
>> And, I'm not convinced that simplifies the docs, as now I would need to tell
>> the user that "-m 1G" and similar constructions do not work with cpr.
>>
>> I can try to clarify the doc for -memfd-alloc as currently defined.
> 
> Why do we need to keep cpr working for existing qemu cmdlines?  We'll
> already need to add more new cmdline options already anyway, right?
> 
> cpr-reboot wasn't doing it, and that made sense to me, so that new features
> will require the user to opt-in for it, starting with changing its
> cmdlines.

I agree.  We need a new option to opt-in to cpr-friendly memory allocation, and I
am proposing -machine memfd-alloc. I am simply saying that I can try to do a better
job explaining the functionality in my proposed text for memfd-alloc, instead of
changing the functionality to exclude "-m 1G".  I believe excluding "-m 1G" is the
wrong approach, for the reasons I stated - messier implementation *and* documentation.

I am open to different syntax for opting in.

>>> AFAIU that's
>>> what we do with cpr-reboot: we ask the user to specify the right things to
>>> make other thing work.  Otherwise it won't.
>>>
>>>>
>>>> Internally, create_default_memdev does create a memory-backend object.
>>>> That is what my doc comment above refers to:
>>>>     Any associated memory-backend objects are created with share=on
>>>>
>>>> An explicit "qemu -object memory-backend-*" is not affected by memfd-alloc.
>>>>
>>>> The qapi comments in patch "migration: cpr-exec mode" attempt to say all that:
>>>>
>>>> +#     Memory backend objects must have the share=on attribute, and
>>>> +#     must be mmap'able in the new QEMU process.  For example,
>>>> +#     memory-backend-file is acceptable, but memory-backend-ram is
>>>> +#     not.
>>>> +#
>>>> +#     The VM must be started with the '-machine memfd-alloc=on'
>>>> +#     option.  This causes implicit ram blocks -- those not explicitly
>>>> +#     described by a memory-backend object -- to be allocated by
>>>> +#     mmap'ing a memfd.  Examples include VGA, ROM, and even guest
>>>> +#     RAM when it is specified without a memory-backend object.
>>>
>>> VGA is IIRC 16MB chunk, ROM is even smaller.  If the user specifies -object
>>> memory-backend-file,share=on propertly, these should be the only outliers?
>>>
>>> Are these important enough for the downtime?  Can we put them into the
>>> migrated image alongside with the rest device states?
>>
>> It's not about downtime.  vfio, vdpa, and iommufd pin all guest pages.
>> The pages must remain pinned during CPR to support ongoing DMA activity
>> which could target those pages (which we do not quiesce), and the same
>> physical pages must be used for the ramblocks in the new qemu process.
> 
> Ah ok, yes DMA can happen on the fly.
> 
> Guest mem is definitely the major DMA target and that can be covered by
> -object memory-backend-*,shared=on cmdlines.
> 
> ROM is definitely not a DMA target.  So is VGA ram a target for, perhaps,
> an assigned vGPU device?  Do we have a list of things that will need that?
> Can we make them work somehow by sharing them like guest mem?

The pass-through devices map and pin all memory accessible to the guest.
We cannot make exceptions based on our intuition of how the memory will
and will not be used.

Also, we cannot simply abandon the old pinned ramblocks, owned by an mm_struct
that will become a zombie.  We would actually need to write additional code
to call device ioctls to unmap the oddball ramblocks.  It is far cleaner
and more correct to preserve them all.

> It'll be a complete tragedy if we introduced this whole thing only because
> of some minority.  I want to understand whether there's any generic way to
> solve this problem rather than this magical machine property.  IMHO it's
> very not trivial to maintain.

The machine property is the generic way.

A single opt-in option to call memfd_create() is an elegant and effective solution.
The code is small and not hard to maintain.  This is the option patch.  Most of it
is the boiler plate that any option has, and the single code location that formerly
called qemu_anon_ram_alloc now optionally calls qemu_memfd_create:

   machine: memfd-alloc option             25 insertions(+), 28 deletions(-)

These patches are simply stylistic and modularity improvements for ramblock,
valuable in their own right, which allows the previous patch to be small and clean.

   physmem: ram_block_create               29 insertions(+), 21 deletions(-)
   physmem: hoist guest_memfd creation     48 insertions(+), 37 deletions(-)
   physmem: hoist host memory allocation   36 insertions(+), 44 deletions(-)
   physmem: set ram block idstr earlier    25 insertions(+), 28 deletions(-)

>>>>>      - Even if we have such a machine property, why setting "memfd" will
>>>>>        always imply shared?  why not private?  After all it's not called
>>>>>        "memfd-shared-alloc", and we can create private mappings using
>>>>>        e.g. memory-backend-memfd,share=off.
>>>>
>>>> There is no use case for memfd-alloc with share=off, so no point IMO in
>>>> making the option more verbose.
>>>
>>> Unfortunately this fact doesn't make the property easier to understand. :-( >
>>>> For cpr, the mapping with all its modifications must be visible to new
>>>> qemu when qemu mmaps it.
>>>
>>> So this might be the important part - do you mean migrating
>>> VGA/ROM/... small ramblocks won't work (besides any performance concerns)?
>>> Could you elaborate?
>>
>> Pinning.
>>
>>> Cpr-reboot already introduced lots of tricky knobs to QEMU.  We may need to
>>> restrict that specialty to minimal, making the interfacing as clear as
>>> possible, or (at least migration) maintainers will start to be soon scared
>>> and running away, if such proposal was not shot down.
>>>
>>> In short, I hope when we introduce new knobs for cpr, we shouldn't always
>>> keep cpr-* modes in mind, but consider whenever the user can use it without
>>> cpr-*.  I'm not sure whether it'll be always possible, but we should try.
>>
>> I agree in principle.  FWIW, I have tried to generalize the functionality needed
>> by cpr so it can be used in other ways: per-mode blockers, per-mode notifiers,
>> precreate vmstate, factory objects; to base it on migration internals with
>> minimal change (vmstate); and to make minimal changes in the migration control
>> paths.
> 
> Thanks.
> 
> For this one I think reusing -object interface (hopefully without
> introducing a knob) would be a great step if that can fully describe what
> cpr-exec is looking for.  E.g., when cpr-exec mode enabled it can sanity
> check the memory backends making sure all things satisfy its need, and fail
> migration otherwise upfront.

For '-object memory-backend-*', I can tell whether cpr is allowed or not
without additional knobs.  See the blocker patches for examples where cpr
is blocked.

The problem is the implicit ramblocks that currently call qemu_ram_alloc_internal.

- Steve
Daniel P. Berrangé June 3, 2024, 10:17 a.m. UTC | #7
On Wed, May 29, 2024 at 01:31:38PM -0400, Steven Sistare wrote:
> On 5/28/2024 5:12 PM, Peter Xu wrote:
> > On Mon, Apr 29, 2024 at 08:55:26AM -0700, Steve Sistare wrote:
> > > Allocate anonymous memory using memfd_create if the memfd-alloc machine
> > > option is set.
> > > 
> > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> > > ---
> > >   hw/core/machine.c   | 22 ++++++++++++++++++++++
> > >   include/hw/boards.h |  1 +
> > >   qemu-options.hx     |  6 ++++++
> > >   system/memory.c     |  9 ++++++---
> > >   system/physmem.c    | 18 +++++++++++++++++-
> > >   system/trace-events |  1 +
> > >   6 files changed, 53 insertions(+), 4 deletions(-)

> > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > index cf61f6b..f0dfda5 100644
> > > --- a/qemu-options.hx
> > > +++ b/qemu-options.hx
> > > @@ -32,6 +32,7 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
> > >       "                vmport=on|off|auto controls emulation of vmport (default: auto)\n"
> > >       "                dump-guest-core=on|off include guest memory in a core dump (default=on)\n"
> > >       "                mem-merge=on|off controls memory merge support (default: on)\n"
> > > +    "                memfd-alloc=on|off controls allocating anonymous guest RAM using memfd_create (default: off)\n"
> > >       "                aes-key-wrap=on|off controls support for AES key wrapping (default=on)\n"
> > >       "                dea-key-wrap=on|off controls support for DEA key wrapping (default=on)\n"
> > >       "                suppress-vmdesc=on|off disables self-describing migration (default=off)\n"
> > > @@ -79,6 +80,11 @@ SRST
> > >           supported by the host, de-duplicates identical memory pages
> > >           among VMs instances (enabled by default).
> > > +    ``memfd-alloc=on|off``
> > > +        Enables or disables allocation of anonymous guest RAM using
> > > +        memfd_create.  Any associated memory-backend objects are created with
> > > +        share=on.  The memfd-alloc default is off.
> > > +
> > >       ``aes-key-wrap=on|off``
> > >           Enables or disables AES key wrapping support on s390-ccw hosts.
> > >           This feature controls whether AES wrapping keys will be created
> > > diff --git a/system/memory.c b/system/memory.c
> > > index 49f1cb2..ca04a0e 100644
> > > --- a/system/memory.c
> > > +++ b/system/memory.c
> > > @@ -1552,8 +1552,9 @@ bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
> > >                                         uint64_t size,
> > >                                         Error **errp)
> > >   {
> > > +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
> > 
> > If there's a machine option to "use memfd for allocations", then it's
> > shared mem... Hmm..
> > 
> > It is a bit confusing to me in quite a few levels:
> > 
> >    - Why memory allocation method will be defined by a machine property,
> >      even if we have memory-backend-* which should cover everything?
> 
> Some memory regions are implicitly created, and have no explicit representation
> on the qemu command line.  memfd-alloc affects those.
> 
> More generally, memfd-alloc affects all ramblock allocations that are
> not explicitly represented by memory-backend object.  Thus the simple
> command line "qemu -m 1G" does not explicitly describe an object, so it
> goes through the anonymous allocation path, and is affected by memfd-alloc.
> 
> Internally, create_default_memdev does create a memory-backend object.
> That is what my doc comment above refers to:
>   Any associated memory-backend objects are created with share=on
> 
> An explicit "qemu -object memory-backend-*" is not affected by memfd-alloc.
> 
> The qapi comments in patch "migration: cpr-exec mode" attempt to say all that:
> 
> +#     Memory backend objects must have the share=on attribute, and
> +#     must be mmap'able in the new QEMU process.  For example,
> +#     memory-backend-file is acceptable, but memory-backend-ram is
> +#     not.
> +#
> +#     The VM must be started with the '-machine memfd-alloc=on'
> +#     option.  This causes implicit ram blocks -- those not explicitly
> +#     described by a memory-backend object -- to be allocated by
> +#     mmap'ing a memfd.  Examples include VGA, ROM, and even guest
> +#     RAM when it is specified without a memory-backend object.
> 
> >    - Even if we have such a machine property, why setting "memfd" will
> >      always imply shared?  why not private?  After all it's not called
> >      "memfd-shared-alloc", and we can create private mappings using
> >      e.g. memory-backend-memfd,share=off.
> 
> There is no use case for memfd-alloc with share=off, so no point IMO in
> making the option more verbose.  For cpr, the mapping with all its modifications
> must be visible to new qemu when qemu mmaps it.


So IIUC, cpr doesn't care about the use of 'memfd' as the specific impl,
it only cares that the memory is share=on.

Rather than having a machine type option "memfd-alloc" which is named after
a Linux specific impl detail, how about having a machine type option
"mem-share=on", which just happens to trigger use of memfd internally on
Linux ? That gives us freedom to use non-memfd options if appropriate in
the future.

With regards,
Daniel
Steven Sistare June 3, 2024, 11:59 a.m. UTC | #8
On 6/3/2024 6:17 AM, Daniel P. Berrangé wrote:
> On Wed, May 29, 2024 at 01:31:38PM -0400, Steven Sistare wrote:
>> On 5/28/2024 5:12 PM, Peter Xu wrote:
>>> On Mon, Apr 29, 2024 at 08:55:26AM -0700, Steve Sistare wrote:
>>>> Allocate anonymous memory using memfd_create if the memfd-alloc machine
>>>> option is set.
>>>>
>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>>> ---
>>>>    hw/core/machine.c   | 22 ++++++++++++++++++++++
>>>>    include/hw/boards.h |  1 +
>>>>    qemu-options.hx     |  6 ++++++
>>>>    system/memory.c     |  9 ++++++---
>>>>    system/physmem.c    | 18 +++++++++++++++++-
>>>>    system/trace-events |  1 +
>>>>    6 files changed, 53 insertions(+), 4 deletions(-)
> 
>>>> diff --git a/qemu-options.hx b/qemu-options.hx
>>>> index cf61f6b..f0dfda5 100644
>>>> --- a/qemu-options.hx
>>>> +++ b/qemu-options.hx
>>>> @@ -32,6 +32,7 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
>>>>        "                vmport=on|off|auto controls emulation of vmport (default: auto)\n"
>>>>        "                dump-guest-core=on|off include guest memory in a core dump (default=on)\n"
>>>>        "                mem-merge=on|off controls memory merge support (default: on)\n"
>>>> +    "                memfd-alloc=on|off controls allocating anonymous guest RAM using memfd_create (default: off)\n"
>>>>        "                aes-key-wrap=on|off controls support for AES key wrapping (default=on)\n"
>>>>        "                dea-key-wrap=on|off controls support for DEA key wrapping (default=on)\n"
>>>>        "                suppress-vmdesc=on|off disables self-describing migration (default=off)\n"
>>>> @@ -79,6 +80,11 @@ SRST
>>>>            supported by the host, de-duplicates identical memory pages
>>>>            among VMs instances (enabled by default).
>>>> +    ``memfd-alloc=on|off``
>>>> +        Enables or disables allocation of anonymous guest RAM using
>>>> +        memfd_create.  Any associated memory-backend objects are created with
>>>> +        share=on.  The memfd-alloc default is off.
>>>> +
>>>>        ``aes-key-wrap=on|off``
>>>>            Enables or disables AES key wrapping support on s390-ccw hosts.
>>>>            This feature controls whether AES wrapping keys will be created
>>>> diff --git a/system/memory.c b/system/memory.c
>>>> index 49f1cb2..ca04a0e 100644
>>>> --- a/system/memory.c
>>>> +++ b/system/memory.c
>>>> @@ -1552,8 +1552,9 @@ bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
>>>>                                          uint64_t size,
>>>>                                          Error **errp)
>>>>    {
>>>> +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
>>>
>>> If there's a machine option to "use memfd for allocations", then it's
>>> shared mem... Hmm..
>>>
>>> It is a bit confusing to me in quite a few levels:
>>>
>>>     - Why memory allocation method will be defined by a machine property,
>>>       even if we have memory-backend-* which should cover everything?
>>
>> Some memory regions are implicitly created, and have no explicit representation
>> on the qemu command line.  memfd-alloc affects those.
>>
>> More generally, memfd-alloc affects all ramblock allocations that are
>> not explicitly represented by memory-backend object.  Thus the simple
>> command line "qemu -m 1G" does not explicitly describe an object, so it
>> goes through the anonymous allocation path, and is affected by memfd-alloc.
>>
>> Internally, create_default_memdev does create a memory-backend object.
>> That is what my doc comment above refers to:
>>    Any associated memory-backend objects are created with share=on
>>
>> An explicit "qemu -object memory-backend-*" is not affected by memfd-alloc.
>>
>> The qapi comments in patch "migration: cpr-exec mode" attempt to say all that:
>>
>> +#     Memory backend objects must have the share=on attribute, and
>> +#     must be mmap'able in the new QEMU process.  For example,
>> +#     memory-backend-file is acceptable, but memory-backend-ram is
>> +#     not.
>> +#
>> +#     The VM must be started with the '-machine memfd-alloc=on'
>> +#     option.  This causes implicit ram blocks -- those not explicitly
>> +#     described by a memory-backend object -- to be allocated by
>> +#     mmap'ing a memfd.  Examples include VGA, ROM, and even guest
>> +#     RAM when it is specified without a memory-backend object.
>>
>>>     - Even if we have such a machine property, why setting "memfd" will
>>>       always imply shared?  why not private?  After all it's not called
>>>       "memfd-shared-alloc", and we can create private mappings using
>>>       e.g. memory-backend-memfd,share=off.
>>
>> There is no use case for memfd-alloc with share=off, so no point IMO in
>> making the option more verbose.  For cpr, the mapping with all its modifications
>> must be visible to new qemu when qemu mmaps it.
> 
> 
> So IIUC, cpr doesn't care about the use of 'memfd' as the specific impl,
> it only cares that the memory is share=on.
> 
> Rather than having a machine type option "memfd-alloc" which is named after
> a Linux specific impl detail, how about having a machine type option
> "mem-share=on", which just happens to trigger use of memfd internally on
> Linux ? That gives us freedom to use non-memfd options if appropriate in
> the future.

That would be fine.  Internally we still need a mechanism to preserve the
memory and name it so qemu can mmap it post-exec, but in theory we could
invent some other mechanism to do so, such as creating /dev/shm files with
canonical names.

- Steve
Peter Xu June 3, 2024, 9:48 p.m. UTC | #9
On Fri, May 31, 2024 at 03:32:05PM -0400, Steven Sistare wrote:
> On 5/30/2024 2:14 PM, Peter Xu wrote:
> > On Thu, May 30, 2024 at 01:11:09PM -0400, Steven Sistare wrote:
> > > On 5/29/2024 3:14 PM, Peter Xu wrote:
> > > > On Wed, May 29, 2024 at 01:31:38PM -0400, Steven Sistare wrote:
> > > > > > > diff --git a/system/memory.c b/system/memory.c
> > > > > > > index 49f1cb2..ca04a0e 100644
> > > > > > > --- a/system/memory.c
> > > > > > > +++ b/system/memory.c
> > > > > > > @@ -1552,8 +1552,9 @@ bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
> > > > > > >                                           uint64_t size,
> > > > > > >                                           Error **errp)
> > > > > > >     {
> > > > > > > +    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
> > > > > > 
> > > > > > If there's a machine option to "use memfd for allocations", then it's
> > > > > > shared mem... Hmm..
> > > > > > 
> > > > > > It is a bit confusing to me in quite a few levels:
> > > > > > 
> > > > > >      - Why memory allocation method will be defined by a machine property,
> > > > > >        even if we have memory-backend-* which should cover everything?
> > > > > 
> > > > > Some memory regions are implicitly created, and have no explicit representation
> > > > > on the qemu command line.  memfd-alloc affects those.
> > > > > 
> > > > > More generally, memfd-alloc affects all ramblock allocations that are
> > > > > not explicitly represented by memory-backend object.  Thus the simple
> > > > > command line "qemu -m 1G" does not explicitly describe an object, so it
> > > > > goes through the anonymous allocation path, and is affected by memfd-alloc.
> > > > 
> > > > Can we simply now allow "qemu -m 1G" to work for cpr-exec?
> > > 
> > > I assume you meant "simply not allow".
> > > 
> > > Yes, I could do that, but I would need to explicitly add code to exclude this
> > > case, and add a blocker.  Right now it "just works" for all paths that lead to
> > > ram_block_alloc_host, without any special logic at the memory-backend level.
> > > And, I'm not convinced that simplifies the docs, as now I would need to tell
> > > the user that "-m 1G" and similar constructions do not work with cpr.
> > > 
> > > I can try to clarify the doc for -memfd-alloc as currently defined.
> > 
> > Why do we need to keep cpr working for existing qemu cmdlines?  We'll
> > already need to add more new cmdline options already anyway, right?
> > 
> > cpr-reboot wasn't doing it, and that made sense to me, so that new features
> > will require the user to opt-in for it, starting with changing its
> > cmdlines.
> 
> I agree.  We need a new option to opt-in to cpr-friendly memory allocation, and I
> am proposing -machine memfd-alloc. I am simply saying that I can try to do a better
> job explaining the functionality in my proposed text for memfd-alloc, instead of
> changing the functionality to exclude "-m 1G".  I believe excluding "-m 1G" is the
> wrong approach, for the reasons I stated - messier implementation *and* documentation.
> 
> I am open to different syntax for opting in.

If the machine property is the only way to go then I agree that might be a
good idea, even though the name can be further discussed.  But I still want
to figure out something below.

> 
> > > > AFAIU that's
> > > > what we do with cpr-reboot: we ask the user to specify the right things to
> > > > make other thing work.  Otherwise it won't.
> > > > 
> > > > > 
> > > > > Internally, create_default_memdev does create a memory-backend object.
> > > > > That is what my doc comment above refers to:
> > > > >     Any associated memory-backend objects are created with share=on
> > > > > 
> > > > > An explicit "qemu -object memory-backend-*" is not affected by memfd-alloc.
> > > > > 
> > > > > The qapi comments in patch "migration: cpr-exec mode" attempt to say all that:
> > > > > 
> > > > > +#     Memory backend objects must have the share=on attribute, and
> > > > > +#     must be mmap'able in the new QEMU process.  For example,
> > > > > +#     memory-backend-file is acceptable, but memory-backend-ram is
> > > > > +#     not.
> > > > > +#
> > > > > +#     The VM must be started with the '-machine memfd-alloc=on'
> > > > > +#     option.  This causes implicit ram blocks -- those not explicitly
> > > > > +#     described by a memory-backend object -- to be allocated by
> > > > > +#     mmap'ing a memfd.  Examples include VGA, ROM, and even guest
> > > > > +#     RAM when it is specified without a memory-backend object.
> > > > 
> > > > VGA is IIRC 16MB chunk, ROM is even smaller.  If the user specifies -object
> > > > memory-backend-file,share=on propertly, these should be the only outliers?
> > > > 
> > > > Are these important enough for the downtime?  Can we put them into the
> > > > migrated image alongside with the rest device states?
> > > 
> > > It's not about downtime.  vfio, vdpa, and iommufd pin all guest pages.
> > > The pages must remain pinned during CPR to support ongoing DMA activity
> > > which could target those pages (which we do not quiesce), and the same
> > > physical pages must be used for the ramblocks in the new qemu process.
> > 
> > Ah ok, yes DMA can happen on the fly.
> > 
> > Guest mem is definitely the major DMA target and that can be covered by
> > -object memory-backend-*,shared=on cmdlines.
> > 
> > ROM is definitely not a DMA target.  So is VGA ram a target for, perhaps,
> > an assigned vGPU device?  Do we have a list of things that will need that?
> > Can we make them work somehow by sharing them like guest mem?
> 
> The pass-through devices map and pin all memory accessible to the guest.
> We cannot make exceptions based on our intuition of how the memory will
> and will not be used.

True, but if you see my whole point is trying to see whether we can get rid
of the machine property completely, and it's also because we're just so
close to getting rid of it.. which I feel.

QEMU memory layout can be complicated across all the platforms, but I doubt
whether that's true for CPR's purpose and archs that it plans to support.
A generic VM's memory topology shouldn't be that complicated at all,
e.g. on x86:

              Block Name    PSize              Offset               Used              Total                HVA  RO
                  pc.ram    4 KiB  0x0000000000000000 0x0000000008000000 0x0000000008000000 0x00007f684fe00000  rw
   0000:00:02.0/vga.vram    4 KiB  0x0000000008080000 0x0000000001000000 0x0000000001000000 0x00007f684e800000  rw
    /rom@etc/acpi/tables    4 KiB  0x0000000009100000 0x0000000000020000 0x0000000000200000 0x00007f684dc00000  ro
                 pc.bios    4 KiB  0x0000000008000000 0x0000000000040000 0x0000000000040000 0x00007f68dc200000  ro
  0000:00:03.0/e1000.rom    4 KiB  0x00000000090c0000 0x0000000000040000 0x0000000000040000 0x00007f684e000000  ro
                  pc.rom    4 KiB  0x0000000008040000 0x0000000000020000 0x0000000000020000 0x00007f684fa00000  ro
    0000:00:02.0/vga.rom    4 KiB  0x0000000009080000 0x0000000000010000 0x0000000000010000 0x00007f684e400000  ro
   /rom@etc/table-loader    4 KiB  0x0000000009300000 0x0000000000001000 0x0000000000010000 0x00007f684d800000  ro
      /rom@etc/acpi/rsdp    4 KiB  0x0000000009340000 0x0000000000001000 0x0000000000001000 0x00007f684d400000  ro

It's simply the major ram, ROMs, and VGA.

I hoped it can work without it, I'll mention one more reason below.

So if we're going to have this machine property, can I still request a
possible list of things that can be the target of this property besides
guest mem?  I just want to know where it can be used "for real".  I know it
might be complicated, but maybe not.  I really can only think of VGA, and I
doubt whether that should be DMAed at all.

> 
> Also, we cannot simply abandon the old pinned ramblocks, owned by an mm_struct
> that will become a zombie.  We would actually need to write additional code
> to call device ioctls to unmap the oddball ramblocks.  It is far cleaner
> and more correct to preserve them all.
> 
> > It'll be a complete tragedy if we introduced this whole thing only because
> > of some minority.  I want to understand whether there's any generic way to
> > solve this problem rather than this magical machine property.  IMHO it's
> > very not trivial to maintain.
> 
> The machine property is the generic way.
> 
> A single opt-in option to call memfd_create() is an elegant and effective solution.
> The code is small and not hard to maintain.  This is the option patch.  Most of it
> is the boiler plate that any option has, and the single code location that formerly
> called qemu_anon_ram_alloc now optionally calls qemu_memfd_create:
> 
>   machine: memfd-alloc option             25 insertions(+), 28 deletions(-)
> 
> These patches are simply stylistic and modularity improvements for ramblock,
> valuable in their own right, which allows the previous patch to be small and clean.
> 
>   physmem: ram_block_create               29 insertions(+), 21 deletions(-)
>   physmem: hoist guest_memfd creation     48 insertions(+), 37 deletions(-)
>   physmem: hoist host memory allocation   36 insertions(+), 44 deletions(-)
>   physmem: set ram block idstr earlier    25 insertions(+), 28 deletions(-)

Let me explain the other reason why I don't want to have that machine
property..

That property, irrelevant of what it is called (and I doubt whether Dan's
suggestion on "shared-ram" is good, e.g. mmap(MAP_SHARED) doesn't have user
visible fd but it's shared-ram for sure..), is yet another way to specify
guest mem types.

What if the user specified this property but specified something else in
the -object parameters?  E.g. -machine share-ram=on -object
memory-backend-ram,share=off.  What should we do?

Fundamentally that's also why I "hope" we can avoid adding yet one more
place configure guest mem, and stick with -objects.

> 
> > > > > >      - Even if we have such a machine property, why setting "memfd" will
> > > > > >        always imply shared?  why not private?  After all it's not called
> > > > > >        "memfd-shared-alloc", and we can create private mappings using
> > > > > >        e.g. memory-backend-memfd,share=off.
> > > > > 
> > > > > There is no use case for memfd-alloc with share=off, so no point IMO in
> > > > > making the option more verbose.
> > > > 
> > > > Unfortunately this fact doesn't make the property easier to understand. :-( >
> > > > > For cpr, the mapping with all its modifications must be visible to new
> > > > > qemu when qemu mmaps it.
> > > > 
> > > > So this might be the important part - do you mean migrating
> > > > VGA/ROM/... small ramblocks won't work (besides any performance concerns)?
> > > > Could you elaborate?
> > > 
> > > Pinning.
> > > 
> > > > Cpr-reboot already introduced lots of tricky knobs to QEMU.  We may need to
> > > > restrict that specialty to minimal, making the interfacing as clear as
> > > > possible, or (at least migration) maintainers will start to be soon scared
> > > > and running away, if such proposal was not shot down.
> > > > 
> > > > In short, I hope when we introduce new knobs for cpr, we shouldn't always
> > > > keep cpr-* modes in mind, but consider whenever the user can use it without
> > > > cpr-*.  I'm not sure whether it'll be always possible, but we should try.
> > > 
> > > I agree in principle.  FWIW, I have tried to generalize the functionality needed
> > > by cpr so it can be used in other ways: per-mode blockers, per-mode notifiers,
> > > precreate vmstate, factory objects; to base it on migration internals with
> > > minimal change (vmstate); and to make minimal changes in the migration control
> > > paths.
> > 
> > Thanks.
> > 
> > For this one I think reusing -object interface (hopefully without
> > introducing a knob) would be a great step if that can fully describe what
> > cpr-exec is looking for.  E.g., when cpr-exec mode enabled it can sanity
> > check the memory backends making sure all things satisfy its need, and fail
> > migration otherwise upfront.
> 
> For '-object memory-backend-*', I can tell whether cpr is allowed or not
> without additional knobs.  See the blocker patches for examples where cpr
> is blocked.
> 
> The problem is the implicit ramblocks that currently call qemu_ram_alloc_internal.

I hope it won't ever happen that we introduced this property because "we
don't know", then when we know we found no real RAM regions are using it if
properly specify -object.  So I hope at least we know what we're doing, and
for explicitly what reason..

Let me know if you think I'm asking too much (which is possible.. :).  But I
hope in case that might be interesting to you too to know the real answer.

Thanks,
Daniel P. Berrangé June 4, 2024, 7:13 a.m. UTC | #10
On Mon, Jun 03, 2024 at 05:48:32PM -0400, Peter Xu wrote:
> That property, irrelevant of what it is called (and I doubt whether Dan's
> suggestion on "shared-ram" is good, e.g. mmap(MAP_SHARED) doesn't have user
> visible fd but it's shared-ram for sure..), is yet another way to specify
> guest mem types.
> 
> What if the user specified this property but specified something else in
> the -object parameters?  E.g. -machine share-ram=on -object
> memory-backend-ram,share=off.  What should we do?

The machine property would only apply to memory regions that are
*NOT* being created via -object. The memory-backend objects would
always honour their own share settnig.

With regards,
Daniel
Peter Xu June 4, 2024, 3:58 p.m. UTC | #11
On Tue, Jun 04, 2024 at 08:13:26AM +0100, Daniel P. Berrangé wrote:
> On Mon, Jun 03, 2024 at 05:48:32PM -0400, Peter Xu wrote:
> > That property, irrelevant of what it is called (and I doubt whether Dan's
> > suggestion on "shared-ram" is good, e.g. mmap(MAP_SHARED) doesn't have user
> > visible fd but it's shared-ram for sure..), is yet another way to specify
> > guest mem types.
> > 
> > What if the user specified this property but specified something else in
> > the -object parameters?  E.g. -machine share-ram=on -object
> > memory-backend-ram,share=off.  What should we do?
> 
> The machine property would only apply to memory regions that are
> *NOT* being created via -object. The memory-backend objects would
> always honour their own share settnig.

In that case we may want to rename that to share-ram-by-default=on.
Otherwise it's not clear which one would take effect from an user POV, even
if we can define it like that in the code.

Even with share-ram-by-default=on, it can be still confusing in some form
or another. Consider this cmdline:

  -machine q35,share-ram-by-default=on -object memory-backend-ram,id=mem1

Then is mem1 shared or not?  From reading the cmdline, if share ram by
default it should be ON if we don't specify it, but it's actually off?
It's because -object has its own default values.

IMHO fundamentally it's just controversial to have two ways to configure
guest memory.  If '-object' is the preferred and complete way to configure
it, I prefer sticking with it if possible and see what is missing.

I think I raised that as the other major reason too, that I think it's so
far only about the vram that is out of the picture here.  We don't and
shouldn't have complicated RW RAMs floating around that we want this
property to cover.  We should make sure we introduce this property with
some good reason, rather than "ok we don't know what happened, we don't
know what will leverage this, but maybe there is some floating RAMs..".
Right after we introduce this property we need to carry it forever, and
prepared people start to use it with/without cpr, and that's some
maintenance cost that I want to see whether we can avoid.

Thanks,
David Hildenbrand June 4, 2024, 4:14 p.m. UTC | #12
On 04.06.24 17:58, Peter Xu wrote:
> On Tue, Jun 04, 2024 at 08:13:26AM +0100, Daniel P. Berrangé wrote:
>> On Mon, Jun 03, 2024 at 05:48:32PM -0400, Peter Xu wrote:
>>> That property, irrelevant of what it is called (and I doubt whether Dan's
>>> suggestion on "shared-ram" is good, e.g. mmap(MAP_SHARED) doesn't have user
>>> visible fd but it's shared-ram for sure..), is yet another way to specify
>>> guest mem types.
>>>
>>> What if the user specified this property but specified something else in
>>> the -object parameters?  E.g. -machine share-ram=on -object
>>> memory-backend-ram,share=off.  What should we do?
>>
>> The machine property would only apply to memory regions that are
>> *NOT* being created via -object. The memory-backend objects would
>> always honour their own share settnig.
> 
> In that case we may want to rename that to share-ram-by-default=on.
> Otherwise it's not clear which one would take effect from an user POV, even
> if we can define it like that in the code.
> 
> Even with share-ram-by-default=on, it can be still confusing in some form
> or another. Consider this cmdline:
> 
>    -machine q35,share-ram-by-default=on -object memory-backend-ram,id=mem1
> 
> Then is mem1 shared or not?  From reading the cmdline, if share ram by
> default it should be ON if we don't specify it, but it's actually off?
> It's because -object has its own default values.

We do have something similar with "merge" and "dump" properties. See 
machine_mem_merge() / machine_dump_guest_core().

These correspond to the "mem-merge" and "dump-guest-core" machine 
properties.

But ...

> 
> IMHO fundamentally it's just controversial to have two ways to configure
> guest memory.  If '-object' is the preferred and complete way to configure
> it, I prefer sticking with it if possible and see what is missing.

... I agree with that. With vhost-user we also require a reasonable 
configuration (using proper fd-based shared memory) for it to work.

> 
> I think I raised that as the other major reason too, that I think it's so
> far only about the vram that is out of the picture here.  We don't and
> shouldn't have complicated RW RAMs floating around that we want this
> property to cover.

Agreed. And maybe we can still keep migration of any MAP_PRIVATE thing 
working by migrating that memory? CPR will be "slightly less fast".

But the biggest piece -- guest RAM -- will be migrated via the fd directly.
Peter Xu June 4, 2024, 4:41 p.m. UTC | #13
On Tue, Jun 04, 2024 at 06:14:08PM +0200, David Hildenbrand wrote:
> On 04.06.24 17:58, Peter Xu wrote:
> > On Tue, Jun 04, 2024 at 08:13:26AM +0100, Daniel P. Berrangé wrote:
> > > On Mon, Jun 03, 2024 at 05:48:32PM -0400, Peter Xu wrote:
> > > > That property, irrelevant of what it is called (and I doubt whether Dan's
> > > > suggestion on "shared-ram" is good, e.g. mmap(MAP_SHARED) doesn't have user
> > > > visible fd but it's shared-ram for sure..), is yet another way to specify
> > > > guest mem types.
> > > > 
> > > > What if the user specified this property but specified something else in
> > > > the -object parameters?  E.g. -machine share-ram=on -object
> > > > memory-backend-ram,share=off.  What should we do?
> > > 
> > > The machine property would only apply to memory regions that are
> > > *NOT* being created via -object. The memory-backend objects would
> > > always honour their own share settnig.
> > 
> > In that case we may want to rename that to share-ram-by-default=on.
> > Otherwise it's not clear which one would take effect from an user POV, even
> > if we can define it like that in the code.
> > 
> > Even with share-ram-by-default=on, it can be still confusing in some form
> > or another. Consider this cmdline:
> > 
> >    -machine q35,share-ram-by-default=on -object memory-backend-ram,id=mem1
> > 
> > Then is mem1 shared or not?  From reading the cmdline, if share ram by
> > default it should be ON if we don't specify it, but it's actually off?
> > It's because -object has its own default values.
> 
> We do have something similar with "merge" and "dump" properties. See
> machine_mem_merge() / machine_dump_guest_core().
> 
> These correspond to the "mem-merge" and "dump-guest-core" machine
> properties.

These look fine so far, as long as -object cmdline doesn't allow to specify
the same thing again.

> 
> But ...
> 
> > 
> > IMHO fundamentally it's just controversial to have two ways to configure
> > guest memory.  If '-object' is the preferred and complete way to configure
> > it, I prefer sticking with it if possible and see what is missing.
> 
> ... I agree with that. With vhost-user we also require a reasonable
> configuration (using proper fd-based shared memory) for it to work.
> 
> > 
> > I think I raised that as the other major reason too, that I think it's so
> > far only about the vram that is out of the picture here.  We don't and
> > shouldn't have complicated RW RAMs floating around that we want this
> > property to cover.
> 
> Agreed. And maybe we can still keep migration of any MAP_PRIVATE thing
> working by migrating that memory? CPR will be "slightly less fast".
> 
> But the biggest piece -- guest RAM -- will be migrated via the fd directly.

I think it should work but only without VFIO.  When with VFIO there must
have no private pages at all or migrating is racy with concurrent DMAs
(yes, AFAICT CPR can run migration with DMA running..).

CPR has a pretty tricky way of using VFIO pgtables in that it requires the
PFNs to not change before/after migration.  Feel free to have a look at
VFIO_DMA_MAP_FLAG_VADDR in vfio.h then you may get a feeling of it.

Thanks,
David Hildenbrand June 4, 2024, 5:16 p.m. UTC | #14
On 04.06.24 18:41, Peter Xu wrote:
> On Tue, Jun 04, 2024 at 06:14:08PM +0200, David Hildenbrand wrote:
>> On 04.06.24 17:58, Peter Xu wrote:
>>> On Tue, Jun 04, 2024 at 08:13:26AM +0100, Daniel P. Berrangé wrote:
>>>> On Mon, Jun 03, 2024 at 05:48:32PM -0400, Peter Xu wrote:
>>>>> That property, irrelevant of what it is called (and I doubt whether Dan's
>>>>> suggestion on "shared-ram" is good, e.g. mmap(MAP_SHARED) doesn't have user
>>>>> visible fd but it's shared-ram for sure..), is yet another way to specify
>>>>> guest mem types.
>>>>>
>>>>> What if the user specified this property but specified something else in
>>>>> the -object parameters?  E.g. -machine share-ram=on -object
>>>>> memory-backend-ram,share=off.  What should we do?
>>>>
>>>> The machine property would only apply to memory regions that are
>>>> *NOT* being created via -object. The memory-backend objects would
>>>> always honour their own share settnig.
>>>
>>> In that case we may want to rename that to share-ram-by-default=on.
>>> Otherwise it's not clear which one would take effect from an user POV, even
>>> if we can define it like that in the code.
>>>
>>> Even with share-ram-by-default=on, it can be still confusing in some form
>>> or another. Consider this cmdline:
>>>
>>>     -machine q35,share-ram-by-default=on -object memory-backend-ram,id=mem1
>>>
>>> Then is mem1 shared or not?  From reading the cmdline, if share ram by
>>> default it should be ON if we don't specify it, but it's actually off?
>>> It's because -object has its own default values.
>>
>> We do have something similar with "merge" and "dump" properties. See
>> machine_mem_merge() / machine_dump_guest_core().
>>
>> These correspond to the "mem-merge" and "dump-guest-core" machine
>> properties.
> 
> These look fine so far, as long as -object cmdline doesn't allow to specify
> the same thing again.
> 

You can. The mem-merge / dump-guest-core set the default that can be 
modified per memory backend (merge / dump properties).

>>
>> But ...
>>
>>>
>>> IMHO fundamentally it's just controversial to have two ways to configure
>>> guest memory.  If '-object' is the preferred and complete way to configure
>>> it, I prefer sticking with it if possible and see what is missing.
>>
>> ... I agree with that. With vhost-user we also require a reasonable
>> configuration (using proper fd-based shared memory) for it to work.
>>
>>>
>>> I think I raised that as the other major reason too, that I think it's so
>>> far only about the vram that is out of the picture here.  We don't and
>>> shouldn't have complicated RW RAMs floating around that we want this
>>> property to cover.
>>
>> Agreed. And maybe we can still keep migration of any MAP_PRIVATE thing
>> working by migrating that memory? CPR will be "slightly less fast".
>>
>> But the biggest piece -- guest RAM -- will be migrated via the fd directly.
> 
> I think it should work but only without VFIO.  When with VFIO there must
> have no private pages at all or migrating is racy with concurrent DMAs
> (yes, AFAICT CPR can run migration with DMA running..).

Understood. For these we could fail migration. Thanks for the pointer.
diff mbox series

Patch

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 582c2df..9567b97 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -443,6 +443,20 @@  static void machine_set_mem_merge(Object *obj, bool value, Error **errp)
     ms->mem_merge = value;
 }
 
+static bool machine_get_memfd_alloc(Object *obj, Error **errp)
+{
+    MachineState *ms = MACHINE(obj);
+
+    return ms->memfd_alloc;
+}
+
+static void machine_set_memfd_alloc(Object *obj, bool value, Error **errp)
+{
+    MachineState *ms = MACHINE(obj);
+
+    ms->memfd_alloc = value;
+}
+
 static bool machine_get_usb(Object *obj, Error **errp)
 {
     MachineState *ms = MACHINE(obj);
@@ -1044,6 +1058,11 @@  static void machine_class_init(ObjectClass *oc, void *data)
     object_class_property_set_description(oc, "mem-merge",
         "Enable/disable memory merge support");
 
+    object_class_property_add_bool(oc, "memfd-alloc",
+        machine_get_memfd_alloc, machine_set_memfd_alloc);
+    object_class_property_set_description(oc, "memfd-alloc",
+        "Enable/disable allocating anonymous memory using memfd_create");
+
     object_class_property_add_bool(oc, "usb",
         machine_get_usb, machine_set_usb);
     object_class_property_set_description(oc, "usb",
@@ -1387,6 +1406,9 @@  static bool create_default_memdev(MachineState *ms, const char *path, Error **er
     if (!object_property_set_int(obj, "size", ms->ram_size, errp)) {
         goto out;
     }
+    if (!object_property_set_bool(obj, "share", ms->memfd_alloc, errp)) {
+        goto out;
+    }
     object_property_add_child(object_get_objects_root(), mc->default_ram_id,
                               obj);
     /* Ensure backend's memory region name is equal to mc->default_ram_id */
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 69c1ba4..96259c3 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -372,6 +372,7 @@  struct MachineState {
     bool dump_guest_core;
     bool mem_merge;
     bool require_guest_memfd;
+    bool memfd_alloc;
     bool usb;
     bool usb_disabled;
     char *firmware;
diff --git a/qemu-options.hx b/qemu-options.hx
index cf61f6b..f0dfda5 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -32,6 +32,7 @@  DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
     "                vmport=on|off|auto controls emulation of vmport (default: auto)\n"
     "                dump-guest-core=on|off include guest memory in a core dump (default=on)\n"
     "                mem-merge=on|off controls memory merge support (default: on)\n"
+    "                memfd-alloc=on|off controls allocating anonymous guest RAM using memfd_create (default: off)\n"
     "                aes-key-wrap=on|off controls support for AES key wrapping (default=on)\n"
     "                dea-key-wrap=on|off controls support for DEA key wrapping (default=on)\n"
     "                suppress-vmdesc=on|off disables self-describing migration (default=off)\n"
@@ -79,6 +80,11 @@  SRST
         supported by the host, de-duplicates identical memory pages
         among VMs instances (enabled by default).
 
+    ``memfd-alloc=on|off``
+        Enables or disables allocation of anonymous guest RAM using
+        memfd_create.  Any associated memory-backend objects are created with
+        share=on.  The memfd-alloc default is off.
+
     ``aes-key-wrap=on|off``
         Enables or disables AES key wrapping support on s390-ccw hosts.
         This feature controls whether AES wrapping keys will be created
diff --git a/system/memory.c b/system/memory.c
index 49f1cb2..ca04a0e 100644
--- a/system/memory.c
+++ b/system/memory.c
@@ -1552,8 +1552,9 @@  bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
                                       uint64_t size,
                                       Error **errp)
 {
+    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
     return memory_region_init_ram_flags_nomigrate(mr, owner, name,
-                                                  size, 0, errp);
+                                                  size, flags, errp);
 }
 
 bool memory_region_init_ram_flags_nomigrate(MemoryRegion *mr,
@@ -1713,8 +1714,9 @@  bool memory_region_init_rom_nomigrate(MemoryRegion *mr,
                                       uint64_t size,
                                       Error **errp)
 {
+    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
     if (!memory_region_init_ram_flags_nomigrate(mr, owner, name,
-                                                size, 0, errp)) {
+                                                size, flags, errp)) {
          return false;
     }
     mr->readonly = true;
@@ -1731,6 +1733,7 @@  bool memory_region_init_rom_device_nomigrate(MemoryRegion *mr,
                                              Error **errp)
 {
     Error *err = NULL;
+    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
     assert(ops);
     memory_region_init(mr, owner, name, size);
     mr->ops = ops;
@@ -1738,7 +1741,7 @@  bool memory_region_init_rom_device_nomigrate(MemoryRegion *mr,
     mr->terminates = true;
     mr->rom_device = true;
     mr->destructor = memory_region_destructor_ram;
-    mr->ram_block = qemu_ram_alloc(size, 0, mr, &err);
+    mr->ram_block = qemu_ram_alloc(size, flags, mr, &err);
     if (err) {
         mr->size = int128_zero();
         object_unparent(OBJECT(mr));
diff --git a/system/physmem.c b/system/physmem.c
index c736af5..36d97ec 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -45,6 +45,7 @@ 
 #include "qemu/qemu-print.h"
 #include "qemu/log.h"
 #include "qemu/memalign.h"
+#include "qemu/memfd.h"
 #include "exec/memory.h"
 #include "exec/ioport.h"
 #include "sysemu/dma.h"
@@ -1825,6 +1826,19 @@  static void *ram_block_alloc_host(RAMBlock *rb, Error **errp)
     if (xen_enabled()) {
         xen_ram_alloc(rb->offset, rb->max_length, mr, errp);
 
+    } else if (rb->flags & RAM_SHARED) {
+        if (rb->fd == -1) {
+            mr->align = QEMU_VMALLOC_ALIGN;
+            rb->fd = qemu_memfd_create(rb->idstr, rb->max_length + mr->align,
+                                       0, 0, 0, errp);
+        }
+        if (rb->fd >= 0) {
+            int mfd = rb->fd;
+            qemu_set_cloexec(mfd);
+            host = file_ram_alloc(rb, rb->max_length, mfd, false, 0, errp);
+            trace_qemu_anon_memfd_alloc(rb->idstr, rb->max_length, mfd, host);
+        }
+
     } else {
         host = qemu_anon_ram_alloc(rb->max_length, &mr->align,
                                    qemu_ram_is_shared(rb),
@@ -2106,8 +2120,10 @@  RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                                      void *host),
                                      MemoryRegion *mr, Error **errp)
 {
+    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
+    flags |= RAM_RESIZEABLE;
     return qemu_ram_alloc_internal(size, maxsz, resized, NULL,
-                                   RAM_RESIZEABLE, mr, errp);
+                                   flags, mr, errp);
 }
 
 static void reclaim_ramblock(RAMBlock *block)
diff --git a/system/trace-events b/system/trace-events
index f0a80ba..0092734 100644
--- a/system/trace-events
+++ b/system/trace-events
@@ -41,3 +41,4 @@  dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"P
 
 # physmem.c
 ram_block_create(const char *name, uint32_t flags, int fd, size_t used_length, size_t max_length, size_t align) "%s, flags %u, fd %d, len %lu, maxlen %lu, align %lu"
+qemu_anon_memfd_alloc(const char *name, size_t size, int fd, void *ptr) "%s size %zu fd %d -> %p"