diff mbox series

[RESEND,v6,01/36] memory: alloc RAM from file at offset

Message ID cb792b8d6f93d00c10790de8698c468b6ff4ab69.1587614626.git.elena.ufimtseva@oracle.com (mailing list archive)
State New, archived
Headers show
Series [RESEND,v6,01/36] memory: alloc RAM from file at offset | expand

Commit Message

Elena Ufimtseva April 23, 2020, 4:13 a.m. UTC
From: Jagannathan Raman <jag.raman@oracle.com>

Allow RAM MemoryRegion to be created from an offset in a file, instead
of allocating at offset of 0 by default. This is needed to synchronize
RAM between QEMU & remote process.

Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 exec.c                    | 11 +++++++----
 include/exec/ram_addr.h   |  2 +-
 include/qemu/mmap-alloc.h |  3 ++-
 memory.c                  |  2 +-
 util/mmap-alloc.c         |  7 ++++---
 util/oslib-posix.c        |  2 +-
 6 files changed, 16 insertions(+), 11 deletions(-)

Comments

Stefan Hajnoczi May 12, 2020, 8:26 a.m. UTC | #1
On Wed, Apr 22, 2020 at 09:13:36PM -0700, elena.ufimtseva@oracle.com wrote:
> From: Jagannathan Raman <jag.raman@oracle.com>
> 
> Allow RAM MemoryRegion to be created from an offset in a file, instead
> of allocating at offset of 0 by default. This is needed to synchronize
> RAM between QEMU & remote process.
> 
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  exec.c                    | 11 +++++++----
>  include/exec/ram_addr.h   |  2 +-
>  include/qemu/mmap-alloc.h |  3 ++-
>  memory.c                  |  2 +-
>  util/mmap-alloc.c         |  7 ++++---
>  util/oslib-posix.c        |  2 +-
>  6 files changed, 16 insertions(+), 11 deletions(-)

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Daniel P. Berrangé May 12, 2020, 8:48 a.m. UTC | #2
On Wed, Apr 22, 2020 at 09:13:36PM -0700, elena.ufimtseva@oracle.com wrote:
> From: Jagannathan Raman <jag.raman@oracle.com>
> 
> Allow RAM MemoryRegion to be created from an offset in a file, instead
> of allocating at offset of 0 by default. This is needed to synchronize
> RAM between QEMU & remote process.

Can you elaborate on why remote processes require the RAM to be offset
from zero ?

NB, I'm not objecting - I'm just curious to understand more.

> 
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  exec.c                    | 11 +++++++----
>  include/exec/ram_addr.h   |  2 +-
>  include/qemu/mmap-alloc.h |  3 ++-
>  memory.c                  |  2 +-
>  util/mmap-alloc.c         |  7 ++++---
>  util/oslib-posix.c        |  2 +-
>  6 files changed, 16 insertions(+), 11 deletions(-)
> 
> diff --git a/exec.c b/exec.c
> index 2874bb5088..d0ac9545f4 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -1801,6 +1801,7 @@ static void *file_ram_alloc(RAMBlock *block,
>                              ram_addr_t memory,
>                              int fd,
>                              bool truncate,
> +                            off_t offset,
>                              Error **errp)
>  {
>      void *area;
> @@ -1851,7 +1852,8 @@ static void *file_ram_alloc(RAMBlock *block,
>      }
>  
>      area = qemu_ram_mmap(fd, memory, block->mr->align,
> -                         block->flags & RAM_SHARED, block->flags & RAM_PMEM);
> +                         block->flags & RAM_SHARED, block->flags & RAM_PMEM,
> +                         offset);
>      if (area == MAP_FAILED) {
>          error_setg_errno(errp, errno,
>                           "unable to map backing store for guest RAM");
> @@ -2283,7 +2285,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
>  #ifdef CONFIG_POSIX
>  RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
>                                   uint32_t ram_flags, int fd,
> -                                 Error **errp)
> +                                 off_t offset, Error **errp)
>  {
>      RAMBlock *new_block;
>      Error *local_err = NULL;
> @@ -2328,7 +2330,8 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
>      new_block->used_length = size;
>      new_block->max_length = size;
>      new_block->flags = ram_flags;
> -    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
> +    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
> +                                     errp);
>      if (!new_block->host) {
>          g_free(new_block);
>          return NULL;
> @@ -2358,7 +2361,7 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
>          return NULL;
>      }
>  
> -    block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
> +    block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, errp);
>      if (!block) {
>          if (created) {
>              unlink(mem_path);
> diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
> index 5e59a3d8d7..1b9f489ff0 100644
> --- a/include/exec/ram_addr.h
> +++ b/include/exec/ram_addr.h
> @@ -121,7 +121,7 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
>                                     Error **errp);
>  RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
>                                   uint32_t ram_flags, int fd,
> -                                 Error **errp);
> +                                 off_t offset, Error **errp);
>  
>  RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
>                                    MemoryRegion *mr, Error **errp);
> diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
> index e786266b92..4f579858bc 100644
> --- a/include/qemu/mmap-alloc.h
> +++ b/include/qemu/mmap-alloc.h
> @@ -25,7 +25,8 @@ void *qemu_ram_mmap(int fd,
>                      size_t size,
>                      size_t align,
>                      bool shared,
> -                    bool is_pmem);
> +                    bool is_pmem,
> +                    off_t start);
>  
>  void qemu_ram_munmap(int fd, void *ptr, size_t size);
>  
> diff --git a/memory.c b/memory.c
> index 601b749906..f5fec476b7 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -1596,7 +1596,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
>      mr->destructor = memory_region_destructor_ram;
>      mr->ram_block = qemu_ram_alloc_from_fd(size, mr,
>                                             share ? RAM_SHARED : 0,
> -                                           fd, &err);
> +                                           fd, 0, &err);
>      mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0;
>      if (err) {
>          mr->size = int128_zero();
> diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
> index 27dcccd8ec..a28f7025f0 100644
> --- a/util/mmap-alloc.c
> +++ b/util/mmap-alloc.c
> @@ -86,7 +86,8 @@ void *qemu_ram_mmap(int fd,
>                      size_t size,
>                      size_t align,
>                      bool shared,
> -                    bool is_pmem)
> +                    bool is_pmem,
> +                    off_t start)
>  {
>      int flags;
>      int map_sync_flags = 0;
> @@ -147,7 +148,7 @@ void *qemu_ram_mmap(int fd,
>      offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
>  
>      ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE,
> -               flags | map_sync_flags, fd, 0);
> +               flags | map_sync_flags, fd, start);
>  
>      if (ptr == MAP_FAILED && map_sync_flags) {
>          if (errno == ENOTSUP) {
> @@ -172,7 +173,7 @@ void *qemu_ram_mmap(int fd,
>           * we will remove these flags to handle compatibility.
>           */
>          ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE,
> -                   flags, fd, 0);
> +                   flags, fd, start);
>      }
>  
>      if (ptr == MAP_FAILED) {
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index 062236a1ab..4c6b9e90c6 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -209,7 +209,7 @@ void *qemu_memalign(size_t alignment, size_t size)
>  void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
>  {
>      size_t align = QEMU_VMALLOC_ALIGN;
> -    void *ptr = qemu_ram_mmap(-1, size, align, shared, false);
> +    void *ptr = qemu_ram_mmap(-1, size, align, shared, false, 0);
>  
>      if (ptr == MAP_FAILED) {
>          return NULL;
> -- 
> 2.25.GIT
> 

Regards,
Daniel
Jag Raman May 12, 2020, 11:56 a.m. UTC | #3
> On May 12, 2020, at 4:48 AM, Daniel P. Berrangé <berrange@redhat.com> wrote:
> 
> On Wed, Apr 22, 2020 at 09:13:36PM -0700, elena.ufimtseva@oracle.com wrote:
>> From: Jagannathan Raman <jag.raman@oracle.com>
>> 
>> Allow RAM MemoryRegion to be created from an offset in a file, instead
>> of allocating at offset of 0 by default. This is needed to synchronize
>> RAM between QEMU & remote process.
> 
> Can you elaborate on why remote processes require the RAM to be offset
> from zero ?

Hi Daniel,

As it turns out, the RAM is scattered across the physical address space
(system_memory) of QEMU. Therefore, the system memory is composed
of multiple sections of RAM, and some sections start at a non-zero RAM
offset.

As a result, the remote process needs the ability to map these RAM
sections into system_memory.

Thank you!
--
Jag

> 
> NB, I'm not objecting - I'm just curious to understand more.
> 
>> 
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>> ---
>> exec.c                    | 11 +++++++----
>> include/exec/ram_addr.h   |  2 +-
>> include/qemu/mmap-alloc.h |  3 ++-
>> memory.c                  |  2 +-
>> util/mmap-alloc.c         |  7 ++++---
>> util/oslib-posix.c        |  2 +-
>> 6 files changed, 16 insertions(+), 11 deletions(-)
>> 
>> diff --git a/exec.c b/exec.c
>> index 2874bb5088..d0ac9545f4 100644
>> --- a/exec.c
>> +++ b/exec.c
>> @@ -1801,6 +1801,7 @@ static void *file_ram_alloc(RAMBlock *block,
>>                             ram_addr_t memory,
>>                             int fd,
>>                             bool truncate,
>> +                            off_t offset,
>>                             Error **errp)
>> {
>>     void *area;
>> @@ -1851,7 +1852,8 @@ static void *file_ram_alloc(RAMBlock *block,
>>     }
>> 
>>     area = qemu_ram_mmap(fd, memory, block->mr->align,
>> -                         block->flags & RAM_SHARED, block->flags & RAM_PMEM);
>> +                         block->flags & RAM_SHARED, block->flags & RAM_PMEM,
>> +                         offset);
>>     if (area == MAP_FAILED) {
>>         error_setg_errno(errp, errno,
>>                          "unable to map backing store for guest RAM");
>> @@ -2283,7 +2285,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
>> #ifdef CONFIG_POSIX
>> RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
>>                                  uint32_t ram_flags, int fd,
>> -                                 Error **errp)
>> +                                 off_t offset, Error **errp)
>> {
>>     RAMBlock *new_block;
>>     Error *local_err = NULL;
>> @@ -2328,7 +2330,8 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
>>     new_block->used_length = size;
>>     new_block->max_length = size;
>>     new_block->flags = ram_flags;
>> -    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
>> +    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
>> +                                     errp);
>>     if (!new_block->host) {
>>         g_free(new_block);
>>         return NULL;
>> @@ -2358,7 +2361,7 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
>>         return NULL;
>>     }
>> 
>> -    block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
>> +    block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, errp);
>>     if (!block) {
>>         if (created) {
>>             unlink(mem_path);
>> diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
>> index 5e59a3d8d7..1b9f489ff0 100644
>> --- a/include/exec/ram_addr.h
>> +++ b/include/exec/ram_addr.h
>> @@ -121,7 +121,7 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
>>                                    Error **errp);
>> RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
>>                                  uint32_t ram_flags, int fd,
>> -                                 Error **errp);
>> +                                 off_t offset, Error **errp);
>> 
>> RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
>>                                   MemoryRegion *mr, Error **errp);
>> diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
>> index e786266b92..4f579858bc 100644
>> --- a/include/qemu/mmap-alloc.h
>> +++ b/include/qemu/mmap-alloc.h
>> @@ -25,7 +25,8 @@ void *qemu_ram_mmap(int fd,
>>                     size_t size,
>>                     size_t align,
>>                     bool shared,
>> -                    bool is_pmem);
>> +                    bool is_pmem,
>> +                    off_t start);
>> 
>> void qemu_ram_munmap(int fd, void *ptr, size_t size);
>> 
>> diff --git a/memory.c b/memory.c
>> index 601b749906..f5fec476b7 100644
>> --- a/memory.c
>> +++ b/memory.c
>> @@ -1596,7 +1596,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
>>     mr->destructor = memory_region_destructor_ram;
>>     mr->ram_block = qemu_ram_alloc_from_fd(size, mr,
>>                                            share ? RAM_SHARED : 0,
>> -                                           fd, &err);
>> +                                           fd, 0, &err);
>>     mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0;
>>     if (err) {
>>         mr->size = int128_zero();
>> diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
>> index 27dcccd8ec..a28f7025f0 100644
>> --- a/util/mmap-alloc.c
>> +++ b/util/mmap-alloc.c
>> @@ -86,7 +86,8 @@ void *qemu_ram_mmap(int fd,
>>                     size_t size,
>>                     size_t align,
>>                     bool shared,
>> -                    bool is_pmem)
>> +                    bool is_pmem,
>> +                    off_t start)
>> {
>>     int flags;
>>     int map_sync_flags = 0;
>> @@ -147,7 +148,7 @@ void *qemu_ram_mmap(int fd,
>>     offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
>> 
>>     ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE,
>> -               flags | map_sync_flags, fd, 0);
>> +               flags | map_sync_flags, fd, start);
>> 
>>     if (ptr == MAP_FAILED && map_sync_flags) {
>>         if (errno == ENOTSUP) {
>> @@ -172,7 +173,7 @@ void *qemu_ram_mmap(int fd,
>>          * we will remove these flags to handle compatibility.
>>          */
>>         ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE,
>> -                   flags, fd, 0);
>> +                   flags, fd, start);
>>     }
>> 
>>     if (ptr == MAP_FAILED) {
>> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
>> index 062236a1ab..4c6b9e90c6 100644
>> --- a/util/oslib-posix.c
>> +++ b/util/oslib-posix.c
>> @@ -209,7 +209,7 @@ void *qemu_memalign(size_t alignment, size_t size)
>> void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
>> {
>>     size_t align = QEMU_VMALLOC_ALIGN;
>> -    void *ptr = qemu_ram_mmap(-1, size, align, shared, false);
>> +    void *ptr = qemu_ram_mmap(-1, size, align, shared, false, 0);
>> 
>>     if (ptr == MAP_FAILED) {
>>         return NULL;
>> -- 
>> 2.25.GIT
>> 
> 
> Regards,
> Daniel
> -- 
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
> 
>
Stefan Hajnoczi May 13, 2020, 8:40 a.m. UTC | #4
On Tue, May 12, 2020 at 07:56:33AM -0400, Jag Raman wrote:
> 
> 
> > On May 12, 2020, at 4:48 AM, Daniel P. Berrangé <berrange@redhat.com> wrote:
> > 
> > On Wed, Apr 22, 2020 at 09:13:36PM -0700, elena.ufimtseva@oracle.com wrote:
> >> From: Jagannathan Raman <jag.raman@oracle.com>
> >> 
> >> Allow RAM MemoryRegion to be created from an offset in a file, instead
> >> of allocating at offset of 0 by default. This is needed to synchronize
> >> RAM between QEMU & remote process.
> > 
> > Can you elaborate on why remote processes require the RAM to be offset
> > from zero ?
> 
> Hi Daniel,
> 
> As it turns out, the RAM is scattered across the physical address space
> (system_memory) of QEMU. Therefore, the system memory is composed
> of multiple sections of RAM, and some sections start at a non-zero RAM
> offset.
> 
> As a result, the remote process needs the ability to map these RAM
> sections into system_memory.

To explain a bit more, my understanding is that the offset is
specifically for mmap(2). As Jag alluded to, multiple sections can use a
single backing RAM file. These sections have different offsets in the
file.

Jag, maybe you can include a concrete explanation like the following in
the commit description:

Launch QEMU like this:

  qemu-system-x86_64 -mem-path /dev/shm -m 8G

There is only one RAM file descriptor:

  $ cat /proc/$(pidof qemu)/fd
  ...
  lrwx------. 1 stefanha stefanha 64 May 13 09:34 19 -> '/dev/shm/qemu_back_mem.pc.ram.7YAlqn (deleted)'

But the memory tree shows that single file is split into multiple ranges
of guest physical address space:

  (qemu) info mtree
  memory-region: system
  0000000000000000-ffffffffffffffff (prio 0, i/o): system
    0000000000000000-00000000bfffffff (prio 0, i/o): alias ram-below-4g @pc.ram 0000000000000000-00000000bfffffff
    ...
    0000000100000000-000000023fffffff (prio 0, i/o): alias ram-above-4g @pc.ram 00000000c0000000-00000001ffffffff

This means QEMU needs to send two regions to the remote device process.
They both mmap the same file but from different starting file offsets.

Stefan
Igor Mammedov May 13, 2020, 3:25 p.m. UTC | #5
On Wed, 13 May 2020 09:40:42 +0100
Stefan Hajnoczi <stefanha@redhat.com> wrote:

> On Tue, May 12, 2020 at 07:56:33AM -0400, Jag Raman wrote:
> > 
> >   
> > > On May 12, 2020, at 4:48 AM, Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > 
> > > On Wed, Apr 22, 2020 at 09:13:36PM -0700, elena.ufimtseva@oracle.com wrote:  
> > >> From: Jagannathan Raman <jag.raman@oracle.com>
> > >> 
> > >> Allow RAM MemoryRegion to be created from an offset in a file, instead
> > >> of allocating at offset of 0 by default. This is needed to synchronize
> > >> RAM between QEMU & remote process.  
> > > 
> > > Can you elaborate on why remote processes require the RAM to be offset
> > > from zero ?  
> > 
> > Hi Daniel,
> > 
> > As it turns out, the RAM is scattered across the physical address space
> > (system_memory) of QEMU. Therefore, the system memory is composed
> > of multiple sections of RAM, and some sections start at a non-zero RAM
> > offset.
> > 
> > As a result, the remote process needs the ability to map these RAM
> > sections into system_memory.  
> 
> To explain a bit more, my understanding is that the offset is
> specifically for mmap(2). As Jag alluded to, multiple sections can use a
> single backing RAM file. These sections have different offsets in the
> file.
> 
> Jag, maybe you can include a concrete explanation like the following in
> the commit description:
> 
> Launch QEMU like this:
> 
>   qemu-system-x86_64 -mem-path /dev/shm -m 8G
> 
> There is only one RAM file descriptor:
> 
>   $ cat /proc/$(pidof qemu)/fd
>   ...
>   lrwx------. 1 stefanha stefanha 64 May 13 09:34 19 -> '/dev/shm/qemu_back_mem.pc.ram.7YAlqn (deleted)'
> 
> But the memory tree shows that single file is split into multiple ranges
> of guest physical address space:
> 
>   (qemu) info mtree
>   memory-region: system
>   0000000000000000-ffffffffffffffff (prio 0, i/o): system
>     0000000000000000-00000000bfffffff (prio 0, i/o): alias ram-below-4g @pc.ram 0000000000000000-00000000bfffffff
>     ...
>     0000000100000000-000000023fffffff (prio 0, i/o): alias ram-above-4g @pc.ram 00000000c0000000-00000001ffffffff
> 
> This means QEMU needs to send two regions to the remote device process.
> They both mmap the same file but from different starting file offsets.

are we talking here about GPA offests her or about host offsets in mmaped host file?
If it's the later then above mtree doesn't show true picture (those entries are just aliases),
main guest RAM is allocated as a single continuous chunk (so far) which belongs
to a memory-backend.

> Stefan
Jag Raman May 13, 2020, 8:08 p.m. UTC | #6
> On May 13, 2020, at 11:25 AM, Igor Mammedov <imammedo@redhat.com> wrote:
> 
> On Wed, 13 May 2020 09:40:42 +0100
> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
>> On Tue, May 12, 2020 at 07:56:33AM -0400, Jag Raman wrote:
>>> 
>>> 
>>>> On May 12, 2020, at 4:48 AM, Daniel P. Berrangé <berrange@redhat.com> wrote:
>>>> 
>>>> On Wed, Apr 22, 2020 at 09:13:36PM -0700, elena.ufimtseva@oracle.com wrote:  
>>>>> From: Jagannathan Raman <jag.raman@oracle.com>
>>>>> 
>>>>> Allow RAM MemoryRegion to be created from an offset in a file, instead
>>>>> of allocating at offset of 0 by default. This is needed to synchronize
>>>>> RAM between QEMU & remote process.  
>>>> 
>>>> Can you elaborate on why remote processes require the RAM to be offset
>>>> from zero ?  
>>> 
>>> Hi Daniel,
>>> 
>>> As it turns out, the RAM is scattered across the physical address space
>>> (system_memory) of QEMU. Therefore, the system memory is composed
>>> of multiple sections of RAM, and some sections start at a non-zero RAM
>>> offset.
>>> 
>>> As a result, the remote process needs the ability to map these RAM
>>> sections into system_memory.  
>> 
>> To explain a bit more, my understanding is that the offset is
>> specifically for mmap(2). As Jag alluded to, multiple sections can use a
>> single backing RAM file. These sections have different offsets in the
>> file.
>> 
>> Jag, maybe you can include a concrete explanation like the following in
>> the commit description:
>> 
>> Launch QEMU like this:
>> 
>>  qemu-system-x86_64 -mem-path /dev/shm -m 8G
>> 
>> There is only one RAM file descriptor:
>> 
>>  $ cat /proc/$(pidof qemu)/fd
>>  ...
>>  lrwx------. 1 stefanha stefanha 64 May 13 09:34 19 -> '/dev/shm/qemu_back_mem.pc.ram.7YAlqn (deleted)'
>> 
>> But the memory tree shows that single file is split into multiple ranges
>> of guest physical address space:
>> 
>>  (qemu) info mtree
>>  memory-region: system
>>  0000000000000000-ffffffffffffffff (prio 0, i/o): system
>>    0000000000000000-00000000bfffffff (prio 0, i/o): alias ram-below-4g @pc.ram 0000000000000000-00000000bfffffff
>>    ...
>>    0000000100000000-000000023fffffff (prio 0, i/o): alias ram-above-4g @pc.ram 00000000c0000000-00000001ffffffff
>> 
>> This means QEMU needs to send two regions to the remote device process.
>> They both mmap the same file but from different starting file offsets.
> 
> are we talking here about GPA offests her or about host offsets in mmaped host file?
> If it's the later then above mtree doesn't show true picture (those entries are just aliases),
> main guest RAM is allocated as a single continuous chunk (so far) which belongs
> to a memory-backend.

Thanks for the info about ‘mtree’ QMP option. We’ll use that to better explain the need for offset
during memory allocation.

As we can observe from the ‘mtree’ output, different sections of system memory vector into the
RAM. Since we have only one file descriptor available for all of RAM, we need to mmap() these
sections at different offsets within the memory file.

Hey Igor, the offset passed into the mmap() syscall is the offset within host file. Thanks for
pointing out that these are aliases. I believe the mmap() operation is equivalent to the “alias”
operation within MemoryRegion framework. We are sending the ‘fd’, ‘offset’ within the fd and
the size of these RAM regions to the remote device over the unix socket. Hopefully, this looks
good to you.

Thanks!
--
Jag

> 
>> Stefan
> 
>
Igor Mammedov May 14, 2020, 9:47 a.m. UTC | #7
On Wed, 13 May 2020 16:08:06 -0400
Jag Raman <jag.raman@oracle.com> wrote:

> > On May 13, 2020, at 11:25 AM, Igor Mammedov <imammedo@redhat.com> wrote:
> > 
> > On Wed, 13 May 2020 09:40:42 +0100
> > Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >   
> >> On Tue, May 12, 2020 at 07:56:33AM -0400, Jag Raman wrote:  
> >>> 
> >>>   
> >>>> On May 12, 2020, at 4:48 AM, Daniel P. Berrangé <berrange@redhat.com> wrote:
> >>>> 
> >>>> On Wed, Apr 22, 2020 at 09:13:36PM -0700, elena.ufimtseva@oracle.com wrote:    
> >>>>> From: Jagannathan Raman <jag.raman@oracle.com>
> >>>>> 
> >>>>> Allow RAM MemoryRegion to be created from an offset in a file, instead
> >>>>> of allocating at offset of 0 by default. This is needed to synchronize
> >>>>> RAM between QEMU & remote process.    
> >>>> 
> >>>> Can you elaborate on why remote processes require the RAM to be offset
> >>>> from zero ?    
> >>> 
> >>> Hi Daniel,
> >>> 
> >>> As it turns out, the RAM is scattered across the physical address space
> >>> (system_memory) of QEMU. Therefore, the system memory is composed
> >>> of multiple sections of RAM, and some sections start at a non-zero RAM
> >>> offset.
> >>> 
> >>> As a result, the remote process needs the ability to map these RAM
> >>> sections into system_memory.    
> >> 
> >> To explain a bit more, my understanding is that the offset is
> >> specifically for mmap(2). As Jag alluded to, multiple sections can use a
> >> single backing RAM file. These sections have different offsets in the
> >> file.
> >> 
> >> Jag, maybe you can include a concrete explanation like the following in
> >> the commit description:
> >> 
> >> Launch QEMU like this:
> >> 
> >>  qemu-system-x86_64 -mem-path /dev/shm -m 8G
> >> 
> >> There is only one RAM file descriptor:
> >> 
> >>  $ cat /proc/$(pidof qemu)/fd
> >>  ...
> >>  lrwx------. 1 stefanha stefanha 64 May 13 09:34 19 -> '/dev/shm/qemu_back_mem.pc.ram.7YAlqn (deleted)'
> >> 
> >> But the memory tree shows that single file is split into multiple ranges
> >> of guest physical address space:
> >> 
> >>  (qemu) info mtree
> >>  memory-region: system
> >>  0000000000000000-ffffffffffffffff (prio 0, i/o): system
> >>    0000000000000000-00000000bfffffff (prio 0, i/o): alias ram-below-4g @pc.ram 0000000000000000-00000000bfffffff
> >>    ...
> >>    0000000100000000-000000023fffffff (prio 0, i/o): alias ram-above-4g @pc.ram 00000000c0000000-00000001ffffffff
> >> 
> >> This means QEMU needs to send two regions to the remote device process.
> >> They both mmap the same file but from different starting file offsets.  
> > 
> > are we talking here about GPA offests her or about host offsets in mmaped host file?
> > If it's the later then above mtree doesn't show true picture (those entries are just aliases),
> > main guest RAM is allocated as a single continuous chunk (so far) which belongs
> > to a memory-backend.  
> 
> Thanks for the info about ‘mtree’ QMP option. We’ll use that to better explain the need for offset
> during memory allocation.
> 
> As we can observe from the ‘mtree’ output, different sections of system memory vector into the
> RAM. Since we have only one file descriptor available for all of RAM, we need to mmap() these
> sections at different offsets within the memory file.
> 
> Hey Igor, the offset passed into the mmap() syscall is the offset within host file. Thanks for
> pointing out that these are aliases. I believe the mmap() operation is equivalent to the “alias”
> operation within MemoryRegion framework. We are sending the ‘fd’, ‘offset’ within the fd and
> the size of these RAM regions to the remote device over the unix socket. Hopefully, this looks
> good to you.

could you point to a specific patch where on remote device that is being received and used?

> 
> Thanks!
> --
> Jag
> 
> >   
> >> Stefan  
> > 
> >   
>
Dr. David Alan Gilbert May 14, 2020, 9:51 a.m. UTC | #8
* Jag Raman (jag.raman@oracle.com) wrote:
> 
> 
> > On May 13, 2020, at 11:25 AM, Igor Mammedov <imammedo@redhat.com> wrote:
> > 
> > On Wed, 13 May 2020 09:40:42 +0100
> > Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> >> On Tue, May 12, 2020 at 07:56:33AM -0400, Jag Raman wrote:
> >>> 
> >>> 
> >>>> On May 12, 2020, at 4:48 AM, Daniel P. Berrangé <berrange@redhat.com> wrote:
> >>>> 
> >>>> On Wed, Apr 22, 2020 at 09:13:36PM -0700, elena.ufimtseva@oracle.com wrote:  
> >>>>> From: Jagannathan Raman <jag.raman@oracle.com>
> >>>>> 
> >>>>> Allow RAM MemoryRegion to be created from an offset in a file, instead
> >>>>> of allocating at offset of 0 by default. This is needed to synchronize
> >>>>> RAM between QEMU & remote process.  
> >>>> 
> >>>> Can you elaborate on why remote processes require the RAM to be offset
> >>>> from zero ?  
> >>> 
> >>> Hi Daniel,
> >>> 
> >>> As it turns out, the RAM is scattered across the physical address space
> >>> (system_memory) of QEMU. Therefore, the system memory is composed
> >>> of multiple sections of RAM, and some sections start at a non-zero RAM
> >>> offset.
> >>> 
> >>> As a result, the remote process needs the ability to map these RAM
> >>> sections into system_memory.  
> >> 
> >> To explain a bit more, my understanding is that the offset is
> >> specifically for mmap(2). As Jag alluded to, multiple sections can use a
> >> single backing RAM file. These sections have different offsets in the
> >> file.
> >> 
> >> Jag, maybe you can include a concrete explanation like the following in
> >> the commit description:
> >> 
> >> Launch QEMU like this:
> >> 
> >>  qemu-system-x86_64 -mem-path /dev/shm -m 8G
> >> 
> >> There is only one RAM file descriptor:
> >> 
> >>  $ cat /proc/$(pidof qemu)/fd
> >>  ...
> >>  lrwx------. 1 stefanha stefanha 64 May 13 09:34 19 -> '/dev/shm/qemu_back_mem.pc.ram.7YAlqn (deleted)'
> >> 
> >> But the memory tree shows that single file is split into multiple ranges
> >> of guest physical address space:
> >> 
> >>  (qemu) info mtree
> >>  memory-region: system
> >>  0000000000000000-ffffffffffffffff (prio 0, i/o): system
> >>    0000000000000000-00000000bfffffff (prio 0, i/o): alias ram-below-4g @pc.ram 0000000000000000-00000000bfffffff
> >>    ...
> >>    0000000100000000-000000023fffffff (prio 0, i/o): alias ram-above-4g @pc.ram 00000000c0000000-00000001ffffffff
> >> 
> >> This means QEMU needs to send two regions to the remote device process.
> >> They both mmap the same file but from different starting file offsets.
> > 
> > are we talking here about GPA offests her or about host offsets in mmaped host file?
> > If it's the later then above mtree doesn't show true picture (those entries are just aliases),
> > main guest RAM is allocated as a single continuous chunk (so far) which belongs
> > to a memory-backend.
> 
> Thanks for the info about ‘mtree’ QMP option. We’ll use that to better explain the need for offset
> during memory allocation.
> 
> As we can observe from the ‘mtree’ output, different sections of system memory vector into the
> RAM. Since we have only one file descriptor available for all of RAM, we need to mmap() these
> sections at different offsets within the memory file.
> 
> Hey Igor, the offset passed into the mmap() syscall is the offset within host file. Thanks for
> pointing out that these are aliases. I believe the mmap() operation is equivalent to the “alias”
> operation within MemoryRegion framework. We are sending the ‘fd’, ‘offset’ within the fd and
> the size of these RAM regions to the remote device over the unix socket. Hopefully, this looks
> good to you.

Vhost-user already does something similar; see it's
vhost_user_fill_set_mem_table_msg:

            msg->payload.memory.regions[*fd_num].userspace_addr =
                reg->userspace_addr;
            msg->payload.memory.regions[*fd_num].memory_size =
                reg->memory_size;
            msg->payload.memory.regions[*fd_num].guest_phys_addr =
                reg->guest_phys_addr;
            msg->payload.memory.regions[*fd_num].mmap_offset = offset;

Note you're not needing to map the fd with an offset on the qemu side,
it's something only needed on the remote.

Dave

> Thanks!
> --
> Jag
> 
> > 
> >> Stefan
> > 
> > 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox series

Patch

diff --git a/exec.c b/exec.c
index 2874bb5088..d0ac9545f4 100644
--- a/exec.c
+++ b/exec.c
@@ -1801,6 +1801,7 @@  static void *file_ram_alloc(RAMBlock *block,
                             ram_addr_t memory,
                             int fd,
                             bool truncate,
+                            off_t offset,
                             Error **errp)
 {
     void *area;
@@ -1851,7 +1852,8 @@  static void *file_ram_alloc(RAMBlock *block,
     }
 
     area = qemu_ram_mmap(fd, memory, block->mr->align,
-                         block->flags & RAM_SHARED, block->flags & RAM_PMEM);
+                         block->flags & RAM_SHARED, block->flags & RAM_PMEM,
+                         offset);
     if (area == MAP_FAILED) {
         error_setg_errno(errp, errno,
                          "unable to map backing store for guest RAM");
@@ -2283,7 +2285,7 @@  static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
 #ifdef CONFIG_POSIX
 RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
                                  uint32_t ram_flags, int fd,
-                                 Error **errp)
+                                 off_t offset, Error **errp)
 {
     RAMBlock *new_block;
     Error *local_err = NULL;
@@ -2328,7 +2330,8 @@  RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
     new_block->used_length = size;
     new_block->max_length = size;
     new_block->flags = ram_flags;
-    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
+    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
+                                     errp);
     if (!new_block->host) {
         g_free(new_block);
         return NULL;
@@ -2358,7 +2361,7 @@  RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
         return NULL;
     }
 
-    block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
+    block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, errp);
     if (!block) {
         if (created) {
             unlink(mem_path);
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 5e59a3d8d7..1b9f489ff0 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -121,7 +121,7 @@  RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
                                    Error **errp);
 RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
                                  uint32_t ram_flags, int fd,
-                                 Error **errp);
+                                 off_t offset, Error **errp);
 
 RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                   MemoryRegion *mr, Error **errp);
diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
index e786266b92..4f579858bc 100644
--- a/include/qemu/mmap-alloc.h
+++ b/include/qemu/mmap-alloc.h
@@ -25,7 +25,8 @@  void *qemu_ram_mmap(int fd,
                     size_t size,
                     size_t align,
                     bool shared,
-                    bool is_pmem);
+                    bool is_pmem,
+                    off_t start);
 
 void qemu_ram_munmap(int fd, void *ptr, size_t size);
 
diff --git a/memory.c b/memory.c
index 601b749906..f5fec476b7 100644
--- a/memory.c
+++ b/memory.c
@@ -1596,7 +1596,7 @@  void memory_region_init_ram_from_fd(MemoryRegion *mr,
     mr->destructor = memory_region_destructor_ram;
     mr->ram_block = qemu_ram_alloc_from_fd(size, mr,
                                            share ? RAM_SHARED : 0,
-                                           fd, &err);
+                                           fd, 0, &err);
     mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0;
     if (err) {
         mr->size = int128_zero();
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
index 27dcccd8ec..a28f7025f0 100644
--- a/util/mmap-alloc.c
+++ b/util/mmap-alloc.c
@@ -86,7 +86,8 @@  void *qemu_ram_mmap(int fd,
                     size_t size,
                     size_t align,
                     bool shared,
-                    bool is_pmem)
+                    bool is_pmem,
+                    off_t start)
 {
     int flags;
     int map_sync_flags = 0;
@@ -147,7 +148,7 @@  void *qemu_ram_mmap(int fd,
     offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
 
     ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE,
-               flags | map_sync_flags, fd, 0);
+               flags | map_sync_flags, fd, start);
 
     if (ptr == MAP_FAILED && map_sync_flags) {
         if (errno == ENOTSUP) {
@@ -172,7 +173,7 @@  void *qemu_ram_mmap(int fd,
          * we will remove these flags to handle compatibility.
          */
         ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE,
-                   flags, fd, 0);
+                   flags, fd, start);
     }
 
     if (ptr == MAP_FAILED) {
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 062236a1ab..4c6b9e90c6 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -209,7 +209,7 @@  void *qemu_memalign(size_t alignment, size_t size)
 void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
 {
     size_t align = QEMU_VMALLOC_ALIGN;
-    void *ptr = qemu_ram_mmap(-1, size, align, shared, false);
+    void *ptr = qemu_ram_mmap(-1, size, align, shared, false, 0);
 
     if (ptr == MAP_FAILED) {
         return NULL;