diff mbox series

[V2,05/13] physmem: preserve ram blocks for cpr

Message ID 1727725244-105198-6-git-send-email-steven.sistare@oracle.com (mailing list archive)
State New, archived
Headers show
Series Live update: cpr-transfer | expand

Commit Message

Steven Sistare Sept. 30, 2024, 7:40 p.m. UTC
Save the memfd for anonymous ramblocks in CPR state, along with a name
that uniquely identifies it.  The block's idstr is not yet set, so it
cannot be used for this purpose.  Find the saved memfd in new QEMU when
creating a block.  QEMU hard-codes the length of some internally-created
blocks, so to guard against that length changing, use lseek to get the
actual length of an incoming memfd.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 system/physmem.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

Comments

Peter Xu Oct. 7, 2024, 3:49 p.m. UTC | #1
On Mon, Sep 30, 2024 at 12:40:36PM -0700, Steve Sistare wrote:
> Save the memfd for anonymous ramblocks in CPR state, along with a name
> that uniquely identifies it.  The block's idstr is not yet set, so it
> cannot be used for this purpose.  Find the saved memfd in new QEMU when
> creating a block.  QEMU hard-codes the length of some internally-created
> blocks, so to guard against that length changing, use lseek to get the
> actual length of an incoming memfd.
> 
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
>  system/physmem.c | 25 ++++++++++++++++++++++++-
>  1 file changed, 24 insertions(+), 1 deletion(-)
> 
> diff --git a/system/physmem.c b/system/physmem.c
> index 174f7e0..ddbeec9 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -72,6 +72,7 @@
>  
>  #include "qapi/qapi-types-migration.h"
>  #include "migration/options.h"
> +#include "migration/cpr.h"
>  #include "migration/vmstate.h"
>  
>  #include "qemu/range.h"
> @@ -1663,6 +1664,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
>      }
>  }
>  
> +static char *cpr_name(RAMBlock *block)
> +{
> +    MemoryRegion *mr = block->mr;
> +    const char *mr_name = memory_region_name(mr);
> +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
> +
> +    if (id) {
> +        return g_strdup_printf("%s/%s", id, mr_name);
> +    } else {
> +        return g_strdup(mr_name);
> +    }
> +}
> +
>  size_t qemu_ram_pagesize(RAMBlock *rb)
>  {
>      return rb->page_size;
> @@ -1858,14 +1872,18 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
>                                          TYPE_MEMORY_BACKEND)) {
>              size_t max_length = new_block->max_length;
>              MemoryRegion *mr = new_block->mr;
> -            const char *name = memory_region_name(mr);
> +            g_autofree char *name = cpr_name(new_block);
>  
>              new_block->mr->align = QEMU_VMALLOC_ALIGN;
>              new_block->flags |= RAM_SHARED;
> +            new_block->fd = cpr_find_fd(name, 0);
>  
>              if (new_block->fd == -1) {
>                  new_block->fd = qemu_memfd_create(name, max_length + mr->align,
>                                                    0, 0, 0, errp);
> +                cpr_save_fd(name, 0, new_block->fd);
> +            } else {
> +                new_block->max_length = lseek(new_block->fd, 0, SEEK_END);

So this can overwrite the max_length that the caller specified..

I remember we used to have some tricks on specifying different max_length
for ROMs on dest QEMU (on which, qemu firmwares also upgraded on the dest
host so the size can be bigger than src qemu's old ramblocks), so that the
MR is always large enough to reload even the new firmwares, while migration
only migrates the smaller size (used_length) so it's fine as we keep the
extra sizes empty. I think that can relevant to the qemu_ram_resize() call
of parse_ramblock().

The reload will not happen until some point, perhaps system resets.  I
wonder whether that is an issue in this case.

+Igor +Mst for this.

>              }
>  
>              if (new_block->fd >= 0) {
> @@ -1875,6 +1893,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
>                                                   false, 0, errp);
>              }
>              if (!new_block->host) {
> +                cpr_delete_fd(name, 0);
>                  qemu_mutex_unlock_ramlist();
>                  return;
>              }
> @@ -2182,6 +2201,8 @@ static void reclaim_ramblock(RAMBlock *block)
>  
>  void qemu_ram_free(RAMBlock *block)
>  {
> +    g_autofree char *name = NULL;
> +
>      if (!block) {
>          return;
>      }
> @@ -2192,6 +2213,8 @@ void qemu_ram_free(RAMBlock *block)
>      }
>  
>      qemu_mutex_lock_ramlist();
> +    name = cpr_name(block);
> +    cpr_delete_fd(name, 0);
>      QLIST_REMOVE_RCU(block, next);
>      ram_list.mru_block = NULL;
>      /* Write list before version */
> -- 
> 1.8.3.1
>
Peter Xu Oct. 7, 2024, 4:28 p.m. UTC | #2
On Mon, Oct 07, 2024 at 11:49:25AM -0400, Peter Xu wrote:
> On Mon, Sep 30, 2024 at 12:40:36PM -0700, Steve Sistare wrote:
> > Save the memfd for anonymous ramblocks in CPR state, along with a name
> > that uniquely identifies it.  The block's idstr is not yet set, so it
> > cannot be used for this purpose.  Find the saved memfd in new QEMU when
> > creating a block.  QEMU hard-codes the length of some internally-created
> > blocks, so to guard against that length changing, use lseek to get the
> > actual length of an incoming memfd.
> > 
> > Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> > ---
> >  system/physmem.c | 25 ++++++++++++++++++++++++-
> >  1 file changed, 24 insertions(+), 1 deletion(-)
> > 
> > diff --git a/system/physmem.c b/system/physmem.c
> > index 174f7e0..ddbeec9 100644
> > --- a/system/physmem.c
> > +++ b/system/physmem.c
> > @@ -72,6 +72,7 @@
> >  
> >  #include "qapi/qapi-types-migration.h"
> >  #include "migration/options.h"
> > +#include "migration/cpr.h"
> >  #include "migration/vmstate.h"
> >  
> >  #include "qemu/range.h"
> > @@ -1663,6 +1664,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
> >      }
> >  }
> >  
> > +static char *cpr_name(RAMBlock *block)
> > +{
> > +    MemoryRegion *mr = block->mr;
> > +    const char *mr_name = memory_region_name(mr);
> > +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
> > +
> > +    if (id) {
> > +        return g_strdup_printf("%s/%s", id, mr_name);
> > +    } else {
> > +        return g_strdup(mr_name);
> > +    }
> > +}
> > +
> >  size_t qemu_ram_pagesize(RAMBlock *rb)
> >  {
> >      return rb->page_size;
> > @@ -1858,14 +1872,18 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
> >                                          TYPE_MEMORY_BACKEND)) {
> >              size_t max_length = new_block->max_length;
> >              MemoryRegion *mr = new_block->mr;
> > -            const char *name = memory_region_name(mr);
> > +            g_autofree char *name = cpr_name(new_block);
> >  
> >              new_block->mr->align = QEMU_VMALLOC_ALIGN;
> >              new_block->flags |= RAM_SHARED;
> > +            new_block->fd = cpr_find_fd(name, 0);
> >  
> >              if (new_block->fd == -1) {
> >                  new_block->fd = qemu_memfd_create(name, max_length + mr->align,
> >                                                    0, 0, 0, errp);
> > +                cpr_save_fd(name, 0, new_block->fd);
> > +            } else {
> > +                new_block->max_length = lseek(new_block->fd, 0, SEEK_END);
> 
> So this can overwrite the max_length that the caller specified..
> 
> I remember we used to have some tricks on specifying different max_length
> for ROMs on dest QEMU (on which, qemu firmwares also upgraded on the dest
> host so the size can be bigger than src qemu's old ramblocks), so that the
> MR is always large enough to reload even the new firmwares, while migration
> only migrates the smaller size (used_length) so it's fine as we keep the
> extra sizes empty. I think that can relevant to the qemu_ram_resize() call
> of parse_ramblock().
> 
> The reload will not happen until some point, perhaps system resets.  I
> wonder whether that is an issue in this case.
> 
> +Igor +Mst for this.

PS: If this is needed by CPR-transfer only because mmap() later can fail
due to a bigger max_length, I wonder whether it can be fixed by passing
truncate=true in the upcoming file_ram_alloc(), rather than overwritting
the max_length value itself.

> 
> >              }
> >  
> >              if (new_block->fd >= 0) {
> > @@ -1875,6 +1893,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
> >                                                   false, 0, errp);
> >              }
> >              if (!new_block->host) {
> > +                cpr_delete_fd(name, 0);
> >                  qemu_mutex_unlock_ramlist();
> >                  return;
> >              }
> > @@ -2182,6 +2201,8 @@ static void reclaim_ramblock(RAMBlock *block)
> >  
> >  void qemu_ram_free(RAMBlock *block)
> >  {
> > +    g_autofree char *name = NULL;
> > +
> >      if (!block) {
> >          return;
> >      }
> > @@ -2192,6 +2213,8 @@ void qemu_ram_free(RAMBlock *block)
> >      }
> >  
> >      qemu_mutex_lock_ramlist();
> > +    name = cpr_name(block);
> > +    cpr_delete_fd(name, 0);
> >      QLIST_REMOVE_RCU(block, next);
> >      ram_list.mru_block = NULL;
> >      /* Write list before version */
> > -- 
> > 1.8.3.1
> > 
> 
> -- 
> Peter Xu
Steven Sistare Oct. 8, 2024, 3:17 p.m. UTC | #3
On 10/7/2024 12:28 PM, Peter Xu wrote:
> On Mon, Oct 07, 2024 at 11:49:25AM -0400, Peter Xu wrote:
>> On Mon, Sep 30, 2024 at 12:40:36PM -0700, Steve Sistare wrote:
>>> Save the memfd for anonymous ramblocks in CPR state, along with a name
>>> that uniquely identifies it.  The block's idstr is not yet set, so it
>>> cannot be used for this purpose.  Find the saved memfd in new QEMU when
>>> creating a block.  QEMU hard-codes the length of some internally-created
>>> blocks, so to guard against that length changing, use lseek to get the
>>> actual length of an incoming memfd.
>>>
>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>> ---
>>>   system/physmem.c | 25 ++++++++++++++++++++++++-
>>>   1 file changed, 24 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/system/physmem.c b/system/physmem.c
>>> index 174f7e0..ddbeec9 100644
>>> --- a/system/physmem.c
>>> +++ b/system/physmem.c
>>> @@ -72,6 +72,7 @@
>>>   
>>>   #include "qapi/qapi-types-migration.h"
>>>   #include "migration/options.h"
>>> +#include "migration/cpr.h"
>>>   #include "migration/vmstate.h"
>>>   
>>>   #include "qemu/range.h"
>>> @@ -1663,6 +1664,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
>>>       }
>>>   }
>>>   
>>> +static char *cpr_name(RAMBlock *block)
>>> +{
>>> +    MemoryRegion *mr = block->mr;
>>> +    const char *mr_name = memory_region_name(mr);
>>> +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
>>> +
>>> +    if (id) {
>>> +        return g_strdup_printf("%s/%s", id, mr_name);
>>> +    } else {
>>> +        return g_strdup(mr_name);
>>> +    }
>>> +}
>>> +
>>>   size_t qemu_ram_pagesize(RAMBlock *rb)
>>>   {
>>>       return rb->page_size;
>>> @@ -1858,14 +1872,18 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
>>>                                           TYPE_MEMORY_BACKEND)) {
>>>               size_t max_length = new_block->max_length;
>>>               MemoryRegion *mr = new_block->mr;
>>> -            const char *name = memory_region_name(mr);
>>> +            g_autofree char *name = cpr_name(new_block);
>>>   
>>>               new_block->mr->align = QEMU_VMALLOC_ALIGN;
>>>               new_block->flags |= RAM_SHARED;
>>> +            new_block->fd = cpr_find_fd(name, 0);
>>>   
>>>               if (new_block->fd == -1) {
>>>                   new_block->fd = qemu_memfd_create(name, max_length + mr->align,
>>>                                                     0, 0, 0, errp);
>>> +                cpr_save_fd(name, 0, new_block->fd);
>>> +            } else {
>>> +                new_block->max_length = lseek(new_block->fd, 0, SEEK_END);
>>
>> So this can overwrite the max_length that the caller specified..
>>
>> I remember we used to have some tricks on specifying different max_length
>> for ROMs on dest QEMU (on which, qemu firmwares also upgraded on the dest
>> host so the size can be bigger than src qemu's old ramblocks), so that the
>> MR is always large enough to reload even the new firmwares, while migration
>> only migrates the smaller size (used_length) so it's fine as we keep the
>> extra sizes empty. I think that can relevant to the qemu_ram_resize() call
>> of parse_ramblock().

Yes, resizable ram block for firmware blob is the only case I know of where
the length changed in the past.  If a length changes in the future, we will
need to detect and accommodate that change here, and I believe the fix will
be to simply use the actual length, as per the code above.  But if you prefer,
for now I can check for length change and return an error. New qemu will fail
to start, and old qemu will recover.

>> The reload will not happen until some point, perhaps system resets.  I
>> wonder whether that is an issue in this case.

Firmware is only generated once, via this path on x86:
   qmp_x_exit_preconfig
     qemu_machine_creation_done
       qdev_machine_creation_done
         pc_machine_done
           acpi_setup
             acpi_add_rom_blob
               rom_add_blob
                 rom_set_mr

After a system reset, the ramblock contents from memory are used as-is.

> PS: If this is needed by CPR-transfer only because mmap() later can fail
> due to a bigger max_length, 

That is the reason.  IMO adjusting max_length is more robust than fiddling
with truncate and pretending that max_length is larger, when qemu will never
be able to use the phantom space up to max_length.

- Steve

> I wonder whether it can be fixed by passing
> truncate=true in the upcoming file_ram_alloc(), rather than overwritting
> the max_length value itself.
> 
>>
>>>               }
>>>   
>>>               if (new_block->fd >= 0) {
>>> @@ -1875,6 +1893,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
>>>                                                    false, 0, errp);
>>>               }
>>>               if (!new_block->host) {
>>> +                cpr_delete_fd(name, 0);
>>>                   qemu_mutex_unlock_ramlist();
>>>                   return;
>>>               }
>>> @@ -2182,6 +2201,8 @@ static void reclaim_ramblock(RAMBlock *block)
>>>   
>>>   void qemu_ram_free(RAMBlock *block)
>>>   {
>>> +    g_autofree char *name = NULL;
>>> +
>>>       if (!block) {
>>>           return;
>>>       }
>>> @@ -2192,6 +2213,8 @@ void qemu_ram_free(RAMBlock *block)
>>>       }
>>>   
>>>       qemu_mutex_lock_ramlist();
>>> +    name = cpr_name(block);
>>> +    cpr_delete_fd(name, 0);
>>>       QLIST_REMOVE_RCU(block, next);
>>>       ram_list.mru_block = NULL;
>>>       /* Write list before version */
>>> -- 
>>> 1.8.3.1
>>>
>>
>> -- 
>> Peter Xu
>
Peter Xu Oct. 8, 2024, 4:26 p.m. UTC | #4
On Tue, Oct 08, 2024 at 11:17:46AM -0400, Steven Sistare wrote:
> On 10/7/2024 12:28 PM, Peter Xu wrote:
> > On Mon, Oct 07, 2024 at 11:49:25AM -0400, Peter Xu wrote:
> > > On Mon, Sep 30, 2024 at 12:40:36PM -0700, Steve Sistare wrote:
> > > > Save the memfd for anonymous ramblocks in CPR state, along with a name
> > > > that uniquely identifies it.  The block's idstr is not yet set, so it
> > > > cannot be used for this purpose.  Find the saved memfd in new QEMU when
> > > > creating a block.  QEMU hard-codes the length of some internally-created
> > > > blocks, so to guard against that length changing, use lseek to get the
> > > > actual length of an incoming memfd.
> > > > 
> > > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> > > > ---
> > > >   system/physmem.c | 25 ++++++++++++++++++++++++-
> > > >   1 file changed, 24 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/system/physmem.c b/system/physmem.c
> > > > index 174f7e0..ddbeec9 100644
> > > > --- a/system/physmem.c
> > > > +++ b/system/physmem.c
> > > > @@ -72,6 +72,7 @@
> > > >   #include "qapi/qapi-types-migration.h"
> > > >   #include "migration/options.h"
> > > > +#include "migration/cpr.h"
> > > >   #include "migration/vmstate.h"
> > > >   #include "qemu/range.h"
> > > > @@ -1663,6 +1664,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
> > > >       }
> > > >   }
> > > > +static char *cpr_name(RAMBlock *block)
> > > > +{
> > > > +    MemoryRegion *mr = block->mr;
> > > > +    const char *mr_name = memory_region_name(mr);
> > > > +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
> > > > +
> > > > +    if (id) {
> > > > +        return g_strdup_printf("%s/%s", id, mr_name);
> > > > +    } else {
> > > > +        return g_strdup(mr_name);
> > > > +    }
> > > > +}
> > > > +
> > > >   size_t qemu_ram_pagesize(RAMBlock *rb)
> > > >   {
> > > >       return rb->page_size;
> > > > @@ -1858,14 +1872,18 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
> > > >                                           TYPE_MEMORY_BACKEND)) {
> > > >               size_t max_length = new_block->max_length;
> > > >               MemoryRegion *mr = new_block->mr;
> > > > -            const char *name = memory_region_name(mr);
> > > > +            g_autofree char *name = cpr_name(new_block);
> > > >               new_block->mr->align = QEMU_VMALLOC_ALIGN;
> > > >               new_block->flags |= RAM_SHARED;
> > > > +            new_block->fd = cpr_find_fd(name, 0);
> > > >               if (new_block->fd == -1) {
> > > >                   new_block->fd = qemu_memfd_create(name, max_length + mr->align,
> > > >                                                     0, 0, 0, errp);
> > > > +                cpr_save_fd(name, 0, new_block->fd);
> > > > +            } else {
> > > > +                new_block->max_length = lseek(new_block->fd, 0, SEEK_END);
> > > 
> > > So this can overwrite the max_length that the caller specified..
> > > 
> > > I remember we used to have some tricks on specifying different max_length
> > > for ROMs on dest QEMU (on which, qemu firmwares also upgraded on the dest
> > > host so the size can be bigger than src qemu's old ramblocks), so that the
> > > MR is always large enough to reload even the new firmwares, while migration
> > > only migrates the smaller size (used_length) so it's fine as we keep the
> > > extra sizes empty. I think that can relevant to the qemu_ram_resize() call
> > > of parse_ramblock().
> 
> Yes, resizable ram block for firmware blob is the only case I know of where
> the length changed in the past.  If a length changes in the future, we will
> need to detect and accommodate that change here, and I believe the fix will
> be to simply use the actual length, as per the code above.  But if you prefer,
> for now I can check for length change and return an error. New qemu will fail
> to start, and old qemu will recover.
> 
> > > The reload will not happen until some point, perhaps system resets.  I
> > > wonder whether that is an issue in this case.
> 
> Firmware is only generated once, via this path on x86:
>   qmp_x_exit_preconfig
>     qemu_machine_creation_done
>       qdev_machine_creation_done
>         pc_machine_done
>           acpi_setup
>             acpi_add_rom_blob
>               rom_add_blob
>                 rom_set_mr
> 
> After a system reset, the ramblock contents from memory are used as-is.
> 
> > PS: If this is needed by CPR-transfer only because mmap() later can fail
> > due to a bigger max_length,
> 
> That is the reason.  IMO adjusting max_length is more robust than fiddling
> with truncate and pretending that max_length is larger, when qemu will never
> be able to use the phantom space up to max_length.

I thought it was not pretending, but the ROM region might be resized after
a system reset?  I worry that your change here can violate with such
resizing later, so that qemu_ram_resize() can potentially fail after (1)
CPR-transfer upgrades completes, then follow with (2) a system reset.

We can observe such resizing kick off in every reboot, like:

(gdb) bt
#0  qemu_ram_resize
#1  0x00005602b623b740 in memory_region_ram_resize
#2  0x00005602b60f5580 in acpi_ram_update
#3  0x00005602b60f5667 in acpi_build_update
#4  0x00005602b5e1028b in fw_cfg_select
#5  0x00005602b5e105af in fw_cfg_dma_transfer
#6  0x00005602b5e109a8 in fw_cfg_dma_mem_write
#7  0x00005602b62352ec in memory_region_write_accessor
#8  0x00005602b62355e6 in access_with_adjusted_size
#9  0x00005602b6238de8 in memory_region_dispatch_write
#10 0x00005602b62488c5 in flatview_write_continue_step
#11 0x00005602b6248997 in flatview_write_continue
#12 0x00005602b6248abf in flatview_write
#13 0x00005602b6248f39 in address_space_write
#14 0x00005602b6248fb1 in address_space_rw
#15 0x00005602b62a5d86 in kvm_handle_io
#16 0x00005602b62a6cb2 in kvm_cpu_exec
#17 0x00005602b62aa37a in kvm_vcpu_thread_fn
#18 0x00005602b655da57 in qemu_thread_start
#19 0x00007f120224a1b7 in start_thread
#20 0x00007f12022cc39c in clone3

Specifically, see this code clip:

acpi_ram_update():
    memory_region_ram_resize(mr, size, &error_abort);
    memcpy(memory_region_get_ram_ptr(mr), data->data, size);

Per my understanding, what it does is during the reset the ROM ramblock
will resize to the new size (normally, only larger, in my memory there used
to have a ROM grew from 256K->512K, or something like that), then the
memcpy() injects the latest firmware that it pre-loaded into mem.

So after such system reset, QEMU might start to see new ROM code loaded
here (not the one that got migrated anymore, which will only match the
version installed on src QEMU).  Here the problem is the new firmware can
be larger, so I _think_ we need to make sure max_length is not modified by
CPR to allow resizing happen here, while if we use truncate=true here it
should just work in all cases.

I think it could be verified with an old QEMU running with old ROM files
(which is smaller), then CPR migrate to a new QEMU running new ROM files
(which is larger), then reboot to see whether that new QEMU crash.  Maybe
we can emulate that with "romfile=XXX" parameter.

I am not fluent with ROM/firmware code, but please double check..
Steven Sistare Oct. 8, 2024, 9:05 p.m. UTC | #5
On 10/8/2024 12:26 PM, Peter Xu wrote:
> On Tue, Oct 08, 2024 at 11:17:46AM -0400, Steven Sistare wrote:
>> On 10/7/2024 12:28 PM, Peter Xu wrote:
>>> On Mon, Oct 07, 2024 at 11:49:25AM -0400, Peter Xu wrote:
>>>> On Mon, Sep 30, 2024 at 12:40:36PM -0700, Steve Sistare wrote:
>>>>> Save the memfd for anonymous ramblocks in CPR state, along with a name
>>>>> that uniquely identifies it.  The block's idstr is not yet set, so it
>>>>> cannot be used for this purpose.  Find the saved memfd in new QEMU when
>>>>> creating a block.  QEMU hard-codes the length of some internally-created
>>>>> blocks, so to guard against that length changing, use lseek to get the
>>>>> actual length of an incoming memfd.
>>>>>
>>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>>>> ---
>>>>>    system/physmem.c | 25 ++++++++++++++++++++++++-
>>>>>    1 file changed, 24 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/system/physmem.c b/system/physmem.c
>>>>> index 174f7e0..ddbeec9 100644
>>>>> --- a/system/physmem.c
>>>>> +++ b/system/physmem.c
>>>>> @@ -72,6 +72,7 @@
>>>>>    #include "qapi/qapi-types-migration.h"
>>>>>    #include "migration/options.h"
>>>>> +#include "migration/cpr.h"
>>>>>    #include "migration/vmstate.h"
>>>>>    #include "qemu/range.h"
>>>>> @@ -1663,6 +1664,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
>>>>>        }
>>>>>    }
>>>>> +static char *cpr_name(RAMBlock *block)
>>>>> +{
>>>>> +    MemoryRegion *mr = block->mr;
>>>>> +    const char *mr_name = memory_region_name(mr);
>>>>> +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
>>>>> +
>>>>> +    if (id) {
>>>>> +        return g_strdup_printf("%s/%s", id, mr_name);
>>>>> +    } else {
>>>>> +        return g_strdup(mr_name);
>>>>> +    }
>>>>> +}
>>>>> +
>>>>>    size_t qemu_ram_pagesize(RAMBlock *rb)
>>>>>    {
>>>>>        return rb->page_size;
>>>>> @@ -1858,14 +1872,18 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
>>>>>                                            TYPE_MEMORY_BACKEND)) {
>>>>>                size_t max_length = new_block->max_length;
>>>>>                MemoryRegion *mr = new_block->mr;
>>>>> -            const char *name = memory_region_name(mr);
>>>>> +            g_autofree char *name = cpr_name(new_block);
>>>>>                new_block->mr->align = QEMU_VMALLOC_ALIGN;
>>>>>                new_block->flags |= RAM_SHARED;
>>>>> +            new_block->fd = cpr_find_fd(name, 0);
>>>>>                if (new_block->fd == -1) {
>>>>>                    new_block->fd = qemu_memfd_create(name, max_length + mr->align,
>>>>>                                                      0, 0, 0, errp);
>>>>> +                cpr_save_fd(name, 0, new_block->fd);
>>>>> +            } else {
>>>>> +                new_block->max_length = lseek(new_block->fd, 0, SEEK_END);
>>>>
>>>> So this can overwrite the max_length that the caller specified..
>>>>
>>>> I remember we used to have some tricks on specifying different max_length
>>>> for ROMs on dest QEMU (on which, qemu firmwares also upgraded on the dest
>>>> host so the size can be bigger than src qemu's old ramblocks), so that the
>>>> MR is always large enough to reload even the new firmwares, while migration
>>>> only migrates the smaller size (used_length) so it's fine as we keep the
>>>> extra sizes empty. I think that can relevant to the qemu_ram_resize() call
>>>> of parse_ramblock().
>>
>> Yes, resizable ram block for firmware blob is the only case I know of where
>> the length changed in the past.  If a length changes in the future, we will
>> need to detect and accommodate that change here, and I believe the fix will
>> be to simply use the actual length, as per the code above.  But if you prefer,
>> for now I can check for length change and return an error. New qemu will fail
>> to start, and old qemu will recover.
>>
>>>> The reload will not happen until some point, perhaps system resets.  I
>>>> wonder whether that is an issue in this case.
>>
>> Firmware is only generated once, via this path on x86:
>>    qmp_x_exit_preconfig
>>      qemu_machine_creation_done
>>        qdev_machine_creation_done
>>          pc_machine_done
>>            acpi_setup
>>              acpi_add_rom_blob
>>                rom_add_blob
>>                  rom_set_mr
>>
>> After a system reset, the ramblock contents from memory are used as-is.
>>
>>> PS: If this is needed by CPR-transfer only because mmap() later can fail
>>> due to a bigger max_length,
>>
>> That is the reason.  IMO adjusting max_length is more robust than fiddling
>> with truncate and pretending that max_length is larger, when qemu will never
>> be able to use the phantom space up to max_length.
> 
> I thought it was not pretending, but the ROM region might be resized after
> a system reset?  I worry that your change here can violate with such
> resizing later, so that qemu_ram_resize() can potentially fail after (1)
> CPR-transfer upgrades completes, then follow with (2) a system reset.
> 
> We can observe such resizing kick off in every reboot, like:
> 
> (gdb) bt
> #0  qemu_ram_resize
> #1  0x00005602b623b740 in memory_region_ram_resize
> #2  0x00005602b60f5580 in acpi_ram_update
> #3  0x00005602b60f5667 in acpi_build_update
> #4  0x00005602b5e1028b in fw_cfg_select
> #5  0x00005602b5e105af in fw_cfg_dma_transfer
> #6  0x00005602b5e109a8 in fw_cfg_dma_mem_write
> #7  0x00005602b62352ec in memory_region_write_accessor
> #8  0x00005602b62355e6 in access_with_adjusted_size
> #9  0x00005602b6238de8 in memory_region_dispatch_write
> #10 0x00005602b62488c5 in flatview_write_continue_step
> #11 0x00005602b6248997 in flatview_write_continue
> #12 0x00005602b6248abf in flatview_write
> #13 0x00005602b6248f39 in address_space_write
> #14 0x00005602b6248fb1 in address_space_rw
> #15 0x00005602b62a5d86 in kvm_handle_io
> #16 0x00005602b62a6cb2 in kvm_cpu_exec
> #17 0x00005602b62aa37a in kvm_vcpu_thread_fn
> #18 0x00005602b655da57 in qemu_thread_start
> #19 0x00007f120224a1b7 in start_thread
> #20 0x00007f12022cc39c in clone3
> 
> Specifically, see this code clip:
> 
> acpi_ram_update():
>      memory_region_ram_resize(mr, size, &error_abort);
>      memcpy(memory_region_get_ram_ptr(mr), data->data, size);
> 
> Per my understanding, what it does is during the reset the ROM ramblock
> will resize to the new size (normally, only larger, in my memory there used
> to have a ROM grew from 256K->512K, or something like that), then the
> memcpy() injects the latest firmware that it pre-loaded into mem.
> 
> So after such system reset, QEMU might start to see new ROM code loaded
> here (not the one that got migrated anymore, which will only match the
> version installed on src QEMU).  Here the problem is the new firmware can
> be larger, so I _think_ we need to make sure max_length is not modified by
> CPR to allow resizing happen here, while if we use truncate=true here it
> should just work in all cases.
> 
> I think it could be verified with an old QEMU running with old ROM files
> (which is smaller), then CPR migrate to a new QEMU running new ROM files
> (which is larger), then reboot to see whether that new QEMU crash.  Maybe
> we can emulate that with "romfile=XXX" parameter.
> 
> I am not fluent with ROM/firmware code, but please double check..

Thank you for the detailed analysis, I was completely wrong on this one :(

I also keep forgetting that ftruncate can grow as well as shrink a file.
I agree that preserving the dest qemu max_length, and using ftruncate, is the
correct solution, as long as dest max_length >= source max_length.

However, IMO the extra memory created by ftruncate also needs to be pinned for DMA.
We disagreed on exactly what blocks needs to be pinned in previous discussions,
and to save time I would rather not re-open that debate right now.  Instead, I propose
to simply require that max_length does not change, and return an error if it does.
If it changes in some future qemu, we can reopen the discussion.

- Steve
Peter Xu Oct. 8, 2024, 9:32 p.m. UTC | #6
On Tue, Oct 08, 2024 at 05:05:01PM -0400, Steven Sistare wrote:
> On 10/8/2024 12:26 PM, Peter Xu wrote:
> > On Tue, Oct 08, 2024 at 11:17:46AM -0400, Steven Sistare wrote:
> > > On 10/7/2024 12:28 PM, Peter Xu wrote:
> > > > On Mon, Oct 07, 2024 at 11:49:25AM -0400, Peter Xu wrote:
> > > > > On Mon, Sep 30, 2024 at 12:40:36PM -0700, Steve Sistare wrote:
> > > > > > Save the memfd for anonymous ramblocks in CPR state, along with a name
> > > > > > that uniquely identifies it.  The block's idstr is not yet set, so it
> > > > > > cannot be used for this purpose.  Find the saved memfd in new QEMU when
> > > > > > creating a block.  QEMU hard-codes the length of some internally-created
> > > > > > blocks, so to guard against that length changing, use lseek to get the
> > > > > > actual length of an incoming memfd.
> > > > > > 
> > > > > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> > > > > > ---
> > > > > >    system/physmem.c | 25 ++++++++++++++++++++++++-
> > > > > >    1 file changed, 24 insertions(+), 1 deletion(-)
> > > > > > 
> > > > > > diff --git a/system/physmem.c b/system/physmem.c
> > > > > > index 174f7e0..ddbeec9 100644
> > > > > > --- a/system/physmem.c
> > > > > > +++ b/system/physmem.c
> > > > > > @@ -72,6 +72,7 @@
> > > > > >    #include "qapi/qapi-types-migration.h"
> > > > > >    #include "migration/options.h"
> > > > > > +#include "migration/cpr.h"
> > > > > >    #include "migration/vmstate.h"
> > > > > >    #include "qemu/range.h"
> > > > > > @@ -1663,6 +1664,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
> > > > > >        }
> > > > > >    }
> > > > > > +static char *cpr_name(RAMBlock *block)
> > > > > > +{
> > > > > > +    MemoryRegion *mr = block->mr;
> > > > > > +    const char *mr_name = memory_region_name(mr);
> > > > > > +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
> > > > > > +
> > > > > > +    if (id) {
> > > > > > +        return g_strdup_printf("%s/%s", id, mr_name);
> > > > > > +    } else {
> > > > > > +        return g_strdup(mr_name);
> > > > > > +    }
> > > > > > +}
> > > > > > +
> > > > > >    size_t qemu_ram_pagesize(RAMBlock *rb)
> > > > > >    {
> > > > > >        return rb->page_size;
> > > > > > @@ -1858,14 +1872,18 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
> > > > > >                                            TYPE_MEMORY_BACKEND)) {
> > > > > >                size_t max_length = new_block->max_length;
> > > > > >                MemoryRegion *mr = new_block->mr;
> > > > > > -            const char *name = memory_region_name(mr);
> > > > > > +            g_autofree char *name = cpr_name(new_block);
> > > > > >                new_block->mr->align = QEMU_VMALLOC_ALIGN;
> > > > > >                new_block->flags |= RAM_SHARED;
> > > > > > +            new_block->fd = cpr_find_fd(name, 0);
> > > > > >                if (new_block->fd == -1) {
> > > > > >                    new_block->fd = qemu_memfd_create(name, max_length + mr->align,
> > > > > >                                                      0, 0, 0, errp);
> > > > > > +                cpr_save_fd(name, 0, new_block->fd);
> > > > > > +            } else {
> > > > > > +                new_block->max_length = lseek(new_block->fd, 0, SEEK_END);
> > > > > 
> > > > > So this can overwrite the max_length that the caller specified..
> > > > > 
> > > > > I remember we used to have some tricks on specifying different max_length
> > > > > for ROMs on dest QEMU (on which, qemu firmwares also upgraded on the dest
> > > > > host so the size can be bigger than src qemu's old ramblocks), so that the
> > > > > MR is always large enough to reload even the new firmwares, while migration
> > > > > only migrates the smaller size (used_length) so it's fine as we keep the
> > > > > extra sizes empty. I think that can relevant to the qemu_ram_resize() call
> > > > > of parse_ramblock().
> > > 
> > > Yes, resizable ram block for firmware blob is the only case I know of where
> > > the length changed in the past.  If a length changes in the future, we will
> > > need to detect and accommodate that change here, and I believe the fix will
> > > be to simply use the actual length, as per the code above.  But if you prefer,
> > > for now I can check for length change and return an error. New qemu will fail
> > > to start, and old qemu will recover.
> > > 
> > > > > The reload will not happen until some point, perhaps system resets.  I
> > > > > wonder whether that is an issue in this case.
> > > 
> > > Firmware is only generated once, via this path on x86:
> > >    qmp_x_exit_preconfig
> > >      qemu_machine_creation_done
> > >        qdev_machine_creation_done
> > >          pc_machine_done
> > >            acpi_setup
> > >              acpi_add_rom_blob
> > >                rom_add_blob
> > >                  rom_set_mr
> > > 
> > > After a system reset, the ramblock contents from memory are used as-is.
> > > 
> > > > PS: If this is needed by CPR-transfer only because mmap() later can fail
> > > > due to a bigger max_length,
> > > 
> > > That is the reason.  IMO adjusting max_length is more robust than fiddling
> > > with truncate and pretending that max_length is larger, when qemu will never
> > > be able to use the phantom space up to max_length.
> > 
> > I thought it was not pretending, but the ROM region might be resized after
> > a system reset?  I worry that your change here can violate with such
> > resizing later, so that qemu_ram_resize() can potentially fail after (1)
> > CPR-transfer upgrades completes, then follow with (2) a system reset.
> > 
> > We can observe such resizing kick off in every reboot, like:
> > 
> > (gdb) bt
> > #0  qemu_ram_resize
> > #1  0x00005602b623b740 in memory_region_ram_resize
> > #2  0x00005602b60f5580 in acpi_ram_update
> > #3  0x00005602b60f5667 in acpi_build_update
> > #4  0x00005602b5e1028b in fw_cfg_select
> > #5  0x00005602b5e105af in fw_cfg_dma_transfer
> > #6  0x00005602b5e109a8 in fw_cfg_dma_mem_write
> > #7  0x00005602b62352ec in memory_region_write_accessor
> > #8  0x00005602b62355e6 in access_with_adjusted_size
> > #9  0x00005602b6238de8 in memory_region_dispatch_write
> > #10 0x00005602b62488c5 in flatview_write_continue_step
> > #11 0x00005602b6248997 in flatview_write_continue
> > #12 0x00005602b6248abf in flatview_write
> > #13 0x00005602b6248f39 in address_space_write
> > #14 0x00005602b6248fb1 in address_space_rw
> > #15 0x00005602b62a5d86 in kvm_handle_io
> > #16 0x00005602b62a6cb2 in kvm_cpu_exec
> > #17 0x00005602b62aa37a in kvm_vcpu_thread_fn
> > #18 0x00005602b655da57 in qemu_thread_start
> > #19 0x00007f120224a1b7 in start_thread
> > #20 0x00007f12022cc39c in clone3
> > 
> > Specifically, see this code clip:
> > 
> > acpi_ram_update():
> >      memory_region_ram_resize(mr, size, &error_abort);
> >      memcpy(memory_region_get_ram_ptr(mr), data->data, size);
> > 
> > Per my understanding, what it does is during the reset the ROM ramblock
> > will resize to the new size (normally, only larger, in my memory there used
> > to have a ROM grew from 256K->512K, or something like that), then the
> > memcpy() injects the latest firmware that it pre-loaded into mem.
> > 
> > So after such system reset, QEMU might start to see new ROM code loaded
> > here (not the one that got migrated anymore, which will only match the
> > version installed on src QEMU).  Here the problem is the new firmware can
> > be larger, so I _think_ we need to make sure max_length is not modified by
> > CPR to allow resizing happen here, while if we use truncate=true here it
> > should just work in all cases.
> > 
> > I think it could be verified with an old QEMU running with old ROM files
> > (which is smaller), then CPR migrate to a new QEMU running new ROM files
> > (which is larger), then reboot to see whether that new QEMU crash.  Maybe
> > we can emulate that with "romfile=XXX" parameter.
> > 
> > I am not fluent with ROM/firmware code, but please double check..
> 
> Thank you for the detailed analysis, I was completely wrong on this one :(
> 
> I also keep forgetting that ftruncate can grow as well as shrink a file.
> I agree that preserving the dest qemu max_length, and using ftruncate, is the
> correct solution, as long as dest max_length >= source max_length.
> 
> However, IMO the extra memory created by ftruncate also needs to be pinned for DMA.
> We disagreed on exactly what blocks needs to be pinned in previous discussions,
> and to save time I would rather not re-open that debate right now.  Instead, I propose
> to simply require that max_length does not change, and return an error if it does.
> If it changes in some future qemu, we can reopen the discussion.

Hmm.. why the extra memory needs to be pinned?

From QEMU memory topology POV, anything more than used_length is not
visible to the guest, afaict.

In this specific ROM example, qemu_ram_resize() on src QEMU will first
resize the ramblock (updating used_length), then set that exact same size
with memory_region_set_size() to the MR with the size of the smaller
firmware size when src QEMU boots:

qemu_ram_resize():
    unaligned_size = newsize;
    ...
    newsize = TARGET_PAGE_ALIGN(newsize);
    newsize = REAL_HOST_PAGE_ALIGN(newsize);
    ...
    block->used_length = newsize;
    ...
    memory_region_set_size(block->mr, unaligned_size);

Here a tiny detail is the two sizes are slightly different, but the MR size
is even smaller than used_length.  The MR size decides what can be visible
to the guest, when the MR that owns the ROM file is mapped into GPA range.
That's true on the src, while after CPR migrates to dest that should still
hold true, afaict, as all the rest memory (used->max) is not yet used
before a system reset.

The extra memory (used->max) can be relevant only after a system reset,
when the new firmware will be loaded, and qemu_ram_resize() can indeed
extend that MR to cover more than before.  However that should be fine too
because that means guest memory is being rebuilt, so VFIO memory listeners
should do the right things (unpin old, repin the new ROM that is larger
this time), iiuc.
Steven Sistare Oct. 31, 2024, 8:32 p.m. UTC | #7
On 10/8/2024 5:32 PM, Peter Xu wrote:
> On Tue, Oct 08, 2024 at 05:05:01PM -0400, Steven Sistare wrote:
>> On 10/8/2024 12:26 PM, Peter Xu wrote:
>>> On Tue, Oct 08, 2024 at 11:17:46AM -0400, Steven Sistare wrote:
>>>> On 10/7/2024 12:28 PM, Peter Xu wrote:
>>>>> On Mon, Oct 07, 2024 at 11:49:25AM -0400, Peter Xu wrote:
>>>>>> On Mon, Sep 30, 2024 at 12:40:36PM -0700, Steve Sistare wrote:
>>>>>>> Save the memfd for anonymous ramblocks in CPR state, along with a name
>>>>>>> that uniquely identifies it.  The block's idstr is not yet set, so it
>>>>>>> cannot be used for this purpose.  Find the saved memfd in new QEMU when
>>>>>>> creating a block.  QEMU hard-codes the length of some internally-created
>>>>>>> blocks, so to guard against that length changing, use lseek to get the
>>>>>>> actual length of an incoming memfd.
>>>>>>>
>>>>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>>>>>> ---
>>>>>>>     system/physmem.c | 25 ++++++++++++++++++++++++-
>>>>>>>     1 file changed, 24 insertions(+), 1 deletion(-)
>>>>>>>
>>>>>>> diff --git a/system/physmem.c b/system/physmem.c
>>>>>>> index 174f7e0..ddbeec9 100644
>>>>>>> --- a/system/physmem.c
>>>>>>> +++ b/system/physmem.c
>>>>>>> @@ -72,6 +72,7 @@
>>>>>>>     #include "qapi/qapi-types-migration.h"
>>>>>>>     #include "migration/options.h"
>>>>>>> +#include "migration/cpr.h"
>>>>>>>     #include "migration/vmstate.h"
>>>>>>>     #include "qemu/range.h"
>>>>>>> @@ -1663,6 +1664,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
>>>>>>>         }
>>>>>>>     }
>>>>>>> +static char *cpr_name(RAMBlock *block)
>>>>>>> +{
>>>>>>> +    MemoryRegion *mr = block->mr;
>>>>>>> +    const char *mr_name = memory_region_name(mr);
>>>>>>> +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
>>>>>>> +
>>>>>>> +    if (id) {
>>>>>>> +        return g_strdup_printf("%s/%s", id, mr_name);
>>>>>>> +    } else {
>>>>>>> +        return g_strdup(mr_name);
>>>>>>> +    }
>>>>>>> +}
>>>>>>> +
>>>>>>>     size_t qemu_ram_pagesize(RAMBlock *rb)
>>>>>>>     {
>>>>>>>         return rb->page_size;
>>>>>>> @@ -1858,14 +1872,18 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
>>>>>>>                                             TYPE_MEMORY_BACKEND)) {
>>>>>>>                 size_t max_length = new_block->max_length;
>>>>>>>                 MemoryRegion *mr = new_block->mr;
>>>>>>> -            const char *name = memory_region_name(mr);
>>>>>>> +            g_autofree char *name = cpr_name(new_block);
>>>>>>>                 new_block->mr->align = QEMU_VMALLOC_ALIGN;
>>>>>>>                 new_block->flags |= RAM_SHARED;
>>>>>>> +            new_block->fd = cpr_find_fd(name, 0);
>>>>>>>                 if (new_block->fd == -1) {
>>>>>>>                     new_block->fd = qemu_memfd_create(name, max_length + mr->align,
>>>>>>>                                                       0, 0, 0, errp);
>>>>>>> +                cpr_save_fd(name, 0, new_block->fd);
>>>>>>> +            } else {
>>>>>>> +                new_block->max_length = lseek(new_block->fd, 0, SEEK_END);
>>>>>>
>>>>>> So this can overwrite the max_length that the caller specified..
>>>>>>
>>>>>> I remember we used to have some tricks on specifying different max_length
>>>>>> for ROMs on dest QEMU (on which, qemu firmwares also upgraded on the dest
>>>>>> host so the size can be bigger than src qemu's old ramblocks), so that the
>>>>>> MR is always large enough to reload even the new firmwares, while migration
>>>>>> only migrates the smaller size (used_length) so it's fine as we keep the
>>>>>> extra sizes empty. I think that can relevant to the qemu_ram_resize() call
>>>>>> of parse_ramblock().
>>>>
>>>> Yes, resizable ram block for firmware blob is the only case I know of where
>>>> the length changed in the past.  If a length changes in the future, we will
>>>> need to detect and accommodate that change here, and I believe the fix will
>>>> be to simply use the actual length, as per the code above.  But if you prefer,
>>>> for now I can check for length change and return an error. New qemu will fail
>>>> to start, and old qemu will recover.
>>>>
>>>>>> The reload will not happen until some point, perhaps system resets.  I
>>>>>> wonder whether that is an issue in this case.
>>>>
>>>> Firmware is only generated once, via this path on x86:
>>>>     qmp_x_exit_preconfig
>>>>       qemu_machine_creation_done
>>>>         qdev_machine_creation_done
>>>>           pc_machine_done
>>>>             acpi_setup
>>>>               acpi_add_rom_blob
>>>>                 rom_add_blob
>>>>                   rom_set_mr
>>>>
>>>> After a system reset, the ramblock contents from memory are used as-is.
>>>>
>>>>> PS: If this is needed by CPR-transfer only because mmap() later can fail
>>>>> due to a bigger max_length,
>>>>
>>>> That is the reason.  IMO adjusting max_length is more robust than fiddling
>>>> with truncate and pretending that max_length is larger, when qemu will never
>>>> be able to use the phantom space up to max_length.
>>>
>>> I thought it was not pretending, but the ROM region might be resized after
>>> a system reset?  I worry that your change here can violate with such
>>> resizing later, so that qemu_ram_resize() can potentially fail after (1)
>>> CPR-transfer upgrades completes, then follow with (2) a system reset.
>>>
>>> We can observe such resizing kick off in every reboot, like:
>>>
>>> (gdb) bt
>>> #0  qemu_ram_resize
>>> #1  0x00005602b623b740 in memory_region_ram_resize
>>> #2  0x00005602b60f5580 in acpi_ram_update
>>> #3  0x00005602b60f5667 in acpi_build_update
>>> #4  0x00005602b5e1028b in fw_cfg_select
>>> #5  0x00005602b5e105af in fw_cfg_dma_transfer
>>> #6  0x00005602b5e109a8 in fw_cfg_dma_mem_write
>>> #7  0x00005602b62352ec in memory_region_write_accessor
>>> #8  0x00005602b62355e6 in access_with_adjusted_size
>>> #9  0x00005602b6238de8 in memory_region_dispatch_write
>>> #10 0x00005602b62488c5 in flatview_write_continue_step
>>> #11 0x00005602b6248997 in flatview_write_continue
>>> #12 0x00005602b6248abf in flatview_write
>>> #13 0x00005602b6248f39 in address_space_write
>>> #14 0x00005602b6248fb1 in address_space_rw
>>> #15 0x00005602b62a5d86 in kvm_handle_io
>>> #16 0x00005602b62a6cb2 in kvm_cpu_exec
>>> #17 0x00005602b62aa37a in kvm_vcpu_thread_fn
>>> #18 0x00005602b655da57 in qemu_thread_start
>>> #19 0x00007f120224a1b7 in start_thread
>>> #20 0x00007f12022cc39c in clone3
>>>
>>> Specifically, see this code clip:
>>>
>>> acpi_ram_update():
>>>       memory_region_ram_resize(mr, size, &error_abort);
>>>       memcpy(memory_region_get_ram_ptr(mr), data->data, size);
>>>
>>> Per my understanding, what it does is during the reset the ROM ramblock
>>> will resize to the new size (normally, only larger, in my memory there used
>>> to have a ROM grew from 256K->512K, or something like that), then the
>>> memcpy() injects the latest firmware that it pre-loaded into mem.
>>>
>>> So after such system reset, QEMU might start to see new ROM code loaded
>>> here (not the one that got migrated anymore, which will only match the
>>> version installed on src QEMU).  Here the problem is the new firmware can
>>> be larger, so I _think_ we need to make sure max_length is not modified by
>>> CPR to allow resizing happen here, while if we use truncate=true here it
>>> should just work in all cases.
>>>
>>> I think it could be verified with an old QEMU running with old ROM files
>>> (which is smaller), then CPR migrate to a new QEMU running new ROM files
>>> (which is larger), then reboot to see whether that new QEMU crash.  Maybe
>>> we can emulate that with "romfile=XXX" parameter.
>>>
>>> I am not fluent with ROM/firmware code, but please double check..
>>
>> Thank you for the detailed analysis, I was completely wrong on this one :(
>>
>> I also keep forgetting that ftruncate can grow as well as shrink a file.
>> I agree that preserving the dest qemu max_length, and using ftruncate, is the
>> correct solution, as long as dest max_length >= source max_length.
>>
>> However, IMO the extra memory created by ftruncate also needs to be pinned for DMA.
>> We disagreed on exactly what blocks needs to be pinned in previous discussions,
>> and to save time I would rather not re-open that debate right now.  Instead, I propose
>> to simply require that max_length does not change, and return an error if it does.
>> If it changes in some future qemu, we can reopen the discussion.
> 
> Hmm.. why the extra memory needs to be pinned?
> 
>  From QEMU memory topology POV, anything more than used_length is not
> visible to the guest, afaict.
> 
> In this specific ROM example, qemu_ram_resize() on src QEMU will first
> resize the ramblock (updating used_length), then set that exact same size
> with memory_region_set_size() to the MR with the size of the smaller
> firmware size when src QEMU boots:
> 
> qemu_ram_resize():
>      unaligned_size = newsize;
>      ...
>      newsize = TARGET_PAGE_ALIGN(newsize);
>      newsize = REAL_HOST_PAGE_ALIGN(newsize);
>      ...
>      block->used_length = newsize;
>      ...
>      memory_region_set_size(block->mr, unaligned_size);
> 
> Here a tiny detail is the two sizes are slightly different, but the MR size
> is even smaller than used_length.  The MR size decides what can be visible
> to the guest, when the MR that owns the ROM file is mapped into GPA range.
> That's true on the src, while after CPR migrates to dest that should still
> hold true, afaict, as all the rest memory (used->max) is not yet used
> before a system reset.
> 
> The extra memory (used->max) can be relevant only after a system reset,
> when the new firmware will be loaded, and qemu_ram_resize() can indeed
> extend that MR to cover more than before.  However that should be fine too
> because that means guest memory is being rebuilt, so VFIO memory listeners
> should do the right things (unpin old, repin the new ROM that is larger
> this time), iiuc.

So, we actually agree!  I said the extra memory needs to be pinned.  You said
the memory listeners will pin the extra memory.  Cool.

I will add ftruncate to this patch to grow the mr.

- Steve
diff mbox series

Patch

diff --git a/system/physmem.c b/system/physmem.c
index 174f7e0..ddbeec9 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -72,6 +72,7 @@ 
 
 #include "qapi/qapi-types-migration.h"
 #include "migration/options.h"
+#include "migration/cpr.h"
 #include "migration/vmstate.h"
 
 #include "qemu/range.h"
@@ -1663,6 +1664,19 @@  void qemu_ram_unset_idstr(RAMBlock *block)
     }
 }
 
+static char *cpr_name(RAMBlock *block)
+{
+    MemoryRegion *mr = block->mr;
+    const char *mr_name = memory_region_name(mr);
+    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
+
+    if (id) {
+        return g_strdup_printf("%s/%s", id, mr_name);
+    } else {
+        return g_strdup(mr_name);
+    }
+}
+
 size_t qemu_ram_pagesize(RAMBlock *rb)
 {
     return rb->page_size;
@@ -1858,14 +1872,18 @@  static void ram_block_add(RAMBlock *new_block, Error **errp)
                                         TYPE_MEMORY_BACKEND)) {
             size_t max_length = new_block->max_length;
             MemoryRegion *mr = new_block->mr;
-            const char *name = memory_region_name(mr);
+            g_autofree char *name = cpr_name(new_block);
 
             new_block->mr->align = QEMU_VMALLOC_ALIGN;
             new_block->flags |= RAM_SHARED;
+            new_block->fd = cpr_find_fd(name, 0);
 
             if (new_block->fd == -1) {
                 new_block->fd = qemu_memfd_create(name, max_length + mr->align,
                                                   0, 0, 0, errp);
+                cpr_save_fd(name, 0, new_block->fd);
+            } else {
+                new_block->max_length = lseek(new_block->fd, 0, SEEK_END);
             }
 
             if (new_block->fd >= 0) {
@@ -1875,6 +1893,7 @@  static void ram_block_add(RAMBlock *new_block, Error **errp)
                                                  false, 0, errp);
             }
             if (!new_block->host) {
+                cpr_delete_fd(name, 0);
                 qemu_mutex_unlock_ramlist();
                 return;
             }
@@ -2182,6 +2201,8 @@  static void reclaim_ramblock(RAMBlock *block)
 
 void qemu_ram_free(RAMBlock *block)
 {
+    g_autofree char *name = NULL;
+
     if (!block) {
         return;
     }
@@ -2192,6 +2213,8 @@  void qemu_ram_free(RAMBlock *block)
     }
 
     qemu_mutex_lock_ramlist();
+    name = cpr_name(block);
+    cpr_delete_fd(name, 0);
     QLIST_REMOVE_RCU(block, next);
     ram_list.mru_block = NULL;
     /* Write list before version */