diff mbox series

[V4,06/19] physmem: preserve ram blocks for cpr

Message ID 1733145611-62315-7-git-send-email-steven.sistare@oracle.com (mailing list archive)
State New, archived
Headers show
Series Live update: cpr-transfer | expand

Commit Message

Steve Sistare Dec. 2, 2024, 1:19 p.m. UTC
Save the memfd for ramblocks in CPR state, along with a name that
uniquely identifies it.  The block's idstr is not yet set, so it
cannot be used for this purpose.  Find the saved memfd in new QEMU when
creating a block.  If the block size is larger in new QEMU, extend the
block using fallocate, and the extra space will be useable after a guest
reset.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

Comments

Peter Xu Dec. 9, 2024, 8:07 p.m. UTC | #1
On Mon, Dec 02, 2024 at 05:19:58AM -0800, Steve Sistare wrote:
> Save the memfd for ramblocks in CPR state, along with a name that
> uniquely identifies it.  The block's idstr is not yet set, so it
> cannot be used for this purpose.  Find the saved memfd in new QEMU when
> creating a block.  If the block size is larger in new QEMU, extend the
> block using fallocate, and the extra space will be useable after a guest
> reset.
> 
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
>  system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
>  1 file changed, 34 insertions(+), 2 deletions(-)
> 
> diff --git a/system/physmem.c b/system/physmem.c
> index 0bcb2cc..aa095a3 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -70,6 +70,7 @@
>  
>  #include "qemu/pmem.h"
>  
> +#include "migration/cpr.h"
>  #include "migration/vmstate.h"
>  
>  #include "qemu/range.h"
> @@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
>      }
>  }
>  
> +static char *cpr_name(RAMBlock *block)
> +{
> +    MemoryRegion *mr = block->mr;
> +    const char *mr_name = memory_region_name(mr);
> +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
> +
> +    if (id) {
> +        return g_strdup_printf("%s/%s", id, mr_name);
> +    } else {
> +        return g_strdup(mr_name);
> +    }
> +}
> +
>  size_t qemu_ram_pagesize(RAMBlock *rb)
>  {
>      return rb->page_size;
> @@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>  {
>      size_t max_length = new_block->max_length;
>      MemoryRegion *mr = new_block->mr;
> -    const char *name = memory_region_name(mr);
> -    int fd;
> +    g_autofree char *name = cpr_name(new_block);
> +    int fd = cpr_find_fd(name, 0);

If to use the proposed patch in the reply of patch 2, here this should be
able to be moved to qemu_ram_alloc_anonymous_fd(), IIUC.

> +
> +    if (fd >= 0) {
> +        if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
> +            error_setg_errno(errp, errno,
> +                             "cannot grow ram block %s fd %d to %ld bytes",
> +                             name, fd, max_length);
> +            goto err;
> +        }

I remember we discussed something similar to this, do we need ftruncate()
at all?  I think not.

This happens when booting QEMU, so I don't think it's relevant yet to what
size used in src, as this is dest.

It starts to get relevant only when cpr migration starts on src, it sents
ramblocks at the beginning, then parse_ramblock() will properly resize any
ramblock to whatever size it should use.

If the resize didn't happen it can only mean that used_length is correctly
matched on both sides.

So I don't see why a special truncate() call is needed yet..

> +        goto have_fd;
> +    }
>  
>      if (qemu_memfd_available()) {
>          fd = qemu_memfd_create(name, max_length + mr->align, 0, 0, 0, errp);
> @@ -2111,7 +2135,9 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>              return true;
>          }
>      }
> +    cpr_save_fd(name, 0, fd);
>  
> +have_fd:
>      new_block->mr->align = QEMU_VMALLOC_ALIGN;
>      new_block->host = file_ram_alloc(new_block, max_length, fd, false, 0, errp);
>  
> @@ -2122,6 +2148,8 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>          return true;
>      }
>  
> +err:
> +    cpr_delete_fd(name, 0);
>      close(fd);
>      return false;
>  }
> @@ -2234,6 +2262,8 @@ static void reclaim_ramblock(RAMBlock *block)
>  
>  void qemu_ram_free(RAMBlock *block)
>  {
> +    g_autofree char *name = NULL;
> +
>      if (!block) {
>          return;
>      }
> @@ -2244,6 +2274,8 @@ void qemu_ram_free(RAMBlock *block)
>      }
>  
>      qemu_mutex_lock_ramlist();
> +    name = cpr_name(block);
> +    cpr_delete_fd(name, 0);
>      QLIST_REMOVE_RCU(block, next);
>      ram_list.mru_block = NULL;
>      /* Write list before version */
> -- 
> 1.8.3.1
>
Steve Sistare Dec. 12, 2024, 8:38 p.m. UTC | #2
On 12/9/2024 3:07 PM, Peter Xu wrote:
> On Mon, Dec 02, 2024 at 05:19:58AM -0800, Steve Sistare wrote:
>> Save the memfd for ramblocks in CPR state, along with a name that
>> uniquely identifies it.  The block's idstr is not yet set, so it
>> cannot be used for this purpose.  Find the saved memfd in new QEMU when
>> creating a block.  If the block size is larger in new QEMU, extend the
>> block using fallocate, and the extra space will be useable after a guest
>> reset.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>>   system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
>>   1 file changed, 34 insertions(+), 2 deletions(-)
>>
>> diff --git a/system/physmem.c b/system/physmem.c
>> index 0bcb2cc..aa095a3 100644
>> --- a/system/physmem.c
>> +++ b/system/physmem.c
>> @@ -70,6 +70,7 @@
>>   
>>   #include "qemu/pmem.h"
>>   
>> +#include "migration/cpr.h"
>>   #include "migration/vmstate.h"
>>   
>>   #include "qemu/range.h"
>> @@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
>>       }
>>   }
>>   
>> +static char *cpr_name(RAMBlock *block)
>> +{
>> +    MemoryRegion *mr = block->mr;
>> +    const char *mr_name = memory_region_name(mr);
>> +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
>> +
>> +    if (id) {
>> +        return g_strdup_printf("%s/%s", id, mr_name);
>> +    } else {
>> +        return g_strdup(mr_name);
>> +    }
>> +}
>> +
>>   size_t qemu_ram_pagesize(RAMBlock *rb)
>>   {
>>       return rb->page_size;
>> @@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>>   {
>>       size_t max_length = new_block->max_length;
>>       MemoryRegion *mr = new_block->mr;
>> -    const char *name = memory_region_name(mr);
>> -    int fd;
>> +    g_autofree char *name = cpr_name(new_block);
>> +    int fd = cpr_find_fd(name, 0);
> 
> If to use the proposed patch in the reply of patch 2, here this should be
> able to be moved to qemu_ram_alloc_anonymous_fd(), IIUC.
> 
>> +
>> +    if (fd >= 0) {
>> +        if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
>> +            error_setg_errno(errp, errno,
>> +                             "cannot grow ram block %s fd %d to %ld bytes",
>> +                             name, fd, max_length);
>> +            goto err;
>> +        }
> 
> I remember we discussed something similar to this, do we need ftruncate()
> at all?  I think not.
> 
> This happens when booting QEMU, so I don't think it's relevant yet to what
> size used in src, as this is dest.
> 
> It starts to get relevant only when cpr migration starts on src, it sents
> ramblocks at the beginning, then parse_ramblock() will properly resize any
> ramblock to whatever size it should use.
> 
> If the resize didn't happen it can only mean that used_length is correctly
> matched on both sides.
> 
> So I don't see why a special truncate() call is needed yet..

You suggested truncate:

   https://lore.kernel.org/qemu-devel/47d6d984-7002-4086-bb10-b191168f141f@oracle.com/

   "So after such system reset, QEMU might start to see new ROM code loaded
   here (not the one that got migrated anymore, which will only match the
   version installed on src QEMU).  Here the problem is the new firmware can
   be larger, so I _think_ we need to make sure max_length is not modified by
   CPR to allow resizing happen here, while if we use truncate=true here it
   should just work in all cases."

... but you suggested passing a truncate bool to the file_ram_alloc call after
cpr_find_fd.  I could do that instead.  However, if qemu_ram_alloc_shared uses
qemu_ram_alloc_from_fd instead of file_ram_alloc, per your suggestion in patch 2,
then I will still call ftruncate here, because qemu_ram_alloc_from_fd  does not
take a truncate argument.

- Steve

>> +        goto have_fd;
>> +    }
>>   
>>       if (qemu_memfd_available()) {
>>           fd = qemu_memfd_create(name, max_length + mr->align, 0, 0, 0, errp);
>> @@ -2111,7 +2135,9 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>>               return true;
>>           }
>>       }
>> +    cpr_save_fd(name, 0, fd);
>>   
>> +have_fd:
>>       new_block->mr->align = QEMU_VMALLOC_ALIGN;
>>       new_block->host = file_ram_alloc(new_block, max_length, fd, false, 0, errp);
>>   
>> @@ -2122,6 +2148,8 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>>           return true;
>>       }
>>   
>> +err:
>> +    cpr_delete_fd(name, 0);
>>       close(fd);
>>       return false;
>>   }
>> @@ -2234,6 +2262,8 @@ static void reclaim_ramblock(RAMBlock *block)
>>   
>>   void qemu_ram_free(RAMBlock *block)
>>   {
>> +    g_autofree char *name = NULL;
>> +
>>       if (!block) {
>>           return;
>>       }
>> @@ -2244,6 +2274,8 @@ void qemu_ram_free(RAMBlock *block)
>>       }
>>   
>>       qemu_mutex_lock_ramlist();
>> +    name = cpr_name(block);
>> +    cpr_delete_fd(name, 0);
>>       QLIST_REMOVE_RCU(block, next);
>>       ram_list.mru_block = NULL;
>>       /* Write list before version */
>> -- 
>> 1.8.3.1
>>
>
Peter Xu Dec. 12, 2024, 10:48 p.m. UTC | #3
On Thu, Dec 12, 2024 at 03:38:14PM -0500, Steven Sistare wrote:
> On 12/9/2024 3:07 PM, Peter Xu wrote:
> > On Mon, Dec 02, 2024 at 05:19:58AM -0800, Steve Sistare wrote:
> > > Save the memfd for ramblocks in CPR state, along with a name that
> > > uniquely identifies it.  The block's idstr is not yet set, so it
> > > cannot be used for this purpose.  Find the saved memfd in new QEMU when
> > > creating a block.  If the block size is larger in new QEMU, extend the
> > > block using fallocate, and the extra space will be useable after a guest
> > > reset.
> > > 
> > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> > > ---
> > >   system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
> > >   1 file changed, 34 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/system/physmem.c b/system/physmem.c
> > > index 0bcb2cc..aa095a3 100644
> > > --- a/system/physmem.c
> > > +++ b/system/physmem.c
> > > @@ -70,6 +70,7 @@
> > >   #include "qemu/pmem.h"
> > > +#include "migration/cpr.h"
> > >   #include "migration/vmstate.h"
> > >   #include "qemu/range.h"
> > > @@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
> > >       }
> > >   }
> > > +static char *cpr_name(RAMBlock *block)
> > > +{
> > > +    MemoryRegion *mr = block->mr;
> > > +    const char *mr_name = memory_region_name(mr);
> > > +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
> > > +
> > > +    if (id) {
> > > +        return g_strdup_printf("%s/%s", id, mr_name);
> > > +    } else {
> > > +        return g_strdup(mr_name);
> > > +    }
> > > +}
> > > +
> > >   size_t qemu_ram_pagesize(RAMBlock *rb)
> > >   {
> > >       return rb->page_size;
> > > @@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
> > >   {
> > >       size_t max_length = new_block->max_length;
> > >       MemoryRegion *mr = new_block->mr;
> > > -    const char *name = memory_region_name(mr);
> > > -    int fd;
> > > +    g_autofree char *name = cpr_name(new_block);
> > > +    int fd = cpr_find_fd(name, 0);
> > 
> > If to use the proposed patch in the reply of patch 2, here this should be
> > able to be moved to qemu_ram_alloc_anonymous_fd(), IIUC.
> > 
> > > +
> > > +    if (fd >= 0) {
> > > +        if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
> > > +            error_setg_errno(errp, errno,
> > > +                             "cannot grow ram block %s fd %d to %ld bytes",
> > > +                             name, fd, max_length);
> > > +            goto err;
> > > +        }
> > 
> > I remember we discussed something similar to this, do we need ftruncate()
> > at all?  I think not.
> > 
> > This happens when booting QEMU, so I don't think it's relevant yet to what
> > size used in src, as this is dest.
> > 
> > It starts to get relevant only when cpr migration starts on src, it sents
> > ramblocks at the beginning, then parse_ramblock() will properly resize any
> > ramblock to whatever size it should use.
> > 
> > If the resize didn't happen it can only mean that used_length is correctly
> > matched on both sides.
> > 
> > So I don't see why a special truncate() call is needed yet..
> 
> You suggested truncate:
> 
>   https://lore.kernel.org/qemu-devel/47d6d984-7002-4086-bb10-b191168f141f@oracle.com/
> 
>   "So after such system reset, QEMU might start to see new ROM code loaded
>   here (not the one that got migrated anymore, which will only match the
>   version installed on src QEMU).  Here the problem is the new firmware can
>   be larger, so I _think_ we need to make sure max_length is not modified by
>   CPR to allow resizing happen here, while if we use truncate=true here it
>   should just work in all cases."
> 
> ... but you suggested passing a truncate bool to the file_ram_alloc call after
> cpr_find_fd.  I could do that instead.  However, if qemu_ram_alloc_shared uses
> qemu_ram_alloc_from_fd instead of file_ram_alloc, per your suggestion in patch 2,
> then I will still call ftruncate here, because qemu_ram_alloc_from_fd  does not
> take a truncate argument.

My memory was when reuse qemu_ram_alloc_from_fd() in that suggestion of
patch 2, it will only create zero-length fd (with fsize=0) and leave all
the rest to qemu_ram_alloc_from_fd(), then with that:

qemu_ram_alloc_from_fd:
    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
                                     errp);

So that'll always have truncate==!file_size==1. Then truncate will be done
at file_ram_alloc() later, iiuc.

    if (truncate && ftruncate(fd, offset + memory)) {
        perror("ftruncate");
    }

Would this work?

Meanwhile, this whole ram resize discussion reminded me that to reuse
qemu_ram_alloc_from_fd(), we may also want to make sure to pass ->resized()
hook from qemu_ram_alloc_internal() to the fd helper too..  IOW, we may
want to keep qemu_ram_resize() invoke those hooks, even after switching to
fd-based for aux mems.

Maybe the size / max_size also need to be passed over?  As for fd ramblock
it used to be always the same on used_length/max_length, but not anymore
when we switch aux mem to fd based.  Please feel free to double check..
Peter Xu Dec. 13, 2024, 3:21 p.m. UTC | #4
On Thu, Dec 12, 2024 at 05:48:03PM -0500, Peter Xu wrote:
> On Thu, Dec 12, 2024 at 03:38:14PM -0500, Steven Sistare wrote:
> > On 12/9/2024 3:07 PM, Peter Xu wrote:
> > > On Mon, Dec 02, 2024 at 05:19:58AM -0800, Steve Sistare wrote:
> > > > Save the memfd for ramblocks in CPR state, along with a name that
> > > > uniquely identifies it.  The block's idstr is not yet set, so it
> > > > cannot be used for this purpose.  Find the saved memfd in new QEMU when
> > > > creating a block.  If the block size is larger in new QEMU, extend the
> > > > block using fallocate, and the extra space will be useable after a guest
> > > > reset.
> > > > 
> > > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> > > > ---
> > > >   system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
> > > >   1 file changed, 34 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/system/physmem.c b/system/physmem.c
> > > > index 0bcb2cc..aa095a3 100644
> > > > --- a/system/physmem.c
> > > > +++ b/system/physmem.c
> > > > @@ -70,6 +70,7 @@
> > > >   #include "qemu/pmem.h"
> > > > +#include "migration/cpr.h"
> > > >   #include "migration/vmstate.h"
> > > >   #include "qemu/range.h"
> > > > @@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
> > > >       }
> > > >   }
> > > > +static char *cpr_name(RAMBlock *block)
> > > > +{
> > > > +    MemoryRegion *mr = block->mr;
> > > > +    const char *mr_name = memory_region_name(mr);
> > > > +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
> > > > +
> > > > +    if (id) {
> > > > +        return g_strdup_printf("%s/%s", id, mr_name);
> > > > +    } else {
> > > > +        return g_strdup(mr_name);
> > > > +    }
> > > > +}
> > > > +
> > > >   size_t qemu_ram_pagesize(RAMBlock *rb)
> > > >   {
> > > >       return rb->page_size;
> > > > @@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
> > > >   {
> > > >       size_t max_length = new_block->max_length;
> > > >       MemoryRegion *mr = new_block->mr;
> > > > -    const char *name = memory_region_name(mr);
> > > > -    int fd;
> > > > +    g_autofree char *name = cpr_name(new_block);
> > > > +    int fd = cpr_find_fd(name, 0);
> > > 
> > > If to use the proposed patch in the reply of patch 2, here this should be
> > > able to be moved to qemu_ram_alloc_anonymous_fd(), IIUC.
> > > 
> > > > +
> > > > +    if (fd >= 0) {
> > > > +        if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
> > > > +            error_setg_errno(errp, errno,
> > > > +                             "cannot grow ram block %s fd %d to %ld bytes",
> > > > +                             name, fd, max_length);
> > > > +            goto err;
> > > > +        }
> > > 
> > > I remember we discussed something similar to this, do we need ftruncate()
> > > at all?  I think not.
> > > 
> > > This happens when booting QEMU, so I don't think it's relevant yet to what
> > > size used in src, as this is dest.
> > > 
> > > It starts to get relevant only when cpr migration starts on src, it sents
> > > ramblocks at the beginning, then parse_ramblock() will properly resize any
> > > ramblock to whatever size it should use.
> > > 
> > > If the resize didn't happen it can only mean that used_length is correctly
> > > matched on both sides.
> > > 
> > > So I don't see why a special truncate() call is needed yet..
> > 
> > You suggested truncate:
> > 
> >   https://lore.kernel.org/qemu-devel/47d6d984-7002-4086-bb10-b191168f141f@oracle.com/
> > 
> >   "So after such system reset, QEMU might start to see new ROM code loaded
> >   here (not the one that got migrated anymore, which will only match the
> >   version installed on src QEMU).  Here the problem is the new firmware can
> >   be larger, so I _think_ we need to make sure max_length is not modified by
> >   CPR to allow resizing happen here, while if we use truncate=true here it
> >   should just work in all cases."
> > 
> > ... but you suggested passing a truncate bool to the file_ram_alloc call after
> > cpr_find_fd.  I could do that instead.  However, if qemu_ram_alloc_shared uses
> > qemu_ram_alloc_from_fd instead of file_ram_alloc, per your suggestion in patch 2,
> > then I will still call ftruncate here, because qemu_ram_alloc_from_fd  does not
> > take a truncate argument.
> 

[begin]

> My memory was when reuse qemu_ram_alloc_from_fd() in that suggestion of
> patch 2, it will only create zero-length fd (with fsize=0) and leave all
> the rest to qemu_ram_alloc_from_fd(), then with that:
> 
> qemu_ram_alloc_from_fd:
>     new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
>                                      errp);
> 
> So that'll always have truncate==!file_size==1. Then truncate will be done
> at file_ram_alloc() later, iiuc.
> 
>     if (truncate && ftruncate(fd, offset + memory)) {
>         perror("ftruncate");
>     }
> 
> Would this work?

[end]

Please feel free to ignore [begin]->[end]..  I guess I didn't really answer
it.

Now after I re-read the question.. considering that we have been very
cautious on the fsize here:

    file_size = get_file_size(fd);
    if (file_size > offset && file_size < (offset + size)) {
        error_setg(errp, "backing store size 0x%" PRIx64
                   " does not match 'size' option 0x" RAM_ADDR_FMT,
                   file_size, size);
        return NULL;
    }

I suppose your change makes sense.  So please feel free to keep the
truncation change.  I wished we could already auto-enlarge the file size
there already instead of failing, but I think I see why we're over cautious
on this - we want to still provide some safety in case some wrong file path
passed over to QEMU, to not easily corrupt the file when that happens.  So
we assume the file must be pre-truncated to say this is the right ram file.

Though if you wouldn't mind, I'd still request a comment explaining it,
because it probably isn't obvious..

AFAICT it's only relevant to resizable RAM and also the fact that it'll be
able to present now in fd-based ramblocks.  Maybe also mention the fact of
our cautious on changing file sizes on fd-based, but not avoidable to do it
here to make resizable work for firmwares.  Any form of comment would help.

OTOH, below comments should still worth checking.

> 
> Meanwhile, this whole ram resize discussion reminded me that to reuse
> qemu_ram_alloc_from_fd(), we may also want to make sure to pass ->resized()
> hook from qemu_ram_alloc_internal() to the fd helper too..  IOW, we may
> want to keep qemu_ram_resize() invoke those hooks, even after switching to
> fd-based for aux mems.
> 
> Maybe the size / max_size also need to be passed over?  As for fd ramblock
> it used to be always the same on used_length/max_length, but not anymore
> when we switch aux mem to fd based.  Please feel free to double check..
> 
> -- 
> Peter Xu
Steve Sistare Dec. 13, 2024, 3:30 p.m. UTC | #5
On 12/13/2024 10:21 AM, Peter Xu wrote:
> On Thu, Dec 12, 2024 at 05:48:03PM -0500, Peter Xu wrote:
>> On Thu, Dec 12, 2024 at 03:38:14PM -0500, Steven Sistare wrote:
>>> On 12/9/2024 3:07 PM, Peter Xu wrote:
>>>> On Mon, Dec 02, 2024 at 05:19:58AM -0800, Steve Sistare wrote:
>>>>> Save the memfd for ramblocks in CPR state, along with a name that
>>>>> uniquely identifies it.  The block's idstr is not yet set, so it
>>>>> cannot be used for this purpose.  Find the saved memfd in new QEMU when
>>>>> creating a block.  If the block size is larger in new QEMU, extend the
>>>>> block using fallocate, and the extra space will be useable after a guest
>>>>> reset.
>>>>>
>>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>>>> ---
>>>>>    system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
>>>>>    1 file changed, 34 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/system/physmem.c b/system/physmem.c
>>>>> index 0bcb2cc..aa095a3 100644
>>>>> --- a/system/physmem.c
>>>>> +++ b/system/physmem.c
>>>>> @@ -70,6 +70,7 @@
>>>>>    #include "qemu/pmem.h"
>>>>> +#include "migration/cpr.h"
>>>>>    #include "migration/vmstate.h"
>>>>>    #include "qemu/range.h"
>>>>> @@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
>>>>>        }
>>>>>    }
>>>>> +static char *cpr_name(RAMBlock *block)
>>>>> +{
>>>>> +    MemoryRegion *mr = block->mr;
>>>>> +    const char *mr_name = memory_region_name(mr);
>>>>> +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
>>>>> +
>>>>> +    if (id) {
>>>>> +        return g_strdup_printf("%s/%s", id, mr_name);
>>>>> +    } else {
>>>>> +        return g_strdup(mr_name);
>>>>> +    }
>>>>> +}
>>>>> +
>>>>>    size_t qemu_ram_pagesize(RAMBlock *rb)
>>>>>    {
>>>>>        return rb->page_size;
>>>>> @@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>>>>>    {
>>>>>        size_t max_length = new_block->max_length;
>>>>>        MemoryRegion *mr = new_block->mr;
>>>>> -    const char *name = memory_region_name(mr);
>>>>> -    int fd;
>>>>> +    g_autofree char *name = cpr_name(new_block);
>>>>> +    int fd = cpr_find_fd(name, 0);
>>>>
>>>> If to use the proposed patch in the reply of patch 2, here this should be
>>>> able to be moved to qemu_ram_alloc_anonymous_fd(), IIUC.
>>>>
>>>>> +
>>>>> +    if (fd >= 0) {
>>>>> +        if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
>>>>> +            error_setg_errno(errp, errno,
>>>>> +                             "cannot grow ram block %s fd %d to %ld bytes",
>>>>> +                             name, fd, max_length);
>>>>> +            goto err;
>>>>> +        }
>>>>
>>>> I remember we discussed something similar to this, do we need ftruncate()
>>>> at all?  I think not.
>>>>
>>>> This happens when booting QEMU, so I don't think it's relevant yet to what
>>>> size used in src, as this is dest.
>>>>
>>>> It starts to get relevant only when cpr migration starts on src, it sents
>>>> ramblocks at the beginning, then parse_ramblock() will properly resize any
>>>> ramblock to whatever size it should use.
>>>>
>>>> If the resize didn't happen it can only mean that used_length is correctly
>>>> matched on both sides.
>>>>
>>>> So I don't see why a special truncate() call is needed yet..
>>>
>>> You suggested truncate:
>>>
>>>    https://lore.kernel.org/qemu-devel/47d6d984-7002-4086-bb10-b191168f141f@oracle.com/
>>>
>>>    "So after such system reset, QEMU might start to see new ROM code loaded
>>>    here (not the one that got migrated anymore, which will only match the
>>>    version installed on src QEMU).  Here the problem is the new firmware can
>>>    be larger, so I _think_ we need to make sure max_length is not modified by
>>>    CPR to allow resizing happen here, while if we use truncate=true here it
>>>    should just work in all cases."
>>>
>>> ... but you suggested passing a truncate bool to the file_ram_alloc call after
>>> cpr_find_fd.  I could do that instead.  However, if qemu_ram_alloc_shared uses
>>> qemu_ram_alloc_from_fd instead of file_ram_alloc, per your suggestion in patch 2,
>>> then I will still call ftruncate here, because qemu_ram_alloc_from_fd  does not
>>> take a truncate argument.
>>
> 
> [begin]
> 
>> My memory was when reuse qemu_ram_alloc_from_fd() in that suggestion of
>> patch 2, it will only create zero-length fd (with fsize=0) and leave all
>> the rest to qemu_ram_alloc_from_fd(), then with that:
>>
>> qemu_ram_alloc_from_fd:
>>      new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
>>                                       errp);
>>
>> So that'll always have truncate==!file_size==1. Then truncate will be done
>> at file_ram_alloc() later, iiuc.
>>
>>      if (truncate && ftruncate(fd, offset + memory)) {
>>          perror("ftruncate");
>>      }
>>
>> Would this work?
> 
> [end]
> 
> Please feel free to ignore [begin]->[end]..  I guess I didn't really answer
> it.
> 
> Now after I re-read the question.. considering that we have been very
> cautious on the fsize here:
> 
>      file_size = get_file_size(fd);
>      if (file_size > offset && file_size < (offset + size)) {
>          error_setg(errp, "backing store size 0x%" PRIx64
>                     " does not match 'size' option 0x" RAM_ADDR_FMT,
>                     file_size, size);
>          return NULL;
>      }
> 
> I suppose your change makes sense.  So please feel free to keep the
> truncation change.  I wished we could already auto-enlarge the file size
> there already instead of failing, but I think I see why we're over cautious
> on this - we want to still provide some safety in case some wrong file path
> passed over to QEMU, to not easily corrupt the file when that happens.  So
> we assume the file must be pre-truncated to say this is the right ram file.
> 
> Though if you wouldn't mind, I'd still request a comment explaining it,
> because it probably isn't obvious..
> 
> AFAICT it's only relevant to resizable RAM and also the fact that it'll be
> able to present now in fd-based ramblocks.  Maybe also mention the fact of
> our cautious on changing file sizes on fd-based, but not avoidable to do it
> here to make resizable work for firmwares.  Any form of comment would help.

Perhaps more to the point, for the incoming cpr memfd, the file size is not 0, so
the logic in qemu_ram_alloc_from_fd does not right-size it.   Calling ftruncate
prior does the right thing.

I will add comments.

> OTOH, below comments should still worth checking.
>>
>> Meanwhile, this whole ram resize discussion reminded me that to reuse
>> qemu_ram_alloc_from_fd(), we may also want to make sure to pass ->resized()
>> hook from qemu_ram_alloc_internal() to the fd helper too..  IOW, we may
>> want to keep qemu_ram_resize() invoke those hooks, even after switching to
>> fd-based for aux mems.
>>
>> Maybe the size / max_size also need to be passed over?  As for fd ramblock
>> it used to be always the same on used_length/max_length, but not anymore
>> when we switch aux mem to fd based.  Please feel free to double check..

Yes, I had already added the resize and size parameters to the helper function
prior to this email.

- Steve
Steve Sistare Dec. 18, 2024, 4:34 p.m. UTC | #6
On 12/13/2024 10:30 AM, Steven Sistare wrote:
> On 12/13/2024 10:21 AM, Peter Xu wrote:
>> On Thu, Dec 12, 2024 at 05:48:03PM -0500, Peter Xu wrote:
>>> On Thu, Dec 12, 2024 at 03:38:14PM -0500, Steven Sistare wrote:
>>>> On 12/9/2024 3:07 PM, Peter Xu wrote:
>>>>> On Mon, Dec 02, 2024 at 05:19:58AM -0800, Steve Sistare wrote:
>>>>>> Save the memfd for ramblocks in CPR state, along with a name that
>>>>>> uniquely identifies it.  The block's idstr is not yet set, so it
>>>>>> cannot be used for this purpose.  Find the saved memfd in new QEMU when
>>>>>> creating a block.  If the block size is larger in new QEMU, extend the
>>>>>> block using fallocate, and the extra space will be useable after a guest
>>>>>> reset.
>>>>>>
>>>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>>>>> ---
>>>>>>    system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
>>>>>>    1 file changed, 34 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/system/physmem.c b/system/physmem.c
>>>>>> index 0bcb2cc..aa095a3 100644
>>>>>> --- a/system/physmem.c
>>>>>> +++ b/system/physmem.c
>>>>>> @@ -70,6 +70,7 @@
>>>>>>    #include "qemu/pmem.h"
>>>>>> +#include "migration/cpr.h"
>>>>>>    #include "migration/vmstate.h"
>>>>>>    #include "qemu/range.h"
>>>>>> @@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
>>>>>>        }
>>>>>>    }
>>>>>> +static char *cpr_name(RAMBlock *block)
>>>>>> +{
>>>>>> +    MemoryRegion *mr = block->mr;
>>>>>> +    const char *mr_name = memory_region_name(mr);
>>>>>> +    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
>>>>>> +
>>>>>> +    if (id) {
>>>>>> +        return g_strdup_printf("%s/%s", id, mr_name);
>>>>>> +    } else {
>>>>>> +        return g_strdup(mr_name);
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>>    size_t qemu_ram_pagesize(RAMBlock *rb)
>>>>>>    {
>>>>>>        return rb->page_size;
>>>>>> @@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>>>>>>    {
>>>>>>        size_t max_length = new_block->max_length;
>>>>>>        MemoryRegion *mr = new_block->mr;
>>>>>> -    const char *name = memory_region_name(mr);
>>>>>> -    int fd;
>>>>>> +    g_autofree char *name = cpr_name(new_block);
>>>>>> +    int fd = cpr_find_fd(name, 0);
>>>>>
>>>>> If to use the proposed patch in the reply of patch 2, here this should be
>>>>> able to be moved to qemu_ram_alloc_anonymous_fd(), IIUC.
>>>>>
>>>>>> +
>>>>>> +    if (fd >= 0) {
>>>>>> +        if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
>>>>>> +            error_setg_errno(errp, errno,
>>>>>> +                             "cannot grow ram block %s fd %d to %ld bytes",
>>>>>> +                             name, fd, max_length);
>>>>>> +            goto err;
>>>>>> +        }
>>>>>
>>>>> I remember we discussed something similar to this, do we need ftruncate()
>>>>> at all?  I think not.
>>>>>
>>>>> This happens when booting QEMU, so I don't think it's relevant yet to what
>>>>> size used in src, as this is dest.
>>>>>
>>>>> It starts to get relevant only when cpr migration starts on src, it sents
>>>>> ramblocks at the beginning, then parse_ramblock() will properly resize any
>>>>> ramblock to whatever size it should use.
>>>>>
>>>>> If the resize didn't happen it can only mean that used_length is correctly
>>>>> matched on both sides.
>>>>>
>>>>> So I don't see why a special truncate() call is needed yet..
>>>>
>>>> You suggested truncate:
>>>>
>>>>    https://lore.kernel.org/qemu-devel/47d6d984-7002-4086-bb10-b191168f141f@oracle.com/
>>>>
>>>>    "So after such system reset, QEMU might start to see new ROM code loaded
>>>>    here (not the one that got migrated anymore, which will only match the
>>>>    version installed on src QEMU).  Here the problem is the new firmware can
>>>>    be larger, so I _think_ we need to make sure max_length is not modified by
>>>>    CPR to allow resizing happen here, while if we use truncate=true here it
>>>>    should just work in all cases."
>>>>
>>>> ... but you suggested passing a truncate bool to the file_ram_alloc call after
>>>> cpr_find_fd.  I could do that instead.  However, if qemu_ram_alloc_shared uses
>>>> qemu_ram_alloc_from_fd instead of file_ram_alloc, per your suggestion in patch 2,
>>>> then I will still call ftruncate here, because qemu_ram_alloc_from_fd  does not
>>>> take a truncate argument.
>>>
>>
>> [begin]
>>
>>> My memory was when reuse qemu_ram_alloc_from_fd() in that suggestion of
>>> patch 2, it will only create zero-length fd (with fsize=0) and leave all
>>> the rest to qemu_ram_alloc_from_fd(), then with that:
>>>
>>> qemu_ram_alloc_from_fd:
>>>      new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
>>>                                       errp);
>>>
>>> So that'll always have truncate==!file_size==1. Then truncate will be done
>>> at file_ram_alloc() later, iiuc.
>>>
>>>      if (truncate && ftruncate(fd, offset + memory)) {
>>>          perror("ftruncate");
>>>      }
>>>
>>> Would this work?
>>
>> [end]
>>
>> Please feel free to ignore [begin]->[end]..  I guess I didn't really answer
>> it.
>>
>> Now after I re-read the question.. considering that we have been very
>> cautious on the fsize here:
>>
>>      file_size = get_file_size(fd);
>>      if (file_size > offset && file_size < (offset + size)) {
>>          error_setg(errp, "backing store size 0x%" PRIx64
>>                     " does not match 'size' option 0x" RAM_ADDR_FMT,
>>                     file_size, size);
>>          return NULL;
>>      }
>>
>> I suppose your change makes sense.  So please feel free to keep the
>> truncation change.  I wished we could already auto-enlarge the file size
>> there already instead of failing, but I think I see why we're over cautious
>> on this - we want to still provide some safety in case some wrong file path
>> passed over to QEMU, to not easily corrupt the file when that happens.  So
>> we assume the file must be pre-truncated to say this is the right ram file.
>>
>> Though if you wouldn't mind, I'd still request a comment explaining it,
>> because it probably isn't obvious..
>>
>> AFAICT it's only relevant to resizable RAM and also the fact that it'll be
>> able to present now in fd-based ramblocks.  Maybe also mention the fact of
>> our cautious on changing file sizes on fd-based, but not avoidable to do it
>> here to make resizable work for firmwares.  Any form of comment would help.
> 
> Perhaps more to the point, for the incoming cpr memfd, the file size is not 0, so
> the logic in qemu_ram_alloc_from_fd does not right-size it.   Calling ftruncate
> prior does the right thing.
> 
> I will add comments.

After adding resizable support to qemu_ram_alloc_from_fd, I can also tweak it
to grow the file while preserving error checking for the general case, and
delete the explicit ftruncate in its caller:

     /*
      * Allow file_ram_alloc to grow the file during CPR, if a resizable
      * memory region wants a larger block than the incoming current size.
      */
     file_size = get_file_size(fd);
     if (file_size && file_size < offset + max_size && size == max_size &&
         migrate_mode() != MIG_MODE_CPR_TRANSFER) {
         error_setg(errp, "backing store size 0x%" PRIx64
                    " does not match 'size' option 0x" RAM_ADDR_FMT,
                    file_size, max_size);
         return NULL;
     }
     ...
     new_block->host = file_ram_alloc(new_block, max_size, fd,
                                      file_size < offset + max_size,
                                      offset, errp);

- Steve
Peter Xu Dec. 18, 2024, 5 p.m. UTC | #7
On Wed, Dec 18, 2024 at 11:34:34AM -0500, Steven Sistare wrote:
> After adding resizable support to qemu_ram_alloc_from_fd, I can also tweak it
> to grow the file while preserving error checking for the general case, and
> delete the explicit ftruncate in its caller:
> 
>     /*
>      * Allow file_ram_alloc to grow the file during CPR, if a resizable
>      * memory region wants a larger block than the incoming current size.
>      */
>     file_size = get_file_size(fd);
>     if (file_size && file_size < offset + max_size && size == max_size &&
>         migrate_mode() != MIG_MODE_CPR_TRANSFER) {

Firstly, this check is growing too long, maybe worthwhile to have a helper
already.

file_size_check():
    // COMMENTS...
    if (migrate_mode() == XXX) {
        return true;
    }

Said that, I think it's better we also add the flag to enforce the
truncation, only if cpr found a fd.  E.g. we may want to keep the old
behavior even if the user sets migrate mode to CPR (even without a
migration happening at all), then create a fd ramblock.

>         error_setg(errp, "backing store size 0x%" PRIx64
>                    " does not match 'size' option 0x" RAM_ADDR_FMT,
>                    file_size, max_size);
>         return NULL;
>     }
>     ...
>     new_block->host = file_ram_alloc(new_block, max_size, fd,
>                                      file_size < offset + max_size,
>                                      offset, errp);
> 
> - Steve
>
Steve Sistare Dec. 18, 2024, 8:22 p.m. UTC | #8
On 12/18/2024 12:00 PM, Peter Xu wrote:
> On Wed, Dec 18, 2024 at 11:34:34AM -0500, Steven Sistare wrote:
>> After adding resizable support to qemu_ram_alloc_from_fd, I can also tweak it
>> to grow the file while preserving error checking for the general case, and
>> delete the explicit ftruncate in its caller:
>>
>>      /*
>>       * Allow file_ram_alloc to grow the file during CPR, if a resizable
>>       * memory region wants a larger block than the incoming current size.
>>       */
>>      file_size = get_file_size(fd);
>>      if (file_size && file_size < offset + max_size && size == max_size &&
>>          migrate_mode() != MIG_MODE_CPR_TRANSFER) {
>>[...]
> 
> Firstly, this check is growing too long, maybe worthwhile to have a helper
> already.
> 
> file_size_check():
>      // COMMENTS...
>      if (migrate_mode() == XXX) {
>          return true;
>      }
> 
> Said that, I think it's better we also add the flag to enforce the
> truncation, only if cpr found a fd.  E.g. we may want to keep the old
> behavior even if the user sets migrate mode to CPR (even without a
> migration happening at all), then create a fd ramblock.

That was my intent.  Normally mode becomes TRANSFER only when outgoing migration
is about to start, or is incoming, but conceivably the source qemu user could
set mode early, before creating some object requiring aux memory.

I can add a grow parameter to qemu_ram_alloc_from_fd and pass true only
if the fd came from cpr_find_fd.  Sound OK?

RAMBlock *qemu_ram_alloc_from_fd(..., bool grow)
     if (file_size && file_size < offset + max_size && !grow) {
         error_setg(...
     ...
     new_block->host = file_ram_alloc(new_block, max_size, fd,
                                      file_size < offset + max_size,
                                      offset, errp);

- Steve
Peter Xu Dec. 18, 2024, 8:33 p.m. UTC | #9
On Wed, Dec 18, 2024 at 03:22:50PM -0500, Steven Sistare wrote:
> On 12/18/2024 12:00 PM, Peter Xu wrote:
> > On Wed, Dec 18, 2024 at 11:34:34AM -0500, Steven Sistare wrote:
> > > After adding resizable support to qemu_ram_alloc_from_fd, I can also tweak it
> > > to grow the file while preserving error checking for the general case, and
> > > delete the explicit ftruncate in its caller:
> > > 
> > >      /*
> > >       * Allow file_ram_alloc to grow the file during CPR, if a resizable
> > >       * memory region wants a larger block than the incoming current size.
> > >       */
> > >      file_size = get_file_size(fd);
> > >      if (file_size && file_size < offset + max_size && size == max_size &&
> > >          migrate_mode() != MIG_MODE_CPR_TRANSFER) {
> > > [...]
> > 
> > Firstly, this check is growing too long, maybe worthwhile to have a helper
> > already.
> > 
> > file_size_check():
> >      // COMMENTS...
> >      if (migrate_mode() == XXX) {
> >          return true;
> >      }
> > 
> > Said that, I think it's better we also add the flag to enforce the
> > truncation, only if cpr found a fd.  E.g. we may want to keep the old
> > behavior even if the user sets migrate mode to CPR (even without a
> > migration happening at all), then create a fd ramblock.
> 
> That was my intent.  Normally mode becomes TRANSFER only when outgoing migration
> is about to start, or is incoming, but conceivably the source qemu user could
> set mode early, before creating some object requiring aux memory.

Such ordering may not be wanted, and can be too trivial.

We used to discuss with Dan, and we wished all migration caps/params/modes
will only be set in the QMP "migrate" command, rather than being separate.

Actually we may start supporting such in the near future, taking all
migration setup in the QMP command 'migrate'.  Then none of migration
caps/params/modes will be global anymore, but only taken from the QMP
command.  From that POV, better not rely on that.

> 
> I can add a grow parameter to qemu_ram_alloc_from_fd and pass true only
> if the fd came from cpr_find_fd.  Sound OK?
> 
> RAMBlock *qemu_ram_alloc_from_fd(..., bool grow)
>     if (file_size && file_size < offset + max_size && !grow) {
>         error_setg(...
>     ...
>     new_block->host = file_ram_alloc(new_block, max_size, fd,
>                                      file_size < offset + max_size,
>                                      offset, errp);

Sounds good.

Thanks,
diff mbox series

Patch

diff --git a/system/physmem.c b/system/physmem.c
index 0bcb2cc..aa095a3 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -70,6 +70,7 @@ 
 
 #include "qemu/pmem.h"
 
+#include "migration/cpr.h"
 #include "migration/vmstate.h"
 
 #include "qemu/range.h"
@@ -1661,6 +1662,19 @@  void qemu_ram_unset_idstr(RAMBlock *block)
     }
 }
 
+static char *cpr_name(RAMBlock *block)
+{
+    MemoryRegion *mr = block->mr;
+    const char *mr_name = memory_region_name(mr);
+    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
+
+    if (id) {
+        return g_strdup_printf("%s/%s", id, mr_name);
+    } else {
+        return g_strdup(mr_name);
+    }
+}
+
 size_t qemu_ram_pagesize(RAMBlock *rb)
 {
     return rb->page_size;
@@ -2080,8 +2094,18 @@  static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
 {
     size_t max_length = new_block->max_length;
     MemoryRegion *mr = new_block->mr;
-    const char *name = memory_region_name(mr);
-    int fd;
+    g_autofree char *name = cpr_name(new_block);
+    int fd = cpr_find_fd(name, 0);
+
+    if (fd >= 0) {
+        if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
+            error_setg_errno(errp, errno,
+                             "cannot grow ram block %s fd %d to %ld bytes",
+                             name, fd, max_length);
+            goto err;
+        }
+        goto have_fd;
+    }
 
     if (qemu_memfd_available()) {
         fd = qemu_memfd_create(name, max_length + mr->align, 0, 0, 0, errp);
@@ -2111,7 +2135,9 @@  static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
             return true;
         }
     }
+    cpr_save_fd(name, 0, fd);
 
+have_fd:
     new_block->mr->align = QEMU_VMALLOC_ALIGN;
     new_block->host = file_ram_alloc(new_block, max_length, fd, false, 0, errp);
 
@@ -2122,6 +2148,8 @@  static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
         return true;
     }
 
+err:
+    cpr_delete_fd(name, 0);
     close(fd);
     return false;
 }
@@ -2234,6 +2262,8 @@  static void reclaim_ramblock(RAMBlock *block)
 
 void qemu_ram_free(RAMBlock *block)
 {
+    g_autofree char *name = NULL;
+
     if (!block) {
         return;
     }
@@ -2244,6 +2274,8 @@  void qemu_ram_free(RAMBlock *block)
     }
 
     qemu_mutex_lock_ramlist();
+    name = cpr_name(block);
+    cpr_delete_fd(name, 0);
     QLIST_REMOVE_RCU(block, next);
     ram_list.mru_block = NULL;
     /* Write list before version */