Message ID | 1727725244-105198-14-git-send-email-steven.sistare@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Live update: cpr-transfer | expand |
On Mon, Sep 30, 2024 at 12:40:44PM -0700, Steve Sistare wrote: > Add the cpr-transfer migration mode. Usage: > qemu-system-$arch -machine anon-alloc=memfd ... > > start new QEMU with "-incoming <uri-1> -cpr-uri <uri-2>" > > Issue commands to old QEMU: > migrate_set_parameter mode cpr-transfer > migrate_set_parameter cpr-uri <uri-2> > migrate -d <uri-1> > > The migrate command stops the VM, saves CPR state to uri-2, saves > normal migration state to uri-1, and old QEMU enters the postmigrate > state. The user starts new QEMU on the same host as old QEMU, with the > same arguments as old QEMU, plus the -incoming option. Guest RAM is > preserved in place, albeit with new virtual addresses in new QEMU. > > This mode requires a second migration channel, specified by the > cpr-uri migration property on the outgoing side, and by the cpr-uri > QEMU command-line option on the incoming side. The channel must > be a type, such as unix socket, that supports SCM_RIGHTS. > > Memory-backend objects must have the share=on attribute, but > memory-backend-epc is not supported. The VM must be started with > the '-machine anon-alloc=memfd' option, which allows anonymous > memory to be transferred in place to the new process. The memfds > are kept open by sending the descriptors to new QEMU via the > cpr-uri, which must support SCM_RIGHTS, and they are mmap'd > in new QEMU. > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com> > --- > include/migration/cpr.h | 1 + > migration/cpr.c | 34 +++++++++++++++++++---- > migration/migration.c | 69 +++++++++++++++++++++++++++++++++++++++++++++-- > migration/migration.h | 2 ++ > migration/ram.c | 2 ++ > migration/vmstate-types.c | 5 ++-- > qapi/migration.json | 27 ++++++++++++++++++- > stubs/vmstate.c | 7 +++++ > 8 files changed, 137 insertions(+), 10 deletions(-) > > diff --git a/include/migration/cpr.h b/include/migration/cpr.h > index e886c98..5cd373f 100644 > --- a/include/migration/cpr.h > +++ b/include/migration/cpr.h > @@ -30,6 +30,7 @@ int cpr_state_save(Error **errp); > int cpr_state_load(Error **errp); > void cpr_state_close(void); > struct QIOChannel *cpr_state_ioc(void); > +bool cpr_needed_for_reuse(void *opaque); > > QEMUFile *cpr_transfer_output(const char *uri, Error **errp); > QEMUFile *cpr_transfer_input(const char *uri, Error **errp); > diff --git a/migration/cpr.c b/migration/cpr.c > index 86f66c1..911b556 100644 > --- a/migration/cpr.c > +++ b/migration/cpr.c > @@ -9,6 +9,7 @@ > #include "qapi/error.h" > #include "migration/cpr.h" > #include "migration/misc.h" > +#include "migration/options.h" > #include "migration/qemu-file.h" > #include "migration/savevm.h" > #include "migration/vmstate.h" > @@ -57,7 +58,7 @@ static const VMStateDescription vmstate_cpr_fd = { > VMSTATE_UINT32(namelen, CprFd), > VMSTATE_VBUFFER_ALLOC_UINT32(name, CprFd, 0, NULL, namelen), > VMSTATE_INT32(id, CprFd), > - VMSTATE_INT32(fd, CprFd), > + VMSTATE_FD(fd, CprFd), > VMSTATE_END_OF_LIST() > } > }; > @@ -174,9 +175,16 @@ int cpr_state_save(Error **errp) > { > int ret; > QEMUFile *f; > + MigMode mode = migrate_mode(); > > - /* set f based on mode in a later patch in this series */ > - return 0; > + if (mode == MIG_MODE_CPR_TRANSFER) { > + f = cpr_transfer_output(migrate_cpr_uri(), errp); > + } else { > + return 0; > + } > + if (!f) { > + return -1; > + } > > qemu_put_be32(f, QEMU_CPR_FILE_MAGIC); > qemu_put_be32(f, QEMU_CPR_FILE_VERSION); > @@ -205,8 +213,18 @@ int cpr_state_load(Error **errp) > uint32_t v; > QEMUFile *f; > > - /* set f based on mode in a later patch in this series */ > - return 0; > + /* > + * Mode will be loaded in CPR state, so cannot use it to decide which > + * form of state to load. > + */ > + if (cpr_uri) { > + f = cpr_transfer_input(cpr_uri, errp); > + } else { > + return 0; > + } > + if (!f) { > + return -1; > + } > > v = qemu_get_be32(f); > if (v != QEMU_CPR_FILE_MAGIC) { > @@ -243,3 +261,9 @@ void cpr_state_close(void) > cpr_state_file = NULL; > } > } > + > +bool cpr_needed_for_reuse(void *opaque) > +{ > + MigMode mode = migrate_mode(); > + return mode == MIG_MODE_CPR_TRANSFER; > +} Drop it until used? > diff --git a/migration/migration.c b/migration/migration.c > index 3301583..73b85aa 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -76,6 +76,7 @@ > static NotifierWithReturnList migration_state_notifiers[] = { > NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL), > NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT), > + NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_TRANSFER), > }; > > /* Messages sent on the return path from destination to source */ > @@ -109,6 +110,7 @@ static int migration_maybe_pause(MigrationState *s, > static void migrate_fd_cancel(MigrationState *s); > static bool close_return_path_on_source(MigrationState *s); > static void migration_completion_end(MigrationState *s); > +static void migrate_hup_delete(MigrationState *s); > > static void migration_downtime_start(MigrationState *s) > { > @@ -204,6 +206,12 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, > return false; > } > > + if (migrate_mode() == MIG_MODE_CPR_TRANSFER && > + addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { > + error_setg(errp, "Migration requires streamable transport (eg unix)"); > + return false; > + } > + > return true; > } > > @@ -316,6 +324,7 @@ void migration_cancel(const Error *error) > qmp_cancel_vcpu_dirty_limit(false, -1, NULL); > } > migrate_fd_cancel(current_migration); > + migrate_hup_delete(current_migration); > } > > void migration_shutdown(void) > @@ -718,6 +727,9 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, > } else { > error_setg(errp, "unknown migration protocol: %s", uri); > } > + > + /* Close cpr socket to tell source that we are listening */ > + cpr_state_close(); Would it be possible to use some explicit reply message to mark this? So far looks like src QEMU will continue with qmp_migrate_finish() even if the cpr channel was closed due to error. I still didn't see how that kind of issue was captured below [1] (e.g., cpr channel broken after sending partial fds)? > } > > static void process_incoming_migration_bh(void *opaque) > @@ -1414,6 +1426,8 @@ static void migrate_fd_cleanup(MigrationState *s) > s->vmdesc = NULL; > > qemu_savevm_state_cleanup(); > + cpr_state_close(); > + migrate_hup_delete(s); > > close_return_path_on_source(s); > > @@ -1698,7 +1712,9 @@ bool migration_thread_is_self(void) > > bool migrate_mode_is_cpr(MigrationState *s) > { > - return s->parameters.mode == MIG_MODE_CPR_REBOOT; > + MigMode mode = s->parameters.mode; > + return mode == MIG_MODE_CPR_REBOOT || > + mode == MIG_MODE_CPR_TRANSFER; > } > > int migrate_init(MigrationState *s, Error **errp) > @@ -2033,6 +2049,12 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) > return false; > } > > + if (migrate_mode() == MIG_MODE_CPR_TRANSFER && > + !s->parameters.cpr_uri) { > + error_setg(errp, "cpr-transfer mode requires setting cpr-uri"); > + return false; > + } > + > if (migration_is_blocked(errp)) { > return false; > } > @@ -2076,6 +2098,37 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) > static void qmp_migrate_finish(MigrationAddress *addr, bool resume_requested, > Error **errp); > > +static void migrate_hup_add(MigrationState *s, QIOChannel *ioc, GSourceFunc cb, > + void *opaque) > +{ > + s->hup_source = qio_channel_create_watch(ioc, G_IO_HUP); > + g_source_set_callback(s->hup_source, cb, opaque, NULL); > + g_source_attach(s->hup_source, NULL); > +} > + > +static void migrate_hup_delete(MigrationState *s) > +{ > + if (s->hup_source) { > + g_source_destroy(s->hup_source); > + g_source_unref(s->hup_source); > + s->hup_source = NULL; > + } > +} > + > +static gboolean qmp_migrate_finish_cb(QIOChannel *channel, > + GIOCondition cond, > + void *opaque) > +{ > + MigrationAddress *addr = opaque; [1] > + > + qmp_migrate_finish(addr, false, NULL); > + > + cpr_state_close(); > + migrate_hup_delete(migrate_get_current()); > + qapi_free_MigrationAddress(addr); > + return G_SOURCE_REMOVE; > +} > + > void qmp_migrate(const char *uri, bool has_channels, > MigrationChannelList *channels, bool has_detach, bool detach, > bool has_resume, bool resume, Error **errp) > @@ -2136,7 +2189,19 @@ void qmp_migrate(const char *uri, bool has_channels, > goto out; > } > > - qmp_migrate_finish(addr, resume_requested, errp); > + /* > + * For cpr-transfer, the target may not be listening yet on the migration > + * channel, because first it must finish cpr_load_state. The target tells > + * us it is listening by closing the cpr-state socket. Wait for that HUP > + * event before connecting in qmp_migrate_finish. > + */ > + if (s->parameters.mode == MIG_MODE_CPR_TRANSFER) { > + migrate_hup_add(s, cpr_state_ioc(), (GSourceFunc)qmp_migrate_finish_cb, > + QAPI_CLONE(MigrationAddress, addr)); > + > + } else { > + qmp_migrate_finish(addr, resume_requested, errp); > + } > > out: > if (local_err) { > diff --git a/migration/migration.h b/migration/migration.h > index 38aa140..74c167b 100644 > --- a/migration/migration.h > +++ b/migration/migration.h > @@ -457,6 +457,8 @@ struct MigrationState { > bool switchover_acked; > /* Is this a rdma migration */ > bool rdma_migration; > + > + GSource *hup_source; > }; > > void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, > diff --git a/migration/ram.c b/migration/ram.c > index 81eda27..e2cef50 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -216,7 +216,9 @@ static bool postcopy_preempt_active(void) > > bool migrate_ram_is_ignored(RAMBlock *block) > { > + MigMode mode = migrate_mode(); > return !qemu_ram_is_migratable(block) || > + mode == MIG_MODE_CPR_TRANSFER || > (migrate_ignore_shared() && qemu_ram_is_shared(block) > && qemu_ram_is_named_file(block)); > } > diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c > index 6e45a4a..b5a55b8 100644 > --- a/migration/vmstate-types.c > +++ b/migration/vmstate-types.c > @@ -15,6 +15,7 @@ > #include "qemu-file.h" > #include "migration.h" > #include "migration/vmstate.h" > +#include "migration/client-options.h" > #include "qemu/error-report.h" > #include "qemu/queue.h" > #include "trace.h" > @@ -321,7 +322,7 @@ static int get_fd(QEMUFile *f, void *pv, size_t size, > { > int32_t *v = pv; > qemu_get_sbe32s(f, v); > - if (*v < 0) { > + if (*v < 0 || migrate_mode() != MIG_MODE_CPR_TRANSFER) { > return 0; > } > *v = qemu_file_get_fd(f); > @@ -334,7 +335,7 @@ static int put_fd(QEMUFile *f, void *pv, size_t size, > int32_t *v = pv; > > qemu_put_sbe32s(f, v); > - if (*v < 0) { > + if (*v < 0 || migrate_mode() != MIG_MODE_CPR_TRANSFER) { So I suppose you wanted to guard VMSTATE_FD being abused. Then I wonder whether it'll help more by adding a comment above VMSTATE_FD instead; it'll be more straightforward to me. And if you want to fail hard, assert should work better too in runtime, or the "return 0" can be pretty hard to notice. > return 0; > } > return qemu_file_put_fd(f, *v); > diff --git a/qapi/migration.json b/qapi/migration.json > index c0d8bcc..f51b4cb 100644 > --- a/qapi/migration.json > +++ b/qapi/migration.json > @@ -611,9 +611,34 @@ > # or COLO. > # > # (since 8.2) > +# > +# @cpr-transfer: This mode allows the user to transfer a guest to a > +# new QEMU instance on the same host with minimal guest pause > +# time, by preserving guest RAM in place, albeit with new virtual > +# addresses in new QEMU. > +# > +# The user starts new QEMU on the same host as old QEMU, with the > +# the same arguments as old QEMU, plus the -incoming option. The > +# user issues the migrate command to old QEMU, which stops the VM, > +# saves state to the migration channels, and enters the > +# postmigrate state. Execution resumes in new QEMU. Guest RAM is > +# preserved in place, albeit with new virtual addresses in new > +# QEMU. The incoming migration channel cannot be a file type. > +# > +# This mode requires a second migration channel, specified by the > +# cpr-uri migration property on the outgoing side, and by > +# the cpr-uri QEMU command-line option on the incoming > +# side. The channel must be a type, such as unix socket, that > +# supports SCM_RIGHTS. > +# > +# Memory-backend objects must have the share=on attribute, but > +# memory-backend-epc is not supported. The VM must be started > +# with the '-machine anon-alloc=memfd' option. > +# > +# (since 9.2) > ## > { 'enum': 'MigMode', > - 'data': [ 'normal', 'cpr-reboot' ] } > + 'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] } No need to rush, but please add the CPR.rst and unit test updates when you feel confident on the protocol. It looks pretty good to me now. Especially it'll be nice to describe the separate cpr-channel protocol in the new doc page. Thanks, > > ## > # @ZeroPageDetection: > diff --git a/stubs/vmstate.c b/stubs/vmstate.c > index 8513d92..c190762 100644 > --- a/stubs/vmstate.c > +++ b/stubs/vmstate.c > @@ -1,5 +1,7 @@ > #include "qemu/osdep.h" > #include "migration/vmstate.h" > +#include "qapi/qapi-types-migration.h" > +#include "migration/client-options.h" > > int vmstate_register_with_alias_id(VMStateIf *obj, > uint32_t instance_id, > @@ -21,3 +23,8 @@ bool vmstate_check_only_migratable(const VMStateDescription *vmsd) > { > return true; > } > + > +MigMode migrate_mode(void) > +{ > + return MIG_MODE_NORMAL; > +} > -- > 1.8.3.1 >
On 10/7/2024 3:44 PM, Peter Xu wrote: > On Mon, Sep 30, 2024 at 12:40:44PM -0700, Steve Sistare wrote: >> Add the cpr-transfer migration mode. Usage: >> qemu-system-$arch -machine anon-alloc=memfd ... >> >> start new QEMU with "-incoming <uri-1> -cpr-uri <uri-2>" >> >> Issue commands to old QEMU: >> migrate_set_parameter mode cpr-transfer >> migrate_set_parameter cpr-uri <uri-2> >> migrate -d <uri-1> >> >> The migrate command stops the VM, saves CPR state to uri-2, saves >> normal migration state to uri-1, and old QEMU enters the postmigrate >> state. The user starts new QEMU on the same host as old QEMU, with the >> same arguments as old QEMU, plus the -incoming option. Guest RAM is >> preserved in place, albeit with new virtual addresses in new QEMU. >> >> This mode requires a second migration channel, specified by the >> cpr-uri migration property on the outgoing side, and by the cpr-uri >> QEMU command-line option on the incoming side. The channel must >> be a type, such as unix socket, that supports SCM_RIGHTS. >> >> Memory-backend objects must have the share=on attribute, but >> memory-backend-epc is not supported. The VM must be started with >> the '-machine anon-alloc=memfd' option, which allows anonymous >> memory to be transferred in place to the new process. The memfds >> are kept open by sending the descriptors to new QEMU via the >> cpr-uri, which must support SCM_RIGHTS, and they are mmap'd >> in new QEMU. >> >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> >> --- >> include/migration/cpr.h | 1 + >> migration/cpr.c | 34 +++++++++++++++++++---- >> migration/migration.c | 69 +++++++++++++++++++++++++++++++++++++++++++++-- >> migration/migration.h | 2 ++ >> migration/ram.c | 2 ++ >> migration/vmstate-types.c | 5 ++-- >> qapi/migration.json | 27 ++++++++++++++++++- >> stubs/vmstate.c | 7 +++++ >> 8 files changed, 137 insertions(+), 10 deletions(-) >> >> diff --git a/include/migration/cpr.h b/include/migration/cpr.h >> index e886c98..5cd373f 100644 >> --- a/include/migration/cpr.h >> +++ b/include/migration/cpr.h >> @@ -30,6 +30,7 @@ int cpr_state_save(Error **errp); >> int cpr_state_load(Error **errp); >> void cpr_state_close(void); >> struct QIOChannel *cpr_state_ioc(void); >> +bool cpr_needed_for_reuse(void *opaque); >> >> QEMUFile *cpr_transfer_output(const char *uri, Error **errp); >> QEMUFile *cpr_transfer_input(const char *uri, Error **errp); >> diff --git a/migration/cpr.c b/migration/cpr.c >> index 86f66c1..911b556 100644 >> --- a/migration/cpr.c >> +++ b/migration/cpr.c >> @@ -9,6 +9,7 @@ >> #include "qapi/error.h" >> #include "migration/cpr.h" >> #include "migration/misc.h" >> +#include "migration/options.h" >> #include "migration/qemu-file.h" >> #include "migration/savevm.h" >> #include "migration/vmstate.h" >> @@ -57,7 +58,7 @@ static const VMStateDescription vmstate_cpr_fd = { >> VMSTATE_UINT32(namelen, CprFd), >> VMSTATE_VBUFFER_ALLOC_UINT32(name, CprFd, 0, NULL, namelen), >> VMSTATE_INT32(id, CprFd), >> - VMSTATE_INT32(fd, CprFd), >> + VMSTATE_FD(fd, CprFd), >> VMSTATE_END_OF_LIST() >> } >> }; >> @@ -174,9 +175,16 @@ int cpr_state_save(Error **errp) >> { >> int ret; >> QEMUFile *f; >> + MigMode mode = migrate_mode(); >> >> - /* set f based on mode in a later patch in this series */ >> - return 0; >> + if (mode == MIG_MODE_CPR_TRANSFER) { >> + f = cpr_transfer_output(migrate_cpr_uri(), errp); >> + } else { >> + return 0; >> + } >> + if (!f) { >> + return -1; >> + } >> >> qemu_put_be32(f, QEMU_CPR_FILE_MAGIC); >> qemu_put_be32(f, QEMU_CPR_FILE_VERSION); >> @@ -205,8 +213,18 @@ int cpr_state_load(Error **errp) >> uint32_t v; >> QEMUFile *f; >> >> - /* set f based on mode in a later patch in this series */ >> - return 0; >> + /* >> + * Mode will be loaded in CPR state, so cannot use it to decide which >> + * form of state to load. >> + */ >> + if (cpr_uri) { >> + f = cpr_transfer_input(cpr_uri, errp); >> + } else { >> + return 0; >> + } >> + if (!f) { >> + return -1; >> + } >> >> v = qemu_get_be32(f); >> if (v != QEMU_CPR_FILE_MAGIC) { >> @@ -243,3 +261,9 @@ void cpr_state_close(void) >> cpr_state_file = NULL; >> } >> } >> + >> +bool cpr_needed_for_reuse(void *opaque) >> +{ >> + MigMode mode = migrate_mode(); >> + return mode == MIG_MODE_CPR_TRANSFER; >> +} > > Drop it until used? Maybe, but here is my reason for including it here. These common functions like cpr_needed_for_reuse and cpr_resave_fd are needed by multiple follow-on series: vfio, tap, iommufd. To send those for comment, as I have beem, I need to prepend a patch for cpr_needed_for_reuse to each of those series, which is redundant. It makes more sense IMO to include them in this initial series. But, it's your call. >> diff --git a/migration/migration.c b/migration/migration.c >> index 3301583..73b85aa 100644 >> --- a/migration/migration.c >> +++ b/migration/migration.c >> @@ -76,6 +76,7 @@ >> static NotifierWithReturnList migration_state_notifiers[] = { >> NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL), >> NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT), >> + NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_TRANSFER), >> }; >> >> /* Messages sent on the return path from destination to source */ >> @@ -109,6 +110,7 @@ static int migration_maybe_pause(MigrationState *s, >> static void migrate_fd_cancel(MigrationState *s); >> static bool close_return_path_on_source(MigrationState *s); >> static void migration_completion_end(MigrationState *s); >> +static void migrate_hup_delete(MigrationState *s); >> >> static void migration_downtime_start(MigrationState *s) >> { >> @@ -204,6 +206,12 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, >> return false; >> } >> >> + if (migrate_mode() == MIG_MODE_CPR_TRANSFER && >> + addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { >> + error_setg(errp, "Migration requires streamable transport (eg unix)"); >> + return false; >> + } >> + >> return true; >> } >> >> @@ -316,6 +324,7 @@ void migration_cancel(const Error *error) >> qmp_cancel_vcpu_dirty_limit(false, -1, NULL); >> } >> migrate_fd_cancel(current_migration); >> + migrate_hup_delete(current_migration); >> } >> >> void migration_shutdown(void) >> @@ -718,6 +727,9 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, >> } else { >> error_setg(errp, "unknown migration protocol: %s", uri); >> } >> + >> + /* Close cpr socket to tell source that we are listening */ >> + cpr_state_close(); > > Would it be possible to use some explicit reply message to mark this? In theory yes, but I fear that using a return channel with message parsing and dispatch adds more code than it is worth. > So > far looks like src QEMU will continue with qmp_migrate_finish() even if the > cpr channel was closed due to error. Yes, but we recover just fine. The target hits some error, fails to read all the cpr state, closes the channel prematurely, and does *not* create a listen socket for the normal migration channel. Hence qmp_migrate_finish fails to connect to the normal channel, and recovers. > I still didn't see how that kind of issue was captured below [1] (e.g., cpr > channel broken after sending partial fds)? Same as above. >> } >> >> static void process_incoming_migration_bh(void *opaque) >> @@ -1414,6 +1426,8 @@ static void migrate_fd_cleanup(MigrationState *s) >> s->vmdesc = NULL; >> >> qemu_savevm_state_cleanup(); >> + cpr_state_close(); >> + migrate_hup_delete(s); >> >> close_return_path_on_source(s); >> >> @@ -1698,7 +1712,9 @@ bool migration_thread_is_self(void) >> >> bool migrate_mode_is_cpr(MigrationState *s) >> { >> - return s->parameters.mode == MIG_MODE_CPR_REBOOT; >> + MigMode mode = s->parameters.mode; >> + return mode == MIG_MODE_CPR_REBOOT || >> + mode == MIG_MODE_CPR_TRANSFER; >> } >> >> int migrate_init(MigrationState *s, Error **errp) >> @@ -2033,6 +2049,12 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) >> return false; >> } >> >> + if (migrate_mode() == MIG_MODE_CPR_TRANSFER && >> + !s->parameters.cpr_uri) { >> + error_setg(errp, "cpr-transfer mode requires setting cpr-uri"); >> + return false; >> + } >> + >> if (migration_is_blocked(errp)) { >> return false; >> } >> @@ -2076,6 +2098,37 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) >> static void qmp_migrate_finish(MigrationAddress *addr, bool resume_requested, >> Error **errp); >> >> +static void migrate_hup_add(MigrationState *s, QIOChannel *ioc, GSourceFunc cb, >> + void *opaque) >> +{ >> + s->hup_source = qio_channel_create_watch(ioc, G_IO_HUP); >> + g_source_set_callback(s->hup_source, cb, opaque, NULL); >> + g_source_attach(s->hup_source, NULL); >> +} >> + >> +static void migrate_hup_delete(MigrationState *s) >> +{ >> + if (s->hup_source) { >> + g_source_destroy(s->hup_source); >> + g_source_unref(s->hup_source); >> + s->hup_source = NULL; >> + } >> +} >> + >> +static gboolean qmp_migrate_finish_cb(QIOChannel *channel, >> + GIOCondition cond, >> + void *opaque) >> +{ >> + MigrationAddress *addr = opaque; > > [1] > >> + >> + qmp_migrate_finish(addr, false, NULL); >> + >> + cpr_state_close(); >> + migrate_hup_delete(migrate_get_current()); >> + qapi_free_MigrationAddress(addr); >> + return G_SOURCE_REMOVE; >> +} >> + >> void qmp_migrate(const char *uri, bool has_channels, >> MigrationChannelList *channels, bool has_detach, bool detach, >> bool has_resume, bool resume, Error **errp) >> @@ -2136,7 +2189,19 @@ void qmp_migrate(const char *uri, bool has_channels, >> goto out; >> } >> >> - qmp_migrate_finish(addr, resume_requested, errp); >> + /* >> + * For cpr-transfer, the target may not be listening yet on the migration >> + * channel, because first it must finish cpr_load_state. The target tells >> + * us it is listening by closing the cpr-state socket. Wait for that HUP >> + * event before connecting in qmp_migrate_finish. >> + */ >> + if (s->parameters.mode == MIG_MODE_CPR_TRANSFER) { >> + migrate_hup_add(s, cpr_state_ioc(), (GSourceFunc)qmp_migrate_finish_cb, >> + QAPI_CLONE(MigrationAddress, addr)); >> + >> + } else { >> + qmp_migrate_finish(addr, resume_requested, errp); >> + } >> >> out: >> if (local_err) { >> diff --git a/migration/migration.h b/migration/migration.h >> index 38aa140..74c167b 100644 >> --- a/migration/migration.h >> +++ b/migration/migration.h >> @@ -457,6 +457,8 @@ struct MigrationState { >> bool switchover_acked; >> /* Is this a rdma migration */ >> bool rdma_migration; >> + >> + GSource *hup_source; >> }; >> >> void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, >> diff --git a/migration/ram.c b/migration/ram.c >> index 81eda27..e2cef50 100644 >> --- a/migration/ram.c >> +++ b/migration/ram.c >> @@ -216,7 +216,9 @@ static bool postcopy_preempt_active(void) >> >> bool migrate_ram_is_ignored(RAMBlock *block) >> { >> + MigMode mode = migrate_mode(); >> return !qemu_ram_is_migratable(block) || >> + mode == MIG_MODE_CPR_TRANSFER || >> (migrate_ignore_shared() && qemu_ram_is_shared(block) >> && qemu_ram_is_named_file(block)); >> } >> diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c >> index 6e45a4a..b5a55b8 100644 >> --- a/migration/vmstate-types.c >> +++ b/migration/vmstate-types.c >> @@ -15,6 +15,7 @@ >> #include "qemu-file.h" >> #include "migration.h" >> #include "migration/vmstate.h" >> +#include "migration/client-options.h" >> #include "qemu/error-report.h" >> #include "qemu/queue.h" >> #include "trace.h" >> @@ -321,7 +322,7 @@ static int get_fd(QEMUFile *f, void *pv, size_t size, >> { >> int32_t *v = pv; >> qemu_get_sbe32s(f, v); >> - if (*v < 0) { >> + if (*v < 0 || migrate_mode() != MIG_MODE_CPR_TRANSFER) { >> return 0; >> } >> *v = qemu_file_get_fd(f); >> @@ -334,7 +335,7 @@ static int put_fd(QEMUFile *f, void *pv, size_t size, >> int32_t *v = pv; >> >> qemu_put_sbe32s(f, v); >> - if (*v < 0) { >> + if (*v < 0 || migrate_mode() != MIG_MODE_CPR_TRANSFER) { > > So I suppose you wanted to guard VMSTATE_FD being abused. Then I wonder > whether it'll help more by adding a comment above VMSTATE_FD instead; it'll > be more straightforward to me. > > And if you want to fail hard, assert should work better too in runtime, or > the "return 0" can be pretty hard to notice. No, this code is not about detecting abuse or errors. It is there to skip the qemu_file_put_fd for cpr-exec mode. In my next version this function will simply be: static int put_fd(QEMUFile *f, void *pv, size_t size, const VMStateField *field, JSONWriter *vmdesc) { int32_t *v = pv; return qemu_file_put_fd(f, *v); } >> return 0; >> } >> return qemu_file_put_fd(f, *v); >> diff --git a/qapi/migration.json b/qapi/migration.json >> index c0d8bcc..f51b4cb 100644 >> --- a/qapi/migration.json >> +++ b/qapi/migration.json >> @@ -611,9 +611,34 @@ >> # or COLO. >> # >> # (since 8.2) >> +# >> +# @cpr-transfer: This mode allows the user to transfer a guest to a >> +# new QEMU instance on the same host with minimal guest pause >> +# time, by preserving guest RAM in place, albeit with new virtual >> +# addresses in new QEMU. >> +# >> +# The user starts new QEMU on the same host as old QEMU, with the >> +# the same arguments as old QEMU, plus the -incoming option. The >> +# user issues the migrate command to old QEMU, which stops the VM, >> +# saves state to the migration channels, and enters the >> +# postmigrate state. Execution resumes in new QEMU. Guest RAM is >> +# preserved in place, albeit with new virtual addresses in new >> +# QEMU. The incoming migration channel cannot be a file type. >> +# >> +# This mode requires a second migration channel, specified by the >> +# cpr-uri migration property on the outgoing side, and by >> +# the cpr-uri QEMU command-line option on the incoming >> +# side. The channel must be a type, such as unix socket, that >> +# supports SCM_RIGHTS. >> +# >> +# Memory-backend objects must have the share=on attribute, but >> +# memory-backend-epc is not supported. The VM must be started >> +# with the '-machine anon-alloc=memfd' option. >> +# >> +# (since 9.2) >> ## >> { 'enum': 'MigMode', >> - 'data': [ 'normal', 'cpr-reboot' ] } >> + 'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] } > > No need to rush, but please add the CPR.rst and unit test updates when you > feel confident on the protocol. It looks pretty good to me now. > > Especially it'll be nice to describe the separate cpr-channel protocol in > the new doc page. Will do, now that there is light at the end of the tunnel. - Steve >> ## >> # @ZeroPageDetection: >> diff --git a/stubs/vmstate.c b/stubs/vmstate.c >> index 8513d92..c190762 100644 >> --- a/stubs/vmstate.c >> +++ b/stubs/vmstate.c >> @@ -1,5 +1,7 @@ >> #include "qemu/osdep.h" >> #include "migration/vmstate.h" >> +#include "qapi/qapi-types-migration.h" >> +#include "migration/client-options.h" >> >> int vmstate_register_with_alias_id(VMStateIf *obj, >> uint32_t instance_id, >> @@ -21,3 +23,8 @@ bool vmstate_check_only_migratable(const VMStateDescription *vmsd) >> { >> return true; >> } >> + >> +MigMode migrate_mode(void) >> +{ >> + return MIG_MODE_NORMAL; >> +} >> -- >> 1.8.3.1 >> >
On Mon, Oct 07, 2024 at 04:39:25PM -0400, Steven Sistare wrote: > On 10/7/2024 3:44 PM, Peter Xu wrote: > > On Mon, Sep 30, 2024 at 12:40:44PM -0700, Steve Sistare wrote: > > > Add the cpr-transfer migration mode. Usage: > > > qemu-system-$arch -machine anon-alloc=memfd ... > > > > > > start new QEMU with "-incoming <uri-1> -cpr-uri <uri-2>" > > > > > > Issue commands to old QEMU: > > > migrate_set_parameter mode cpr-transfer > > > migrate_set_parameter cpr-uri <uri-2> > > > migrate -d <uri-1> > > > > > > The migrate command stops the VM, saves CPR state to uri-2, saves > > > normal migration state to uri-1, and old QEMU enters the postmigrate > > > state. The user starts new QEMU on the same host as old QEMU, with the > > > same arguments as old QEMU, plus the -incoming option. Guest RAM is > > > preserved in place, albeit with new virtual addresses in new QEMU. > > > > > > This mode requires a second migration channel, specified by the > > > cpr-uri migration property on the outgoing side, and by the cpr-uri > > > QEMU command-line option on the incoming side. The channel must > > > be a type, such as unix socket, that supports SCM_RIGHTS. > > > > > > Memory-backend objects must have the share=on attribute, but > > > memory-backend-epc is not supported. The VM must be started with > > > the '-machine anon-alloc=memfd' option, which allows anonymous > > > memory to be transferred in place to the new process. The memfds > > > are kept open by sending the descriptors to new QEMU via the > > > cpr-uri, which must support SCM_RIGHTS, and they are mmap'd > > > in new QEMU. > > > > > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com> > > > --- > > > include/migration/cpr.h | 1 + > > > migration/cpr.c | 34 +++++++++++++++++++---- > > > migration/migration.c | 69 +++++++++++++++++++++++++++++++++++++++++++++-- > > > migration/migration.h | 2 ++ > > > migration/ram.c | 2 ++ > > > migration/vmstate-types.c | 5 ++-- > > > qapi/migration.json | 27 ++++++++++++++++++- > > > stubs/vmstate.c | 7 +++++ > > > 8 files changed, 137 insertions(+), 10 deletions(-) > > > > > > diff --git a/include/migration/cpr.h b/include/migration/cpr.h > > > index e886c98..5cd373f 100644 > > > --- a/include/migration/cpr.h > > > +++ b/include/migration/cpr.h > > > @@ -30,6 +30,7 @@ int cpr_state_save(Error **errp); > > > int cpr_state_load(Error **errp); > > > void cpr_state_close(void); > > > struct QIOChannel *cpr_state_ioc(void); > > > +bool cpr_needed_for_reuse(void *opaque); > > > QEMUFile *cpr_transfer_output(const char *uri, Error **errp); > > > QEMUFile *cpr_transfer_input(const char *uri, Error **errp); > > > diff --git a/migration/cpr.c b/migration/cpr.c > > > index 86f66c1..911b556 100644 > > > --- a/migration/cpr.c > > > +++ b/migration/cpr.c > > > @@ -9,6 +9,7 @@ > > > #include "qapi/error.h" > > > #include "migration/cpr.h" > > > #include "migration/misc.h" > > > +#include "migration/options.h" > > > #include "migration/qemu-file.h" > > > #include "migration/savevm.h" > > > #include "migration/vmstate.h" > > > @@ -57,7 +58,7 @@ static const VMStateDescription vmstate_cpr_fd = { > > > VMSTATE_UINT32(namelen, CprFd), > > > VMSTATE_VBUFFER_ALLOC_UINT32(name, CprFd, 0, NULL, namelen), > > > VMSTATE_INT32(id, CprFd), > > > - VMSTATE_INT32(fd, CprFd), > > > + VMSTATE_FD(fd, CprFd), > > > VMSTATE_END_OF_LIST() > > > } > > > }; > > > @@ -174,9 +175,16 @@ int cpr_state_save(Error **errp) > > > { > > > int ret; > > > QEMUFile *f; > > > + MigMode mode = migrate_mode(); > > > - /* set f based on mode in a later patch in this series */ > > > - return 0; > > > + if (mode == MIG_MODE_CPR_TRANSFER) { > > > + f = cpr_transfer_output(migrate_cpr_uri(), errp); > > > + } else { > > > + return 0; > > > + } > > > + if (!f) { > > > + return -1; > > > + } > > > qemu_put_be32(f, QEMU_CPR_FILE_MAGIC); > > > qemu_put_be32(f, QEMU_CPR_FILE_VERSION); > > > @@ -205,8 +213,18 @@ int cpr_state_load(Error **errp) > > > uint32_t v; > > > QEMUFile *f; > > > - /* set f based on mode in a later patch in this series */ > > > - return 0; > > > + /* > > > + * Mode will be loaded in CPR state, so cannot use it to decide which > > > + * form of state to load. > > > + */ > > > + if (cpr_uri) { > > > + f = cpr_transfer_input(cpr_uri, errp); > > > + } else { > > > + return 0; > > > + } > > > + if (!f) { > > > + return -1; > > > + } > > > v = qemu_get_be32(f); > > > if (v != QEMU_CPR_FILE_MAGIC) { > > > @@ -243,3 +261,9 @@ void cpr_state_close(void) > > > cpr_state_file = NULL; > > > } > > > } > > > + > > > +bool cpr_needed_for_reuse(void *opaque) > > > +{ > > > + MigMode mode = migrate_mode(); > > > + return mode == MIG_MODE_CPR_TRANSFER; > > > +} > > > > Drop it until used? > > Maybe, but here is my reason for including it here. > > These common functions like cpr_needed_for_reuse and cpr_resave_fd are needed > by multiple follow-on series: vfio, tap, iommufd. To send those for comment, > as I have beem, I need to prepend a patch for cpr_needed_for_reuse to each of > those series, which is redundant. It makes more sense IMO to include them in > this initial series. > > But, it's your call. Hmm, logically we shouldn't keep any dead code in QEMU, but indeed this is slightly special. Would you mind keeping all these helpers in a separate patch after the base patches? The commit message should describe what future projects will start to use it, then whoever noticed later (I at least know Dave has quite a few patches recently removing dead code in QEMU) will know that's potentially to-be-used code, so should keep them around. > > > > diff --git a/migration/migration.c b/migration/migration.c > > > index 3301583..73b85aa 100644 > > > --- a/migration/migration.c > > > +++ b/migration/migration.c > > > @@ -76,6 +76,7 @@ > > > static NotifierWithReturnList migration_state_notifiers[] = { > > > NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL), > > > NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT), > > > + NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_TRANSFER), > > > }; > > > /* Messages sent on the return path from destination to source */ > > > @@ -109,6 +110,7 @@ static int migration_maybe_pause(MigrationState *s, > > > static void migrate_fd_cancel(MigrationState *s); > > > static bool close_return_path_on_source(MigrationState *s); > > > static void migration_completion_end(MigrationState *s); > > > +static void migrate_hup_delete(MigrationState *s); > > > static void migration_downtime_start(MigrationState *s) > > > { > > > @@ -204,6 +206,12 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, > > > return false; > > > } > > > + if (migrate_mode() == MIG_MODE_CPR_TRANSFER && > > > + addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { > > > + error_setg(errp, "Migration requires streamable transport (eg unix)"); > > > + return false; > > > + } > > > + > > > return true; > > > } > > > @@ -316,6 +324,7 @@ void migration_cancel(const Error *error) > > > qmp_cancel_vcpu_dirty_limit(false, -1, NULL); > > > } > > > migrate_fd_cancel(current_migration); > > > + migrate_hup_delete(current_migration); > > > } > > > void migration_shutdown(void) > > > @@ -718,6 +727,9 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, > > > } else { > > > error_setg(errp, "unknown migration protocol: %s", uri); > > > } > > > + > > > + /* Close cpr socket to tell source that we are listening */ > > > + cpr_state_close(); > > > > Would it be possible to use some explicit reply message to mark this? > > In theory yes, but I fear that using a return channel with message parsing and > dispatch adds more code than it is worth. > > > So > > far looks like src QEMU will continue with qmp_migrate_finish() even if the > > cpr channel was closed due to error. > > Yes, but we recover just fine. The target hits some error, fails to read all the > cpr state, closes the channel prematurely, and does *not* create a listen socket > for the normal migration channel. Hence qmp_migrate_finish fails to connect to the > normal channel, and recovers. This is slightly tricky part and would be nice to be documented somewhere, perhaps starting from in the commit message. Then the error will say "failed to connect to destination QEMU" hiding the real failure (cpr save/load failed), right? That's slightly a pity. I'm OK with the HUP as of now, but if you care about accurate CPR-stage error reporting, then feel free to draft something else in the next post. > > > I still didn't see how that kind of issue was captured below [1] (e.g., cpr > > channel broken after sending partial fds)? > > Same as above. > > > > } > > > static void process_incoming_migration_bh(void *opaque) > > > @@ -1414,6 +1426,8 @@ static void migrate_fd_cleanup(MigrationState *s) > > > s->vmdesc = NULL; > > > qemu_savevm_state_cleanup(); > > > + cpr_state_close(); > > > + migrate_hup_delete(s); > > > close_return_path_on_source(s); > > > @@ -1698,7 +1712,9 @@ bool migration_thread_is_self(void) > > > bool migrate_mode_is_cpr(MigrationState *s) > > > { > > > - return s->parameters.mode == MIG_MODE_CPR_REBOOT; > > > + MigMode mode = s->parameters.mode; > > > + return mode == MIG_MODE_CPR_REBOOT || > > > + mode == MIG_MODE_CPR_TRANSFER; > > > } > > > int migrate_init(MigrationState *s, Error **errp) > > > @@ -2033,6 +2049,12 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) > > > return false; > > > } > > > + if (migrate_mode() == MIG_MODE_CPR_TRANSFER && > > > + !s->parameters.cpr_uri) { > > > + error_setg(errp, "cpr-transfer mode requires setting cpr-uri"); > > > + return false; > > > + } > > > + > > > if (migration_is_blocked(errp)) { > > > return false; > > > } > > > @@ -2076,6 +2098,37 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) > > > static void qmp_migrate_finish(MigrationAddress *addr, bool resume_requested, > > > Error **errp); > > > +static void migrate_hup_add(MigrationState *s, QIOChannel *ioc, GSourceFunc cb, > > > + void *opaque) > > > +{ > > > + s->hup_source = qio_channel_create_watch(ioc, G_IO_HUP); > > > + g_source_set_callback(s->hup_source, cb, opaque, NULL); > > > + g_source_attach(s->hup_source, NULL); > > > +} > > > + > > > +static void migrate_hup_delete(MigrationState *s) > > > +{ > > > + if (s->hup_source) { > > > + g_source_destroy(s->hup_source); > > > + g_source_unref(s->hup_source); > > > + s->hup_source = NULL; > > > + } > > > +} > > > + > > > +static gboolean qmp_migrate_finish_cb(QIOChannel *channel, > > > + GIOCondition cond, > > > + void *opaque) > > > +{ > > > + MigrationAddress *addr = opaque; > > > > [1] > > > > > + > > > + qmp_migrate_finish(addr, false, NULL); > > > + > > > + cpr_state_close(); > > > + migrate_hup_delete(migrate_get_current()); > > > + qapi_free_MigrationAddress(addr); > > > + return G_SOURCE_REMOVE; > > > +} > > > + > > > void qmp_migrate(const char *uri, bool has_channels, > > > MigrationChannelList *channels, bool has_detach, bool detach, > > > bool has_resume, bool resume, Error **errp) > > > @@ -2136,7 +2189,19 @@ void qmp_migrate(const char *uri, bool has_channels, > > > goto out; > > > } > > > - qmp_migrate_finish(addr, resume_requested, errp); > > > + /* > > > + * For cpr-transfer, the target may not be listening yet on the migration > > > + * channel, because first it must finish cpr_load_state. The target tells > > > + * us it is listening by closing the cpr-state socket. Wait for that HUP > > > + * event before connecting in qmp_migrate_finish. > > > + */ > > > + if (s->parameters.mode == MIG_MODE_CPR_TRANSFER) { > > > + migrate_hup_add(s, cpr_state_ioc(), (GSourceFunc)qmp_migrate_finish_cb, > > > + QAPI_CLONE(MigrationAddress, addr)); > > > + > > > + } else { > > > + qmp_migrate_finish(addr, resume_requested, errp); > > > + } > > > out: > > > if (local_err) { > > > diff --git a/migration/migration.h b/migration/migration.h > > > index 38aa140..74c167b 100644 > > > --- a/migration/migration.h > > > +++ b/migration/migration.h > > > @@ -457,6 +457,8 @@ struct MigrationState { > > > bool switchover_acked; > > > /* Is this a rdma migration */ > > > bool rdma_migration; > > > + > > > + GSource *hup_source; > > > }; > > > void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, > > > diff --git a/migration/ram.c b/migration/ram.c > > > index 81eda27..e2cef50 100644 > > > --- a/migration/ram.c > > > +++ b/migration/ram.c > > > @@ -216,7 +216,9 @@ static bool postcopy_preempt_active(void) > > > bool migrate_ram_is_ignored(RAMBlock *block) > > > { > > > + MigMode mode = migrate_mode(); > > > return !qemu_ram_is_migratable(block) || > > > + mode == MIG_MODE_CPR_TRANSFER || > > > (migrate_ignore_shared() && qemu_ram_is_shared(block) > > > && qemu_ram_is_named_file(block)); > > > } > > > diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c > > > index 6e45a4a..b5a55b8 100644 > > > --- a/migration/vmstate-types.c > > > +++ b/migration/vmstate-types.c > > > @@ -15,6 +15,7 @@ > > > #include "qemu-file.h" > > > #include "migration.h" > > > #include "migration/vmstate.h" > > > +#include "migration/client-options.h" > > > #include "qemu/error-report.h" > > > #include "qemu/queue.h" > > > #include "trace.h" > > > @@ -321,7 +322,7 @@ static int get_fd(QEMUFile *f, void *pv, size_t size, > > > { > > > int32_t *v = pv; > > > qemu_get_sbe32s(f, v); > > > - if (*v < 0) { > > > + if (*v < 0 || migrate_mode() != MIG_MODE_CPR_TRANSFER) { > > > return 0; > > > } > > > *v = qemu_file_get_fd(f); > > > @@ -334,7 +335,7 @@ static int put_fd(QEMUFile *f, void *pv, size_t size, > > > int32_t *v = pv; > > > qemu_put_sbe32s(f, v); > > > - if (*v < 0) { > > > + if (*v < 0 || migrate_mode() != MIG_MODE_CPR_TRANSFER) { > > > > So I suppose you wanted to guard VMSTATE_FD being abused. Then I wonder > > whether it'll help more by adding a comment above VMSTATE_FD instead; it'll > > be more straightforward to me. > > > > And if you want to fail hard, assert should work better too in runtime, or > > the "return 0" can be pretty hard to notice. > > No, this code is not about detecting abuse or errors. It is there to skip > the qemu_file_put_fd for cpr-exec mode. In my next version this function will > simply be: > > static int put_fd(QEMUFile *f, void *pv, size_t size, > const VMStateField *field, JSONWriter *vmdesc) > { > int32_t *v = pv; > return qemu_file_put_fd(f, *v); > } Great, thanks. > > > > return 0; > > > } > > > return qemu_file_put_fd(f, *v); > > > diff --git a/qapi/migration.json b/qapi/migration.json > > > index c0d8bcc..f51b4cb 100644 > > > --- a/qapi/migration.json > > > +++ b/qapi/migration.json > > > @@ -611,9 +611,34 @@ > > > # or COLO. > > > # > > > # (since 8.2) > > > +# > > > +# @cpr-transfer: This mode allows the user to transfer a guest to a > > > +# new QEMU instance on the same host with minimal guest pause > > > +# time, by preserving guest RAM in place, albeit with new virtual > > > +# addresses in new QEMU. > > > +# > > > +# The user starts new QEMU on the same host as old QEMU, with the > > > +# the same arguments as old QEMU, plus the -incoming option. The > > > +# user issues the migrate command to old QEMU, which stops the VM, > > > +# saves state to the migration channels, and enters the > > > +# postmigrate state. Execution resumes in new QEMU. Guest RAM is > > > +# preserved in place, albeit with new virtual addresses in new > > > +# QEMU. The incoming migration channel cannot be a file type. > > > +# > > > +# This mode requires a second migration channel, specified by the > > > +# cpr-uri migration property on the outgoing side, and by > > > +# the cpr-uri QEMU command-line option on the incoming > > > +# side. The channel must be a type, such as unix socket, that > > > +# supports SCM_RIGHTS. > > > +# > > > +# Memory-backend objects must have the share=on attribute, but > > > +# memory-backend-epc is not supported. The VM must be started > > > +# with the '-machine anon-alloc=memfd' option. > > > +# > > > +# (since 9.2) > > > ## > > > { 'enum': 'MigMode', > > > - 'data': [ 'normal', 'cpr-reboot' ] } > > > + 'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] } > > > > No need to rush, but please add the CPR.rst and unit test updates when you > > feel confident on the protocol. It looks pretty good to me now. > > > > Especially it'll be nice to describe the separate cpr-channel protocol in > > the new doc page. > > Will do, now that there is light at the end of the tunnel. I just noticed that we have 1 month left before soft freeze. I'll try to prioritize review of this series (and the other VFIO one) in the upcoming month. Let's see whether it can hit 9.2.
Steven Sistare <steven.sistare@oracle.com> writes: > On 10/7/2024 3:44 PM, Peter Xu wrote: >> On Mon, Sep 30, 2024 at 12:40:44PM -0700, Steve Sistare wrote: >>> Add the cpr-transfer migration mode. Usage: >>> qemu-system-$arch -machine anon-alloc=memfd ... >>> >>> start new QEMU with "-incoming <uri-1> -cpr-uri <uri-2>" >>> >>> Issue commands to old QEMU: >>> migrate_set_parameter mode cpr-transfer >>> migrate_set_parameter cpr-uri <uri-2> >>> migrate -d <uri-1> >>> >>> The migrate command stops the VM, saves CPR state to uri-2, saves >>> normal migration state to uri-1, and old QEMU enters the postmigrate >>> state. The user starts new QEMU on the same host as old QEMU, with the >>> same arguments as old QEMU, plus the -incoming option. Guest RAM is >>> preserved in place, albeit with new virtual addresses in new QEMU. >>> >>> This mode requires a second migration channel, specified by the >>> cpr-uri migration property on the outgoing side, and by the cpr-uri >>> QEMU command-line option on the incoming side. The channel must >>> be a type, such as unix socket, that supports SCM_RIGHTS. >>> >>> Memory-backend objects must have the share=on attribute, but >>> memory-backend-epc is not supported. The VM must be started with >>> the '-machine anon-alloc=memfd' option, which allows anonymous >>> memory to be transferred in place to the new process. The memfds >>> are kept open by sending the descriptors to new QEMU via the >>> cpr-uri, which must support SCM_RIGHTS, and they are mmap'd >>> in new QEMU. >>> >>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> >>> --- >>> include/migration/cpr.h | 1 + >>> migration/cpr.c | 34 +++++++++++++++++++---- >>> migration/migration.c | 69 +++++++++++++++++++++++++++++++++++++++++++++-- >>> migration/migration.h | 2 ++ >>> migration/ram.c | 2 ++ >>> migration/vmstate-types.c | 5 ++-- >>> qapi/migration.json | 27 ++++++++++++++++++- >>> stubs/vmstate.c | 7 +++++ >>> 8 files changed, 137 insertions(+), 10 deletions(-) >>> >>> diff --git a/include/migration/cpr.h b/include/migration/cpr.h >>> index e886c98..5cd373f 100644 >>> --- a/include/migration/cpr.h >>> +++ b/include/migration/cpr.h >>> @@ -30,6 +30,7 @@ int cpr_state_save(Error **errp); >>> int cpr_state_load(Error **errp); >>> void cpr_state_close(void); >>> struct QIOChannel *cpr_state_ioc(void); >>> +bool cpr_needed_for_reuse(void *opaque); >>> >>> QEMUFile *cpr_transfer_output(const char *uri, Error **errp); >>> QEMUFile *cpr_transfer_input(const char *uri, Error **errp); >>> diff --git a/migration/cpr.c b/migration/cpr.c >>> index 86f66c1..911b556 100644 >>> --- a/migration/cpr.c >>> +++ b/migration/cpr.c >>> @@ -9,6 +9,7 @@ >>> #include "qapi/error.h" >>> #include "migration/cpr.h" >>> #include "migration/misc.h" >>> +#include "migration/options.h" >>> #include "migration/qemu-file.h" >>> #include "migration/savevm.h" >>> #include "migration/vmstate.h" >>> @@ -57,7 +58,7 @@ static const VMStateDescription vmstate_cpr_fd = { >>> VMSTATE_UINT32(namelen, CprFd), >>> VMSTATE_VBUFFER_ALLOC_UINT32(name, CprFd, 0, NULL, namelen), >>> VMSTATE_INT32(id, CprFd), >>> - VMSTATE_INT32(fd, CprFd), >>> + VMSTATE_FD(fd, CprFd), >>> VMSTATE_END_OF_LIST() >>> } >>> }; >>> @@ -174,9 +175,16 @@ int cpr_state_save(Error **errp) >>> { >>> int ret; >>> QEMUFile *f; >>> + MigMode mode = migrate_mode(); >>> >>> - /* set f based on mode in a later patch in this series */ >>> - return 0; >>> + if (mode == MIG_MODE_CPR_TRANSFER) { >>> + f = cpr_transfer_output(migrate_cpr_uri(), errp); >>> + } else { >>> + return 0; >>> + } >>> + if (!f) { >>> + return -1; >>> + } >>> >>> qemu_put_be32(f, QEMU_CPR_FILE_MAGIC); >>> qemu_put_be32(f, QEMU_CPR_FILE_VERSION); >>> @@ -205,8 +213,18 @@ int cpr_state_load(Error **errp) >>> uint32_t v; >>> QEMUFile *f; >>> >>> - /* set f based on mode in a later patch in this series */ >>> - return 0; >>> + /* >>> + * Mode will be loaded in CPR state, so cannot use it to decide which >>> + * form of state to load. >>> + */ >>> + if (cpr_uri) { >>> + f = cpr_transfer_input(cpr_uri, errp); >>> + } else { >>> + return 0; >>> + } >>> + if (!f) { >>> + return -1; >>> + } >>> >>> v = qemu_get_be32(f); >>> if (v != QEMU_CPR_FILE_MAGIC) { >>> @@ -243,3 +261,9 @@ void cpr_state_close(void) >>> cpr_state_file = NULL; >>> } >>> } >>> + >>> +bool cpr_needed_for_reuse(void *opaque) >>> +{ >>> + MigMode mode = migrate_mode(); >>> + return mode == MIG_MODE_CPR_TRANSFER; >>> +} >> >> Drop it until used? > > Maybe, but here is my reason for including it here. > > These common functions like cpr_needed_for_reuse and cpr_resave_fd are needed > by multiple follow-on series: vfio, tap, iommufd. To send those for comment, > as I have beem, I need to prepend a patch for cpr_needed_for_reuse to each of > those series, which is redundant. It makes more sense IMO to include them in > this initial series. > > But, it's your call. > >>> diff --git a/migration/migration.c b/migration/migration.c >>> index 3301583..73b85aa 100644 >>> --- a/migration/migration.c >>> +++ b/migration/migration.c >>> @@ -76,6 +76,7 @@ >>> static NotifierWithReturnList migration_state_notifiers[] = { >>> NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL), >>> NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT), >>> + NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_TRANSFER), >>> }; >>> >>> /* Messages sent on the return path from destination to source */ >>> @@ -109,6 +110,7 @@ static int migration_maybe_pause(MigrationState *s, >>> static void migrate_fd_cancel(MigrationState *s); >>> static bool close_return_path_on_source(MigrationState *s); >>> static void migration_completion_end(MigrationState *s); >>> +static void migrate_hup_delete(MigrationState *s); >>> >>> static void migration_downtime_start(MigrationState *s) >>> { >>> @@ -204,6 +206,12 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, >>> return false; >>> } >>> >>> + if (migrate_mode() == MIG_MODE_CPR_TRANSFER && >>> + addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { >>> + error_setg(errp, "Migration requires streamable transport (eg unix)"); >>> + return false; >>> + } >>> + >>> return true; >>> } >>> >>> @@ -316,6 +324,7 @@ void migration_cancel(const Error *error) >>> qmp_cancel_vcpu_dirty_limit(false, -1, NULL); >>> } >>> migrate_fd_cancel(current_migration); >>> + migrate_hup_delete(current_migration); >>> } >>> >>> void migration_shutdown(void) >>> @@ -718,6 +727,9 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, >>> } else { >>> error_setg(errp, "unknown migration protocol: %s", uri); >>> } >>> + >>> + /* Close cpr socket to tell source that we are listening */ >>> + cpr_state_close(); >> >> Would it be possible to use some explicit reply message to mark this? > > In theory yes, but I fear that using a return channel with message parsing and > dispatch adds more code than it is worth. I think this approach is fine for now, but I wonder whether we could reuse the current return path (RP) by starting it earlier and take benefit from it already having the message passing infrastructure in place. I'm actually looking ahead to the migration handshake thread[1], which could be thought to have some similarity with the early cpr channel. So having a generic channel in place early on to handle handshake, CPR, RP, etc. could be a good idea. Anyway, I'm probing on this a bit so I can start drafting something. I got surprised that we don't even have the capability bits in the stream in a useful way (currently, configuration_validate_capabilities() does kind of nothing). 1- https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake
On Tue, Oct 08, 2024 at 03:28:30PM -0300, Fabiano Rosas wrote: > >>> + /* Close cpr socket to tell source that we are listening */ > >>> + cpr_state_close(); > >> > >> Would it be possible to use some explicit reply message to mark this? > > > > In theory yes, but I fear that using a return channel with message parsing and > > dispatch adds more code than it is worth. > > I think this approach is fine for now, but I wonder whether we could > reuse the current return path (RP) by starting it earlier and take > benefit from it already having the message passing infrastructure in > place. I'm actually looking ahead to the migration handshake thread[1], > which could be thought to have some similarity with the early cpr > channel. So having a generic channel in place early on to handle > handshake, CPR, RP, etc. could be a good idea. The current design relies on CPR stage happens before device realize()s, so I assume migration channel (including RP) isn't easily applicable at as early as this stage. However I think dest qemu can directly write back to the cpr_uri channel instead if we want and then follow a protocol simple enough (even though it'll be separate from the migration stream protocol). What worries me more (besides using HUP as of now..) is cpr_state_save() is currently synchronous and can block the main iothread. It means if cpr destination is not properly setup, it can hang the main thread (including e.g. QMP monitor) at qio_channel_socket_connect_sync(). Ideally we shouldn't block the main thread. If async-mode can be done, it might be even easier, e.g. if we do cpr_state_save() in a thread, after qemu_put*() we can directly qemu_get*() in the same context with the pairing return qemufile. But maybe we can do it in two steps, merging HUP first. Then when a better protocol (plus async mode) ready, one can boost QEMU_CPR_FILE_VERSION. I'll see how Steve wants to address it. > > Anyway, I'm probing on this a bit so I can start drafting something. I > got surprised that we don't even have the capability bits in the stream > in a useful way (currently, configuration_validate_capabilities() does > kind of nothing). > > 1- https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake Happy to know this. I was thinking whether I should work on this even earlier, so if you're looking at that it'll be great. The major pain to me is the channel establishment part where we now have all kinds of channels, so we should really fix that sooner (e.g., we hope to enable multifd + postcopy very soon, that requires multifd and preempt channels appear in the same time). It was reasonable the vfio/multifd series tried to fix it.
Peter Xu <peterx@redhat.com> writes: > On Tue, Oct 08, 2024 at 03:28:30PM -0300, Fabiano Rosas wrote: >> >>> + /* Close cpr socket to tell source that we are listening */ >> >>> + cpr_state_close(); >> >> >> >> Would it be possible to use some explicit reply message to mark this? >> > >> > In theory yes, but I fear that using a return channel with message parsing and >> > dispatch adds more code than it is worth. >> >> I think this approach is fine for now, but I wonder whether we could >> reuse the current return path (RP) by starting it earlier and take >> benefit from it already having the message passing infrastructure in >> place. I'm actually looking ahead to the migration handshake thread[1], >> which could be thought to have some similarity with the early cpr >> channel. So having a generic channel in place early on to handle >> handshake, CPR, RP, etc. could be a good idea. > > The current design relies on CPR stage happens before device realize()s, so > I assume migration channel (including RP) isn't easily applicable at as > early as this stage. Well, what is the dependency for the RP? If we can send CPR state, we can send QEMU_VM_COMMAND, no? > > However I think dest qemu can directly write back to the cpr_uri channel > instead if we want and then follow a protocol simple enough (even though > it'll be separate from the migration stream protocol). > > What worries me more (besides using HUP as of now..) is cpr_state_save() is > currently synchronous and can block the main iothread. It means if cpr > destination is not properly setup, it can hang the main thread (including > e.g. QMP monitor) at qio_channel_socket_connect_sync(). Ideally we > shouldn't block the main thread. > > If async-mode can be done, it might be even easier, e.g. if we do > cpr_state_save() in a thread, after qemu_put*() we can directly qemu_get*() > in the same context with the pairing return qemufile. > > But maybe we can do it in two steps, merging HUP first. Then when a better > protocol (plus async mode) ready, one can boost QEMU_CPR_FILE_VERSION. > I'll see how Steve wants to address it. I agree HUP is fine at the moment. > >> >> Anyway, I'm probing on this a bit so I can start drafting something. I >> got surprised that we don't even have the capability bits in the stream >> in a useful way (currently, configuration_validate_capabilities() does >> kind of nothing). >> >> 1- https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake > > Happy to know this. I was thinking whether I should work on this even > earlier, so if you're looking at that it'll be great. As of half an hour ago =) We could put a feature branch up and work together, if you have more concrete thoughts on how this would look like let me know. > > The major pain to me is the channel establishment part where we now have > all kinds of channels, so we should really fix that sooner (e.g., we hope > to enable multifd + postcopy very soon, that requires multifd and preempt > channels appear in the same time). It was reasonable the vfio/multifd > series tried to fix it.
On 10/8/2024 11:45 AM, Peter Xu wrote: > On Mon, Oct 07, 2024 at 04:39:25PM -0400, Steven Sistare wrote: >> On 10/7/2024 3:44 PM, Peter Xu wrote: >>> On Mon, Sep 30, 2024 at 12:40:44PM -0700, Steve Sistare wrote: >>>> Add the cpr-transfer migration mode. Usage: >>>> qemu-system-$arch -machine anon-alloc=memfd ... >>>> >>>> start new QEMU with "-incoming <uri-1> -cpr-uri <uri-2>" >>>> >>>> Issue commands to old QEMU: >>>> migrate_set_parameter mode cpr-transfer >>>> migrate_set_parameter cpr-uri <uri-2> >>>> migrate -d <uri-1> >>>> >>>> The migrate command stops the VM, saves CPR state to uri-2, saves >>>> normal migration state to uri-1, and old QEMU enters the postmigrate >>>> state. The user starts new QEMU on the same host as old QEMU, with the >>>> same arguments as old QEMU, plus the -incoming option. Guest RAM is >>>> preserved in place, albeit with new virtual addresses in new QEMU. >>>> >>>> This mode requires a second migration channel, specified by the >>>> cpr-uri migration property on the outgoing side, and by the cpr-uri >>>> QEMU command-line option on the incoming side. The channel must >>>> be a type, such as unix socket, that supports SCM_RIGHTS. >>>> >>>> Memory-backend objects must have the share=on attribute, but >>>> memory-backend-epc is not supported. The VM must be started with >>>> the '-machine anon-alloc=memfd' option, which allows anonymous >>>> memory to be transferred in place to the new process. The memfds >>>> are kept open by sending the descriptors to new QEMU via the >>>> cpr-uri, which must support SCM_RIGHTS, and they are mmap'd >>>> in new QEMU. >>>> >>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> >>>> --- >>>> include/migration/cpr.h | 1 + >>>> migration/cpr.c | 34 +++++++++++++++++++---- >>>> migration/migration.c | 69 +++++++++++++++++++++++++++++++++++++++++++++-- >>>> migration/migration.h | 2 ++ >>>> migration/ram.c | 2 ++ >>>> migration/vmstate-types.c | 5 ++-- >>>> qapi/migration.json | 27 ++++++++++++++++++- >>>> stubs/vmstate.c | 7 +++++ >>>> 8 files changed, 137 insertions(+), 10 deletions(-) >>>> >>>> diff --git a/include/migration/cpr.h b/include/migration/cpr.h >>>> index e886c98..5cd373f 100644 >>>> --- a/include/migration/cpr.h >>>> +++ b/include/migration/cpr.h >>>> @@ -30,6 +30,7 @@ int cpr_state_save(Error **errp); >>>> int cpr_state_load(Error **errp); >>>> void cpr_state_close(void); >>>> struct QIOChannel *cpr_state_ioc(void); >>>> +bool cpr_needed_for_reuse(void *opaque); >>>> QEMUFile *cpr_transfer_output(const char *uri, Error **errp); >>>> QEMUFile *cpr_transfer_input(const char *uri, Error **errp); >>>> diff --git a/migration/cpr.c b/migration/cpr.c >>>> index 86f66c1..911b556 100644 >>>> --- a/migration/cpr.c >>>> +++ b/migration/cpr.c >>>> @@ -9,6 +9,7 @@ >>>> #include "qapi/error.h" >>>> #include "migration/cpr.h" >>>> #include "migration/misc.h" >>>> +#include "migration/options.h" >>>> #include "migration/qemu-file.h" >>>> #include "migration/savevm.h" >>>> #include "migration/vmstate.h" >>>> @@ -57,7 +58,7 @@ static const VMStateDescription vmstate_cpr_fd = { >>>> VMSTATE_UINT32(namelen, CprFd), >>>> VMSTATE_VBUFFER_ALLOC_UINT32(name, CprFd, 0, NULL, namelen), >>>> VMSTATE_INT32(id, CprFd), >>>> - VMSTATE_INT32(fd, CprFd), >>>> + VMSTATE_FD(fd, CprFd), >>>> VMSTATE_END_OF_LIST() >>>> } >>>> }; >>>> @@ -174,9 +175,16 @@ int cpr_state_save(Error **errp) >>>> { >>>> int ret; >>>> QEMUFile *f; >>>> + MigMode mode = migrate_mode(); >>>> - /* set f based on mode in a later patch in this series */ >>>> - return 0; >>>> + if (mode == MIG_MODE_CPR_TRANSFER) { >>>> + f = cpr_transfer_output(migrate_cpr_uri(), errp); >>>> + } else { >>>> + return 0; >>>> + } >>>> + if (!f) { >>>> + return -1; >>>> + } >>>> qemu_put_be32(f, QEMU_CPR_FILE_MAGIC); >>>> qemu_put_be32(f, QEMU_CPR_FILE_VERSION); >>>> @@ -205,8 +213,18 @@ int cpr_state_load(Error **errp) >>>> uint32_t v; >>>> QEMUFile *f; >>>> - /* set f based on mode in a later patch in this series */ >>>> - return 0; >>>> + /* >>>> + * Mode will be loaded in CPR state, so cannot use it to decide which >>>> + * form of state to load. >>>> + */ >>>> + if (cpr_uri) { >>>> + f = cpr_transfer_input(cpr_uri, errp); >>>> + } else { >>>> + return 0; >>>> + } >>>> + if (!f) { >>>> + return -1; >>>> + } >>>> v = qemu_get_be32(f); >>>> if (v != QEMU_CPR_FILE_MAGIC) { >>>> @@ -243,3 +261,9 @@ void cpr_state_close(void) >>>> cpr_state_file = NULL; >>>> } >>>> } >>>> + >>>> +bool cpr_needed_for_reuse(void *opaque) >>>> +{ >>>> + MigMode mode = migrate_mode(); >>>> + return mode == MIG_MODE_CPR_TRANSFER; >>>> +} >>> >>> Drop it until used? >> >> Maybe, but here is my reason for including it here. >> >> These common functions like cpr_needed_for_reuse and cpr_resave_fd are needed >> by multiple follow-on series: vfio, tap, iommufd. To send those for comment, >> as I have beem, I need to prepend a patch for cpr_needed_for_reuse to each of >> those series, which is redundant. It makes more sense IMO to include them in >> this initial series. >> >> But, it's your call. > > Hmm, logically we shouldn't keep any dead code in QEMU, but indeed this is > slightly special. > > Would you mind keeping all these helpers in a separate patch after the base > patches? The commit message should describe what future projects will > start to use it, then whoever noticed later (I at least know Dave has quite > a few patches recently removing dead code in QEMU) will know that's > potentially to-be-used code, so should keep them around. I have split the functions into a separate patch. I'll hold onto it until posting the next series, no big deal. >>>> diff --git a/migration/migration.c b/migration/migration.c >>>> index 3301583..73b85aa 100644 >>>> --- a/migration/migration.c >>>> +++ b/migration/migration.c >>>> @@ -76,6 +76,7 @@ >>>> static NotifierWithReturnList migration_state_notifiers[] = { >>>> NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL), >>>> NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT), >>>> + NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_TRANSFER), >>>> }; >>>> /* Messages sent on the return path from destination to source */ >>>> @@ -109,6 +110,7 @@ static int migration_maybe_pause(MigrationState *s, >>>> static void migrate_fd_cancel(MigrationState *s); >>>> static bool close_return_path_on_source(MigrationState *s); >>>> static void migration_completion_end(MigrationState *s); >>>> +static void migrate_hup_delete(MigrationState *s); >>>> static void migration_downtime_start(MigrationState *s) >>>> { >>>> @@ -204,6 +206,12 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, >>>> return false; >>>> } >>>> + if (migrate_mode() == MIG_MODE_CPR_TRANSFER && >>>> + addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { >>>> + error_setg(errp, "Migration requires streamable transport (eg unix)"); >>>> + return false; >>>> + } >>>> + >>>> return true; >>>> } >>>> @@ -316,6 +324,7 @@ void migration_cancel(const Error *error) >>>> qmp_cancel_vcpu_dirty_limit(false, -1, NULL); >>>> } >>>> migrate_fd_cancel(current_migration); >>>> + migrate_hup_delete(current_migration); >>>> } >>>> void migration_shutdown(void) >>>> @@ -718,6 +727,9 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, >>>> } else { >>>> error_setg(errp, "unknown migration protocol: %s", uri); >>>> } >>>> + >>>> + /* Close cpr socket to tell source that we are listening */ >>>> + cpr_state_close(); >>> >>> Would it be possible to use some explicit reply message to mark this? >> >> In theory yes, but I fear that using a return channel with message parsing and >> dispatch adds more code than it is worth. >> >>> So >>> far looks like src QEMU will continue with qmp_migrate_finish() even if the >>> cpr channel was closed due to error. >> >> Yes, but we recover just fine. The target hits some error, fails to read all the >> cpr state, closes the channel prematurely, and does *not* create a listen socket >> for the normal migration channel. Hence qmp_migrate_finish fails to connect to the >> normal channel, and recovers. > > This is slightly tricky part and would be nice to be documented somewhere, > perhaps starting from in the commit message. I will extend the block comment in qmp_migrate: /* * For cpr-transfer, the target may not be listening yet on the migration * channel, because first it must finish cpr_load_state. The target tells * us it is listening by closing the cpr-state socket. Wait for that HUP * event before connecting in qmp_migrate_finish. * * The HUP could occur because the target fails while reading CPR state, * in which case the target will not listen for the incoming migration * connection, so qmp_migrate_finish will fail to connect, and then recover. */ > Then the error will say "failed to connect to destination QEMU" hiding the > real failure (cpr save/load failed), right? That's slightly a pity. Yes, but destination qemu will also emit a more specific message. > I'm OK with the HUP as of now, but if you care about accurate CPR-stage > error reporting, then feel free to draft something else in the next post. I'll think about it, but to get cpr into 9.2, this will probably need to be deferred as a future enhancement. >>> I still didn't see how that kind of issue was captured below [1] (e.g., cpr >>> channel broken after sending partial fds)? >> >> Same as above. >> >>>> } >>>> static void process_incoming_migration_bh(void *opaque) >>>> @@ -1414,6 +1426,8 @@ static void migrate_fd_cleanup(MigrationState *s) >>>> s->vmdesc = NULL; >>>> qemu_savevm_state_cleanup(); >>>> + cpr_state_close(); >>>> + migrate_hup_delete(s); >>>> close_return_path_on_source(s); >>>> @@ -1698,7 +1712,9 @@ bool migration_thread_is_self(void) >>>> bool migrate_mode_is_cpr(MigrationState *s) >>>> { >>>> - return s->parameters.mode == MIG_MODE_CPR_REBOOT; >>>> + MigMode mode = s->parameters.mode; >>>> + return mode == MIG_MODE_CPR_REBOOT || >>>> + mode == MIG_MODE_CPR_TRANSFER; >>>> } >>>> int migrate_init(MigrationState *s, Error **errp) >>>> @@ -2033,6 +2049,12 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) >>>> return false; >>>> } >>>> + if (migrate_mode() == MIG_MODE_CPR_TRANSFER && >>>> + !s->parameters.cpr_uri) { >>>> + error_setg(errp, "cpr-transfer mode requires setting cpr-uri"); >>>> + return false; >>>> + } >>>> + >>>> if (migration_is_blocked(errp)) { >>>> return false; >>>> } >>>> @@ -2076,6 +2098,37 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) >>>> static void qmp_migrate_finish(MigrationAddress *addr, bool resume_requested, >>>> Error **errp); >>>> +static void migrate_hup_add(MigrationState *s, QIOChannel *ioc, GSourceFunc cb, >>>> + void *opaque) >>>> +{ >>>> + s->hup_source = qio_channel_create_watch(ioc, G_IO_HUP); >>>> + g_source_set_callback(s->hup_source, cb, opaque, NULL); >>>> + g_source_attach(s->hup_source, NULL); >>>> +} >>>> + >>>> +static void migrate_hup_delete(MigrationState *s) >>>> +{ >>>> + if (s->hup_source) { >>>> + g_source_destroy(s->hup_source); >>>> + g_source_unref(s->hup_source); >>>> + s->hup_source = NULL; >>>> + } >>>> +} >>>> + >>>> +static gboolean qmp_migrate_finish_cb(QIOChannel *channel, >>>> + GIOCondition cond, >>>> + void *opaque) >>>> +{ >>>> + MigrationAddress *addr = opaque; >>> >>> [1] >>> >>>> + >>>> + qmp_migrate_finish(addr, false, NULL); >>>> + >>>> + cpr_state_close(); >>>> + migrate_hup_delete(migrate_get_current()); >>>> + qapi_free_MigrationAddress(addr); >>>> + return G_SOURCE_REMOVE; >>>> +} >>>> + >>>> void qmp_migrate(const char *uri, bool has_channels, >>>> MigrationChannelList *channels, bool has_detach, bool detach, >>>> bool has_resume, bool resume, Error **errp) >>>> @@ -2136,7 +2189,19 @@ void qmp_migrate(const char *uri, bool has_channels, >>>> goto out; >>>> } >>>> - qmp_migrate_finish(addr, resume_requested, errp); >>>> + /* >>>> + * For cpr-transfer, the target may not be listening yet on the migration >>>> + * channel, because first it must finish cpr_load_state. The target tells >>>> + * us it is listening by closing the cpr-state socket. Wait for that HUP >>>> + * event before connecting in qmp_migrate_finish. >>>> + */ >>>> + if (s->parameters.mode == MIG_MODE_CPR_TRANSFER) { >>>> + migrate_hup_add(s, cpr_state_ioc(), (GSourceFunc)qmp_migrate_finish_cb, >>>> + QAPI_CLONE(MigrationAddress, addr)); >>>> + >>>> + } else { >>>> + qmp_migrate_finish(addr, resume_requested, errp); >>>> + } >>>> out: >>>> if (local_err) { >>>> diff --git a/migration/migration.h b/migration/migration.h >>>> index 38aa140..74c167b 100644 >>>> --- a/migration/migration.h >>>> +++ b/migration/migration.h >>>> @@ -457,6 +457,8 @@ struct MigrationState { >>>> bool switchover_acked; >>>> /* Is this a rdma migration */ >>>> bool rdma_migration; >>>> + >>>> + GSource *hup_source; >>>> }; >>>> void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, >>>> diff --git a/migration/ram.c b/migration/ram.c >>>> index 81eda27..e2cef50 100644 >>>> --- a/migration/ram.c >>>> +++ b/migration/ram.c >>>> @@ -216,7 +216,9 @@ static bool postcopy_preempt_active(void) >>>> bool migrate_ram_is_ignored(RAMBlock *block) >>>> { >>>> + MigMode mode = migrate_mode(); >>>> return !qemu_ram_is_migratable(block) || >>>> + mode == MIG_MODE_CPR_TRANSFER || >>>> (migrate_ignore_shared() && qemu_ram_is_shared(block) >>>> && qemu_ram_is_named_file(block)); >>>> } >>>> diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c >>>> index 6e45a4a..b5a55b8 100644 >>>> --- a/migration/vmstate-types.c >>>> +++ b/migration/vmstate-types.c >>>> @@ -15,6 +15,7 @@ >>>> #include "qemu-file.h" >>>> #include "migration.h" >>>> #include "migration/vmstate.h" >>>> +#include "migration/client-options.h" >>>> #include "qemu/error-report.h" >>>> #include "qemu/queue.h" >>>> #include "trace.h" >>>> @@ -321,7 +322,7 @@ static int get_fd(QEMUFile *f, void *pv, size_t size, >>>> { >>>> int32_t *v = pv; >>>> qemu_get_sbe32s(f, v); >>>> - if (*v < 0) { >>>> + if (*v < 0 || migrate_mode() != MIG_MODE_CPR_TRANSFER) { >>>> return 0; >>>> } >>>> *v = qemu_file_get_fd(f); >>>> @@ -334,7 +335,7 @@ static int put_fd(QEMUFile *f, void *pv, size_t size, >>>> int32_t *v = pv; >>>> qemu_put_sbe32s(f, v); >>>> - if (*v < 0) { >>>> + if (*v < 0 || migrate_mode() != MIG_MODE_CPR_TRANSFER) { >>> >>> So I suppose you wanted to guard VMSTATE_FD being abused. Then I wonder >>> whether it'll help more by adding a comment above VMSTATE_FD instead; it'll >>> be more straightforward to me. >>> >>> And if you want to fail hard, assert should work better too in runtime, or >>> the "return 0" can be pretty hard to notice. >> >> No, this code is not about detecting abuse or errors. It is there to skip >> the qemu_file_put_fd for cpr-exec mode. In my next version this function will >> simply be: >> >> static int put_fd(QEMUFile *f, void *pv, size_t size, >> const VMStateField *field, JSONWriter *vmdesc) >> { >> int32_t *v = pv; >> return qemu_file_put_fd(f, *v); >> } > > Great, thanks. > >> >>>> return 0; >>>> } >>>> return qemu_file_put_fd(f, *v); >>>> diff --git a/qapi/migration.json b/qapi/migration.json >>>> index c0d8bcc..f51b4cb 100644 >>>> --- a/qapi/migration.json >>>> +++ b/qapi/migration.json >>>> @@ -611,9 +611,34 @@ >>>> # or COLO. >>>> # >>>> # (since 8.2) >>>> +# >>>> +# @cpr-transfer: This mode allows the user to transfer a guest to a >>>> +# new QEMU instance on the same host with minimal guest pause >>>> +# time, by preserving guest RAM in place, albeit with new virtual >>>> +# addresses in new QEMU. >>>> +# >>>> +# The user starts new QEMU on the same host as old QEMU, with the >>>> +# the same arguments as old QEMU, plus the -incoming option. The >>>> +# user issues the migrate command to old QEMU, which stops the VM, >>>> +# saves state to the migration channels, and enters the >>>> +# postmigrate state. Execution resumes in new QEMU. Guest RAM is >>>> +# preserved in place, albeit with new virtual addresses in new >>>> +# QEMU. The incoming migration channel cannot be a file type. >>>> +# >>>> +# This mode requires a second migration channel, specified by the >>>> +# cpr-uri migration property on the outgoing side, and by >>>> +# the cpr-uri QEMU command-line option on the incoming >>>> +# side. The channel must be a type, such as unix socket, that >>>> +# supports SCM_RIGHTS. >>>> +# >>>> +# Memory-backend objects must have the share=on attribute, but >>>> +# memory-backend-epc is not supported. The VM must be started >>>> +# with the '-machine anon-alloc=memfd' option. >>>> +# >>>> +# (since 9.2) >>>> ## >>>> { 'enum': 'MigMode', >>>> - 'data': [ 'normal', 'cpr-reboot' ] } >>>> + 'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] } >>> >>> No need to rush, but please add the CPR.rst and unit test updates when you >>> feel confident on the protocol. It looks pretty good to me now. >>> >>> Especially it'll be nice to describe the separate cpr-channel protocol in >>> the new doc page. >> >> Will do, now that there is light at the end of the tunnel. > > I just noticed that we have 1 month left before soft freeze. I'll try to > prioritize review of this series (and the other VFIO one) in the upcoming > month. Let's see whether it can hit 9.2. Cool, thanks, I will also make an extra effort to hit that goal. - Steve
On 10/8/2024 2:47 PM, Peter Xu wrote: > On Tue, Oct 08, 2024 at 03:28:30PM -0300, Fabiano Rosas wrote: >>>>> + /* Close cpr socket to tell source that we are listening */ >>>>> + cpr_state_close(); >>>> >>>> Would it be possible to use some explicit reply message to mark this? >>> >>> In theory yes, but I fear that using a return channel with message parsing and >>> dispatch adds more code than it is worth. >> >> I think this approach is fine for now, but I wonder whether we could >> reuse the current return path (RP) by starting it earlier and take >> benefit from it already having the message passing infrastructure in >> place. I'm actually looking ahead to the migration handshake thread[1], >> which could be thought to have some similarity with the early cpr >> channel. So having a generic channel in place early on to handle >> handshake, CPR, RP, etc. could be a good idea. > > The current design relies on CPR stage happens before device realize()s, so > I assume migration channel (including RP) isn't easily applicable at as > early as this stage. > > However I think dest qemu can directly write back to the cpr_uri channel > instead if we want and then follow a protocol simple enough (even though > it'll be separate from the migration stream protocol). > > What worries me more (besides using HUP as of now..) is cpr_state_save() is > currently synchronous and can block the main iothread. It means if cpr > destination is not properly setup, it can hang the main thread (including > e.g. QMP monitor) at qio_channel_socket_connect_sync(). Ideally we > shouldn't block the main thread. > > If async-mode can be done, it might be even easier, e.g. if we do > cpr_state_save() in a thread, after qemu_put*() we can directly qemu_get*() > in the same context with the pairing return qemufile. > > But maybe we can do it in two steps, merging HUP first. Then when a better > protocol (plus async mode) ready, one can boost QEMU_CPR_FILE_VERSION. > I'll see how Steve wants to address it. Our emails on this subject crossed. I agree that an async channel both on send and recv sounds like the way to go, but I would like to keep HUP for now, and pursue that as a future RFE. - Steve >> Anyway, I'm probing on this a bit so I can start drafting something. I >> got surprised that we don't even have the capability bits in the stream >> in a useful way (currently, configuration_validate_capabilities() does >> kind of nothing). >> >> 1- https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake > > Happy to know this. I was thinking whether I should work on this even > earlier, so if you're looking at that it'll be great. > > The major pain to me is the channel establishment part where we now have > all kinds of channels, so we should really fix that sooner (e.g., we hope > to enable multifd + postcopy very soon, that requires multifd and preempt > channels appear in the same time). It was reasonable the vfio/multifd > series tried to fix it. >
On 10/8/2024 3:11 PM, Fabiano Rosas wrote: > Peter Xu <peterx@redhat.com> writes: > >> On Tue, Oct 08, 2024 at 03:28:30PM -0300, Fabiano Rosas wrote: >>>>>> + /* Close cpr socket to tell source that we are listening */ >>>>>> + cpr_state_close(); >>>>> >>>>> Would it be possible to use some explicit reply message to mark this? >>>> >>>> In theory yes, but I fear that using a return channel with message parsing and >>>> dispatch adds more code than it is worth. >>> >>> I think this approach is fine for now, but I wonder whether we could >>> reuse the current return path (RP) by starting it earlier and take >>> benefit from it already having the message passing infrastructure in >>> place. I'm actually looking ahead to the migration handshake thread[1], >>> which could be thought to have some similarity with the early cpr >>> channel. So having a generic channel in place early on to handle >>> handshake, CPR, RP, etc. could be a good idea. >> >> The current design relies on CPR stage happens before device realize()s, so >> I assume migration channel (including RP) isn't easily applicable at as >> early as this stage. > > Well, what is the dependency for the RP? If we can send CPR state, we > can send QEMU_VM_COMMAND, no? The CPR state channel is (and must be) used before migration_object_init, and before the normal migration channel is opened. Thus we cannot use the normal return path. - Steve >> However I think dest qemu can directly write back to the cpr_uri channel >> instead if we want and then follow a protocol simple enough (even though >> it'll be separate from the migration stream protocol). >> >> What worries me more (besides using HUP as of now..) is cpr_state_save() is >> currently synchronous and can block the main iothread. It means if cpr >> destination is not properly setup, it can hang the main thread (including >> e.g. QMP monitor) at qio_channel_socket_connect_sync(). Ideally we >> shouldn't block the main thread. >> >> If async-mode can be done, it might be even easier, e.g. if we do >> cpr_state_save() in a thread, after qemu_put*() we can directly qemu_get*() >> in the same context with the pairing return qemufile. >> >> But maybe we can do it in two steps, merging HUP first. Then when a better >> protocol (plus async mode) ready, one can boost QEMU_CPR_FILE_VERSION. >> I'll see how Steve wants to address it. > > I agree HUP is fine at the moment. > >> >>> >>> Anyway, I'm probing on this a bit so I can start drafting something. I >>> got surprised that we don't even have the capability bits in the stream >>> in a useful way (currently, configuration_validate_capabilities() does >>> kind of nothing). >>> >>> 1- https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake >> >> Happy to know this. I was thinking whether I should work on this even >> earlier, so if you're looking at that it'll be great. > > As of half an hour ago =) We could put a feature branch up and work > together, if you have more concrete thoughts on how this would look like > let me know. > >> >> The major pain to me is the channel establishment part where we now have >> all kinds of channels, so we should really fix that sooner (e.g., we hope >> to enable multifd + postcopy very soon, that requires multifd and preempt >> channels appear in the same time). It was reasonable the vfio/multifd >> series tried to fix it. >
On Tue, Oct 08, 2024 at 03:12:32PM -0400, Steven Sistare wrote: > > This is slightly tricky part and would be nice to be documented somewhere, > > perhaps starting from in the commit message. > > I will extend the block comment in qmp_migrate: > > /* > * For cpr-transfer, the target may not be listening yet on the migration > * channel, because first it must finish cpr_load_state. The target tells > * us it is listening by closing the cpr-state socket. Wait for that HUP > * event before connecting in qmp_migrate_finish. > * > * The HUP could occur because the target fails while reading CPR state, > * in which case the target will not listen for the incoming migration > * connection, so qmp_migrate_finish will fail to connect, and then recover. > */ Yes this is better, thanks. > > > Then the error will say "failed to connect to destination QEMU" hiding the > > real failure (cpr save/load failed), right? That's slightly a pity. > > Yes, but destination qemu will also emit a more specific message. True. > > > I'm OK with the HUP as of now, but if you care about accurate CPR-stage > > error reporting, then feel free to draft something else in the next post. > > I'll think about it, but to get cpr into 9.2, this will probably need to be > deferred as a future enhancement. Yep that's OK.
On Tue, Oct 08, 2024 at 04:11:38PM -0300, Fabiano Rosas wrote: > As of half an hour ago =) We could put a feature branch up and work > together, if you have more concrete thoughts on how this would look like > let me know. [I'll hijack this thread with one more email, as this is not cpr-relevant] I think I listed all the things I can think of in the wiki, so please go ahead. One trivial suggestion is we can start from the very simple, which is the handshake itself, with a self-bootstrap protocol, probably feature-bit based or whatever you prefer. Then we set bit 0 saying "this QEMU knows how to handshake". Comparing to the rest requirement, IMHO we can make the channel establishment the 1st feature, then it's already good for merging, having feature bit 1 saying "this qemu understands named channel establishment". Then we add new feature bits on top of the handshake feature, by adding more feature bits. Both QEMUs should first handshake on the feature bits they support and enable only the subset that all support. Or instead of bit, feature strings, etc. would all work which you prefer. Just to say we don't need to impl all the ideas there, as some of them might take more time (e.g. device tree check), and that list is probably not complete anyway.
On 10/8/2024 3:48 PM, Peter Xu wrote: > On Tue, Oct 08, 2024 at 04:11:38PM -0300, Fabiano Rosas wrote: >> As of half an hour ago =) We could put a feature branch up and work >> together, if you have more concrete thoughts on how this would look like >> let me know. > > [I'll hijack this thread with one more email, as this is not cpr-relevant] > > I think I listed all the things I can think of in the wiki, so please go > ahead. > > One trivial suggestion is we can start from the very simple, which is the > handshake itself, with a self-bootstrap protocol, probably feature-bit > based or whatever you prefer. Then we set bit 0 saying "this QEMU knows > how to handshake". > > Comparing to the rest requirement, IMHO we can make the channel > establishment the 1st feature, then it's already good for merging, having > feature bit 1 saying "this qemu understands named channel establishment". > > Then we add new feature bits on top of the handshake feature, by adding > more feature bits. Both QEMUs should first handshake on the feature bits > they support and enable only the subset that all support. > > Or instead of bit, feature strings, etc. would all work which you > prefer. Just to say we don't need to impl all the ideas there, as some of > them might take more time (e.g. device tree check), and that list is > probably not complete anyway. While writing a qtest for cpr-transfer, I discovered a problem that could be solved with an early migration handshake, prior to cpr_save_state / cpr_load_state. There is currently no way to set migration caps on dest qemu before starting cpr-transfer, because dest qemu blocks in cpr_state_load before creating any devices or monitors. It is unblocked after the user sends the migrate command to source qemu, but then the migration starts and it is too late to set migration capabilities or parameters on the dest. Are you OK with that restriction (for now, until a handshake is implemented)? If not, I have a problem. I can hack the qtest to make it work with the restriction. - Steve
On Wed, Oct 09, 2024 at 02:43:44PM -0400, Steven Sistare wrote: > On 10/8/2024 3:48 PM, Peter Xu wrote: > > On Tue, Oct 08, 2024 at 04:11:38PM -0300, Fabiano Rosas wrote: > > > As of half an hour ago =) We could put a feature branch up and work > > > together, if you have more concrete thoughts on how this would look like > > > let me know. > > > > [I'll hijack this thread with one more email, as this is not cpr-relevant] > > > > I think I listed all the things I can think of in the wiki, so please go > > ahead. > > > > One trivial suggestion is we can start from the very simple, which is the > > handshake itself, with a self-bootstrap protocol, probably feature-bit > > based or whatever you prefer. Then we set bit 0 saying "this QEMU knows > > how to handshake". > > > > Comparing to the rest requirement, IMHO we can make the channel > > establishment the 1st feature, then it's already good for merging, having > > feature bit 1 saying "this qemu understands named channel establishment". > > > > Then we add new feature bits on top of the handshake feature, by adding > > more feature bits. Both QEMUs should first handshake on the feature bits > > they support and enable only the subset that all support. > > > > Or instead of bit, feature strings, etc. would all work which you > > prefer. Just to say we don't need to impl all the ideas there, as some of > > them might take more time (e.g. device tree check), and that list is > > probably not complete anyway. > > While writing a qtest for cpr-transfer, I discovered a problem that could be > solved with an early migration handshake, prior to cpr_save_state / cpr_load_state. > > There is currently no way to set migration caps on dest qemu before starting > cpr-transfer, because dest qemu blocks in cpr_state_load before creating any > devices or monitors. It is unblocked after the user sends the migrate command > to source qemu, but then the migration starts and it is too late to set migration > capabilities or parameters on the dest. > > Are you OK with that restriction (for now, until a handshake is implemented)? > If not, I have a problem. > > I can hack the qtest to make it work with the restriction. Hmm, the test case is one thing, but if it's a problem, then.. how in real life one could set migration capabilities on dest qemu for cpr-transfer? Now a similar question, and also what I overlooked previously, is how cpr-transfer should support "-incoming defer". We need that because that's what Libvirt uses.. with an upcoming migrate_incoming QMP command.
On Wed, Oct 09, 2024 at 03:06:53PM -0400, Peter Xu wrote: > On Wed, Oct 09, 2024 at 02:43:44PM -0400, Steven Sistare wrote: > > On 10/8/2024 3:48 PM, Peter Xu wrote: > > > On Tue, Oct 08, 2024 at 04:11:38PM -0300, Fabiano Rosas wrote: > > > > As of half an hour ago =) We could put a feature branch up and work > > > > together, if you have more concrete thoughts on how this would look like > > > > let me know. > > > > > > [I'll hijack this thread with one more email, as this is not cpr-relevant] > > > > > > I think I listed all the things I can think of in the wiki, so please go > > > ahead. > > > > > > One trivial suggestion is we can start from the very simple, which is the > > > handshake itself, with a self-bootstrap protocol, probably feature-bit > > > based or whatever you prefer. Then we set bit 0 saying "this QEMU knows > > > how to handshake". > > > > > > Comparing to the rest requirement, IMHO we can make the channel > > > establishment the 1st feature, then it's already good for merging, having > > > feature bit 1 saying "this qemu understands named channel establishment". > > > > > > Then we add new feature bits on top of the handshake feature, by adding > > > more feature bits. Both QEMUs should first handshake on the feature bits > > > they support and enable only the subset that all support. > > > > > > Or instead of bit, feature strings, etc. would all work which you > > > prefer. Just to say we don't need to impl all the ideas there, as some of > > > them might take more time (e.g. device tree check), and that list is > > > probably not complete anyway. > > > > While writing a qtest for cpr-transfer, I discovered a problem that could be > > solved with an early migration handshake, prior to cpr_save_state / cpr_load_state. > > > > There is currently no way to set migration caps on dest qemu before starting > > cpr-transfer, because dest qemu blocks in cpr_state_load before creating any > > devices or monitors. It is unblocked after the user sends the migrate command > > to source qemu, but then the migration starts and it is too late to set migration > > capabilities or parameters on the dest. > > > > Are you OK with that restriction (for now, until a handshake is implemented)? > > If not, I have a problem. > > > > I can hack the qtest to make it work with the restriction. > > Hmm, the test case is one thing, but if it's a problem, then.. how in real > life one could set migration capabilities on dest qemu for cpr-transfer? > > Now a similar question, and also what I overlooked previously, is how > cpr-transfer should support "-incoming defer". We need that because that's > what Libvirt uses.. with an upcoming migrate_incoming QMP command. Just to share some more thoughts below.. So fundamentally the question is whether there's some way cpr can have a predictable window on dest qemu that we know QMP is ready, but before incoming migration starts. With current design, incoming side will sequentially do: (1) cpr-uri load(), (2) initialize rest of QEMU (migration, qmp, devices, etc.), (3) listen port ready, then (4) close(), aka, HUP. Looks like steps 1-4 will have no way to control when kicked off, so after cpr-uri save() data dump they'll happen in one shot. It might make sense because we assumed load() of cpr-uri is during the blackout window, and enlarge that is probably not good. But.. why do we keep cpr_state_save/load() in the blackout window? AFAIU they're mostly the fds sharing so they can happen with VM still running on src, right? I still remember the vhost/tap issue you mentioned, but I wonder whether that'll ever change the vhost/tap fd at all if we forbid any device change like what we do with normal migrations. IOW, I wonder whether we can still do the cpr_state_save/load() always during VM running (but it should still be during an ACTIVE migration, IOW, device hotplug and stuff should be forbidden, just like a live precopy phase). Iff that works, then maybe there's a way out: we can make cpr-transfer two steps: - DST: start QEMU dest the same, with -cpr-uri XXX, but now let's assume it's with -incoming defer just to give an example, and no migration capabilities applied yet. - SRC: send 'migrate' QMP command, qemu should see that cpr-transfer is enabled, so it triggers sending cpr states to destination only. It doesn't run the rest migration logic. During this stage src VM will always be running, we need to make sure migration state machine start running (perhaps NONE->SETUP_CPR) so device plug/unplug will be forbidden like what happens with generic precopy, so as to stablize fds. Just need to make sure migration_is_running() returns true. - DST: receives all cpr states. When complete, it keeps running, no HUP is needed this time, because it'll wait for another "migrate_incoming". In the case of "-incoming unix:XXX" in qemu cmdline, it'll directly go into the listen code and wait, but still we don't need the HUP because we're not in blackout window, and src won't connect automatically but requires a command later from mgmt (see below). - DST: the mgmt can send whatever QMP command to dest now, including setup incoming port, setup migration capabilities/parameters if needed. Src is still running, so it can be slow. - SRC: do the real migration with another "migrate resume=true" QMP command (I simply reused postcopy's resume flag here). This time src qemu should notice this is a continuation of cpr-transfer migration, then it moves that on (SETUP_CPR->ACTIVE), migrate RAM/device/whatever is left. Same to generic migration, until COMPLETED. Not sure whether it'll work. We'll need to still properly handle things like migrate_cancel, etc, when triggered during SETUP_CPR state, but hopefully not complicated to do..
On 10/9/2024 3:06 PM, Peter Xu wrote: > On Wed, Oct 09, 2024 at 02:43:44PM -0400, Steven Sistare wrote: >> On 10/8/2024 3:48 PM, Peter Xu wrote: >>> On Tue, Oct 08, 2024 at 04:11:38PM -0300, Fabiano Rosas wrote: >>>> As of half an hour ago =) We could put a feature branch up and work >>>> together, if you have more concrete thoughts on how this would look like >>>> let me know. >>> >>> [I'll hijack this thread with one more email, as this is not cpr-relevant] >>> >>> I think I listed all the things I can think of in the wiki, so please go >>> ahead. >>> >>> One trivial suggestion is we can start from the very simple, which is the >>> handshake itself, with a self-bootstrap protocol, probably feature-bit >>> based or whatever you prefer. Then we set bit 0 saying "this QEMU knows >>> how to handshake". >>> >>> Comparing to the rest requirement, IMHO we can make the channel >>> establishment the 1st feature, then it's already good for merging, having >>> feature bit 1 saying "this qemu understands named channel establishment". >>> >>> Then we add new feature bits on top of the handshake feature, by adding >>> more feature bits. Both QEMUs should first handshake on the feature bits >>> they support and enable only the subset that all support. >>> >>> Or instead of bit, feature strings, etc. would all work which you >>> prefer. Just to say we don't need to impl all the ideas there, as some of >>> them might take more time (e.g. device tree check), and that list is >>> probably not complete anyway. >> >> While writing a qtest for cpr-transfer, I discovered a problem that could be >> solved with an early migration handshake, prior to cpr_save_state / cpr_load_state. >> >> There is currently no way to set migration caps on dest qemu before starting >> cpr-transfer, because dest qemu blocks in cpr_state_load before creating any >> devices or monitors. It is unblocked after the user sends the migrate command >> to source qemu, but then the migration starts and it is too late to set migration >> capabilities or parameters on the dest. >> >> Are you OK with that restriction (for now, until a handshake is implemented)? >> If not, I have a problem. >> >> I can hack the qtest to make it work with the restriction. > > Hmm, the test case is one thing, but if it's a problem, then.. how in real > life one could set migration capabilities on dest qemu for cpr-transfer? You will allow it via the migration handshake! But right now, one can enable capabilities by adding -global migration.xxx=yyy on the target command line. > Now a similar question, and also what I overlooked previously, is how > cpr-transfer should support "-incoming defer". We need that because that's > what Libvirt uses.. with an upcoming migrate_incoming QMP command. Defer works. Start dest qemu, issue the migrate command to source qemu. Dest qemu finishes cpr_load_state and enters the main loop, listening for montitor commands. - Steve
On 10/9/2024 3:59 PM, Peter Xu wrote: > On Wed, Oct 09, 2024 at 03:06:53PM -0400, Peter Xu wrote: >> On Wed, Oct 09, 2024 at 02:43:44PM -0400, Steven Sistare wrote: >>> On 10/8/2024 3:48 PM, Peter Xu wrote: >>>> On Tue, Oct 08, 2024 at 04:11:38PM -0300, Fabiano Rosas wrote: >>>>> As of half an hour ago =) We could put a feature branch up and work >>>>> together, if you have more concrete thoughts on how this would look like >>>>> let me know. >>>> >>>> [I'll hijack this thread with one more email, as this is not cpr-relevant] >>>> >>>> I think I listed all the things I can think of in the wiki, so please go >>>> ahead. >>>> >>>> One trivial suggestion is we can start from the very simple, which is the >>>> handshake itself, with a self-bootstrap protocol, probably feature-bit >>>> based or whatever you prefer. Then we set bit 0 saying "this QEMU knows >>>> how to handshake". >>>> >>>> Comparing to the rest requirement, IMHO we can make the channel >>>> establishment the 1st feature, then it's already good for merging, having >>>> feature bit 1 saying "this qemu understands named channel establishment". >>>> >>>> Then we add new feature bits on top of the handshake feature, by adding >>>> more feature bits. Both QEMUs should first handshake on the feature bits >>>> they support and enable only the subset that all support. >>>> >>>> Or instead of bit, feature strings, etc. would all work which you >>>> prefer. Just to say we don't need to impl all the ideas there, as some of >>>> them might take more time (e.g. device tree check), and that list is >>>> probably not complete anyway. >>> >>> While writing a qtest for cpr-transfer, I discovered a problem that could be >>> solved with an early migration handshake, prior to cpr_save_state / cpr_load_state. >>> >>> There is currently no way to set migration caps on dest qemu before starting >>> cpr-transfer, because dest qemu blocks in cpr_state_load before creating any >>> devices or monitors. It is unblocked after the user sends the migrate command >>> to source qemu, but then the migration starts and it is too late to set migration >>> capabilities or parameters on the dest. >>> >>> Are you OK with that restriction (for now, until a handshake is implemented)? >>> If not, I have a problem. >>> >>> I can hack the qtest to make it work with the restriction. >> >> Hmm, the test case is one thing, but if it's a problem, then.. how in real >> life one could set migration capabilities on dest qemu for cpr-transfer? >> >> Now a similar question, and also what I overlooked previously, is how >> cpr-transfer should support "-incoming defer". We need that because that's >> what Libvirt uses.. with an upcoming migrate_incoming QMP command. > > Just to share some more thoughts below.. > > So fundamentally the question is whether there's some way cpr can have a > predictable window on dest qemu that we know QMP is ready, but before > incoming migration starts. > > With current design, incoming side will sequentially do: (1) cpr-uri > load(), (2) initialize rest of QEMU (migration, qmp, devices, etc.), (3) > listen port ready, then (4) close(), aka, HUP. Looks like steps 1-4 will > have no way to control when kicked off, so after cpr-uri save() data dump > they'll happen in one shot. > > It might make sense because we assumed load() of cpr-uri is during the > blackout window, and enlarge that is probably not good. > > But.. why do we keep cpr_state_save/load() in the blackout window? AFAIU > they're mostly the fds sharing so they can happen with VM still running on > src, right? > > I still remember the vhost/tap issue you mentioned, but I wonder whether > that'll ever change the vhost/tap fd at all if we forbid any device change > like what we do with normal migrations. IOW, I wonder whether we can still > do the cpr_state_save/load() always during VM running (but it should still > be during an ACTIVE migration, IOW, device hotplug and stuff should be > forbidden, just like a live precopy phase). > > Iff that works, then maybe there's a way out: we can make cpr-transfer two > steps: > > - DST: start QEMU dest the same, with -cpr-uri XXX, but now let's assume > it's with -incoming defer just to give an example, and no migration > capabilities applied yet. > > - SRC: send 'migrate' QMP command, qemu should see that cpr-transfer is > enabled, so it triggers sending cpr states to destination only. It > doesn't run the rest migration logic. > > During this stage src VM will always be running, we need to make sure > migration state machine start running (perhaps NONE->SETUP_CPR) so > device plug/unplug will be forbidden like what happens with generic > precopy, so as to stablize fds. Just need to make sure > migration_is_running() returns true. > > - DST: receives all cpr states. When complete, it keeps running, no HUP > is needed this time, because it'll wait for another "migrate_incoming". > > In the case of "-incoming unix:XXX" in qemu cmdline, it'll directly go > into the listen code and wait, but still we don't need the HUP because > we're not in blackout window, and src won't connect automatically but > requires a command later from mgmt (see below). > > - DST: the mgmt can send whatever QMP command to dest now, including > setup incoming port, setup migration capabilities/parameters if needed. > Src is still running, so it can be slow. > > - SRC: do the real migration with another "migrate resume=true" QMP > command (I simply reused postcopy's resume flag here). This time src > qemu should notice this is a continuation of cpr-transfer migration, > then it moves that on (SETUP_CPR->ACTIVE), migrate RAM/device/whatever > is left. Same to generic migration, until COMPLETED. > > Not sure whether it'll work. We'll need to still properly handle things > like migrate_cancel, etc, when triggered during SETUP_CPR state, but > hopefully not complicated to do.. Yes, I am also brainstorming along these lines, looking for more gotcha's, but its a big design change. I don't love it so far. These issues all creep in because of transfer mode. Exec mode did not have this problem, as cpr-state is written to an in-memory file. - Steve
On Wed, Oct 09, 2024 at 04:09:45PM -0400, Steven Sistare wrote: > On 10/9/2024 3:06 PM, Peter Xu wrote: > > On Wed, Oct 09, 2024 at 02:43:44PM -0400, Steven Sistare wrote: > > > On 10/8/2024 3:48 PM, Peter Xu wrote: > > > > On Tue, Oct 08, 2024 at 04:11:38PM -0300, Fabiano Rosas wrote: > > > > > As of half an hour ago =) We could put a feature branch up and work > > > > > together, if you have more concrete thoughts on how this would look like > > > > > let me know. > > > > > > > > [I'll hijack this thread with one more email, as this is not cpr-relevant] > > > > > > > > I think I listed all the things I can think of in the wiki, so please go > > > > ahead. > > > > > > > > One trivial suggestion is we can start from the very simple, which is the > > > > handshake itself, with a self-bootstrap protocol, probably feature-bit > > > > based or whatever you prefer. Then we set bit 0 saying "this QEMU knows > > > > how to handshake". > > > > > > > > Comparing to the rest requirement, IMHO we can make the channel > > > > establishment the 1st feature, then it's already good for merging, having > > > > feature bit 1 saying "this qemu understands named channel establishment". > > > > > > > > Then we add new feature bits on top of the handshake feature, by adding > > > > more feature bits. Both QEMUs should first handshake on the feature bits > > > > they support and enable only the subset that all support. > > > > > > > > Or instead of bit, feature strings, etc. would all work which you > > > > prefer. Just to say we don't need to impl all the ideas there, as some of > > > > them might take more time (e.g. device tree check), and that list is > > > > probably not complete anyway. > > > > > > While writing a qtest for cpr-transfer, I discovered a problem that could be > > > solved with an early migration handshake, prior to cpr_save_state / cpr_load_state. > > > > > > There is currently no way to set migration caps on dest qemu before starting > > > cpr-transfer, because dest qemu blocks in cpr_state_load before creating any > > > devices or monitors. It is unblocked after the user sends the migrate command > > > to source qemu, but then the migration starts and it is too late to set migration > > > capabilities or parameters on the dest. > > > > > > Are you OK with that restriction (for now, until a handshake is implemented)? > > > If not, I have a problem. > > > > > > I can hack the qtest to make it work with the restriction. > > > > Hmm, the test case is one thing, but if it's a problem, then.. how in real > > life one could set migration capabilities on dest qemu for cpr-transfer? > > You will allow it via the migration handshake! > But right now, one can enable capabilities by adding -global migration.xxx=yyy > on the target command line. Those are for debugging only, so we shouldn't suggest them to be used in production.. at least not the plan. Yeah, handshake would make it work. But it's not yet there.. :( > > > Now a similar question, and also what I overlooked previously, is how > > cpr-transfer should support "-incoming defer". We need that because that's > > what Libvirt uses.. with an upcoming migrate_incoming QMP command. > > Defer works. Start dest qemu, issue the migrate command to source qemu. > Dest qemu finishes cpr_load_state and enters the main loop, listening for > montitor commands. Ahh yes, the HUP works with this case too, that's OK. What's your thoughts in the other email I wrote? That'll make QMP available in general on dest, if I read it right. But yeah I think this issue is not a blocker now at least, so I'm just curious whether that's still useful. We may still want to understand one question I raised elsewhere on whether cpr state save/load must be done during vm stopped. If so, then it means Libvirt will only go with "defer", and QMP set-capabilities might be accounted as downtime there which can be unfortunate.. Basically, it means if we can still drop patch 4 completely (while the vhost notifiers can exist in the future, but hopefully not dependent on patch 4).
On Wed, Oct 09, 2024 at 04:18:31PM -0400, Steven Sistare wrote: > Yes, I am also brainstorming along these lines, looking for more gotcha's, > but its a big design change. I don't love it so far. > > These issues all creep in because of transfer mode. Exec mode did not have this > problem, as cpr-state is written to an in-memory file. I understand. Hopefully we're getting there very soon.. I still have concern on having -global used in productions, and meanwhile it might still be challenging for handshake to work as early as cpr stage even for later, because at least in my mind the handshake still happens in the main migration channel (where it includes channel establishments etc, which is not proper during cpr stage). I don't really know whether that'll work at last.. So in my mind the previous two-steps proposal is so far the only one that all seem to work, with no unpredictable side effects. Said that, maybe we can still think about simpler solutions in the following days or see others opinions, we don't need to make a decision today, so maybe there's still better way to go.
Peter Xu <peterx@redhat.com> writes: > On Wed, Oct 09, 2024 at 04:18:31PM -0400, Steven Sistare wrote: >> Yes, I am also brainstorming along these lines, looking for more gotcha's, >> but its a big design change. I don't love it so far. >> >> These issues all creep in because of transfer mode. Exec mode did not have this >> problem, as cpr-state is written to an in-memory file. > > I understand. Hopefully we're getting there very soon.. > > I still have concern on having -global used in productions, and meanwhile Agree, but for qtests it should be fine at least. > it might still be challenging for handshake to work as early as cpr stage > even for later, because at least in my mind the handshake still happens in > the main migration channel (where it includes channel establishments etc, > which is not proper during cpr stage). I don't think any form of handshake will be implemented in a month. Maybe it's best we keep that to the side for now. (still, thinking about that virtio-net USO thread, an early handshake could be a good idea, so we could perhaps inform about device incompatibility, etc.) > > I don't really know whether that'll work at last.. > > So in my mind the previous two-steps proposal is so far the only one that > all seem to work, with no unpredictable side effects. > > Said that, maybe we can still think about simpler solutions in the > following days or see others opinions, we don't need to make a decision > today, so maybe there's still better way to go. I thought of putting the caps on the configuration vmstate and using that to set them on the destination, but there's a bit of a chicken and egg problem because we need capabilities set as soon as qemu_start_incoming_migration(). Unless we sent those via the cpr channel. We could split migration_object_init() a bit so we can instantiate some parts of the migration state earlier (I'm not even sure what are the actual dependencies are).
On 10/9/2024 6:08 PM, Fabiano Rosas wrote: > Peter Xu <peterx@redhat.com> writes: > >> On Wed, Oct 09, 2024 at 04:18:31PM -0400, Steven Sistare wrote: >>> Yes, I am also brainstorming along these lines, looking for more gotcha's, >>> but its a big design change. I don't love it so far. >>> >>> These issues all creep in because of transfer mode. Exec mode did not have this >>> problem, as cpr-state is written to an in-memory file. >> >> I understand. Hopefully we're getting there very soon.. >> >> I still have concern on having -global used in productions, and meanwhile > > Agree, but for qtests it should be fine at least. > >> it might still be challenging for handshake to work as early as cpr stage >> even for later, because at least in my mind the handshake still happens in >> the main migration channel (where it includes channel establishments etc, >> which is not proper during cpr stage). > > I don't think any form of handshake will be implemented in a > month. Maybe it's best we keep that to the side for now. Agreed, and a handshake in the main migration channel, which would be the cleanest to implement, occurs too late. We should not rely on it to solve this cpr problem. I have a new proposal which I will send in a new thread. - Steve > (still, thinking about that virtio-net USO thread, an early handshake > could be a good idea, so we could perhaps inform about device > incompatibility, etc.) > >> >> I don't really know whether that'll work at last.. >> >> So in my mind the previous two-steps proposal is so far the only one that >> all seem to work, with no unpredictable side effects. >> >> Said that, maybe we can still think about simpler solutions in the >> following days or see others opinions, we don't need to make a decision >> today, so maybe there's still better way to go. > > I thought of putting the caps on the configuration vmstate and using > that to set them on the destination, but there's a bit of a chicken and > egg problem because we need capabilities set as soon as > qemu_start_incoming_migration(). Unless we sent those via the cpr > channel. We could split migration_object_init() a bit so we can > instantiate some parts of the migration state earlier (I'm not even sure > what are the actual dependencies are).
On 10/9/2024 4:36 PM, Peter Xu wrote: > On Wed, Oct 09, 2024 at 04:09:45PM -0400, Steven Sistare wrote: >> On 10/9/2024 3:06 PM, Peter Xu wrote: >>> On Wed, Oct 09, 2024 at 02:43:44PM -0400, Steven Sistare wrote: >>>> On 10/8/2024 3:48 PM, Peter Xu wrote: >>>>> On Tue, Oct 08, 2024 at 04:11:38PM -0300, Fabiano Rosas wrote: >>>>>> As of half an hour ago =) We could put a feature branch up and work >>>>>> together, if you have more concrete thoughts on how this would look like >>>>>> let me know. >>>>> >>>>> [I'll hijack this thread with one more email, as this is not cpr-relevant] >>>>> >>>>> I think I listed all the things I can think of in the wiki, so please go >>>>> ahead. >>>>> >>>>> One trivial suggestion is we can start from the very simple, which is the >>>>> handshake itself, with a self-bootstrap protocol, probably feature-bit >>>>> based or whatever you prefer. Then we set bit 0 saying "this QEMU knows >>>>> how to handshake". >>>>> >>>>> Comparing to the rest requirement, IMHO we can make the channel >>>>> establishment the 1st feature, then it's already good for merging, having >>>>> feature bit 1 saying "this qemu understands named channel establishment". >>>>> >>>>> Then we add new feature bits on top of the handshake feature, by adding >>>>> more feature bits. Both QEMUs should first handshake on the feature bits >>>>> they support and enable only the subset that all support. >>>>> >>>>> Or instead of bit, feature strings, etc. would all work which you >>>>> prefer. Just to say we don't need to impl all the ideas there, as some of >>>>> them might take more time (e.g. device tree check), and that list is >>>>> probably not complete anyway. >>>> >>>> While writing a qtest for cpr-transfer, I discovered a problem that could be >>>> solved with an early migration handshake, prior to cpr_save_state / cpr_load_state. >>>> >>>> There is currently no way to set migration caps on dest qemu before starting >>>> cpr-transfer, because dest qemu blocks in cpr_state_load before creating any >>>> devices or monitors. It is unblocked after the user sends the migrate command >>>> to source qemu, but then the migration starts and it is too late to set migration >>>> capabilities or parameters on the dest. >>>> >>>> Are you OK with that restriction (for now, until a handshake is implemented)? >>>> If not, I have a problem. >>>> >>>> I can hack the qtest to make it work with the restriction. >>> >>> Hmm, the test case is one thing, but if it's a problem, then.. how in real >>> life one could set migration capabilities on dest qemu for cpr-transfer? >> >> You will allow it via the migration handshake! >> But right now, one can enable capabilities by adding -global migration.xxx=yyy >> on the target command line. > > Those are for debugging only, so we shouldn't suggest them to be used in > production.. at least not the plan. > > Yeah, handshake would make it work. But it's not yet there.. :( > >> >>> Now a similar question, and also what I overlooked previously, is how >>> cpr-transfer should support "-incoming defer". We need that because that's >>> what Libvirt uses.. with an upcoming migrate_incoming QMP command. >> >> Defer works. Start dest qemu, issue the migrate command to source qemu. >> Dest qemu finishes cpr_load_state and enters the main loop, listening for >> montitor commands. > > Ahh yes, the HUP works with this case too, that's OK. Defer works, but it is backwards. I believe the managers would typically send monitor configuration commands to the dest first, then send the migrate command to the source. Backwards is weird. My new proposal addresses this. > What's your thoughts in the other email I wrote? That'll make QMP > available in general on dest, if I read it right. But yeah I think this > issue is not a blocker now at least, so I'm just curious whether that's > still useful. > > We may still want to understand one question I raised elsewhere on whether > cpr state save/load must be done during vm stopped. If so, then it means > Libvirt will only go with "defer", and QMP set-capabilities might be > accounted as downtime there which can be unfortunate.. Basically, it means > if we can still drop patch 4 completely (while the vhost notifiers can > exist in the future, but hopefully not dependent on patch 4). vhost requires us to stop the vm early: qmp_migrate stop vm migration_call_notifiers MIG_EVENT_PRECOPY_CPR_SETUP vhost_cpr_notifier vhost_reset_device - must be after stop vm - and before new qemu inits devices cpr_state_save unblocks new qemu which inits devices and calls vhost_set_owner Thus config commands must be sent to the target during the guest pause interval :( My new proposal addresses this. - Steve
On Thu, Oct 10, 2024 at 04:06:13PM -0400, Steven Sistare wrote: > vhost requires us to stop the vm early: > qmp_migrate > stop vm > migration_call_notifiers MIG_EVENT_PRECOPY_CPR_SETUP > vhost_cpr_notifier > vhost_reset_device - must be after stop vm > - and before new qemu inits devices > cpr_state_save > unblocks new qemu which inits devices and calls vhost_set_owner > > Thus config commands must be sent to the target during the guest pause interval :( I can understand it needs VM stopped, but it can still happen after cpr_save(), am I right (IOW, fd wont change in the notifier)? I meant below sequence: - src: cpr_save(), when running, NONE->SETUP_CPR, all fds synced - [whatever happens..] - src: finally decide to switchover, vm stop - vhost notifier invoked. PS: it doesn't require to be named SETUP_CPR notifiers here, but something else.. > > My new proposal addresses this. Yes, we can discuss that first.
On 10/10/2024 5:23 PM, Peter Xu wrote: > On Thu, Oct 10, 2024 at 04:06:13PM -0400, Steven Sistare wrote: >> vhost requires us to stop the vm early: >> qmp_migrate >> stop vm >> migration_call_notifiers MIG_EVENT_PRECOPY_CPR_SETUP >> vhost_cpr_notifier >> vhost_reset_device - must be after stop vm >> - and before new qemu inits devices >> cpr_state_save >> unblocks new qemu which inits devices and calls vhost_set_owner >> >> Thus config commands must be sent to the target during the guest pause interval :( > > I can understand it needs VM stopped, but it can still happen after > cpr_save(), am I right (IOW, fd wont change in the notifier)? I meant > below sequence: > > - src: cpr_save(), when running, NONE->SETUP_CPR, all fds synced > > - [whatever happens..] > > - src: finally decide to switchover, vm stop > > - vhost notifier invoked. PS: it doesn't require to be named SETUP_CPR > notifiers here, but something else.. The problem is that the first step, cpr_save, causes the dest to finish cpr_load_state and proceed to initialize devices in qemu_create_late_backends -> net_init_clients. This calls ioctl VHOST_SET_OWNER which fails because the device is still owned by src qemu. src qemu releases ownership via VHOST_RESET_OWNER in the vhost notifier. Thus the guest must be paused while config commands are sent to the target. We could avoid that with any of: * do not issue config commands * precreate phase * cpr-exec mode * only pause if vhost is present. (eg no pause for vfio). - Steve
On Thu, Oct 24, 2024 at 05:12:05PM -0400, Steven Sistare wrote: > On 10/10/2024 5:23 PM, Peter Xu wrote: > > On Thu, Oct 10, 2024 at 04:06:13PM -0400, Steven Sistare wrote: > > > vhost requires us to stop the vm early: > > > qmp_migrate > > > stop vm > > > migration_call_notifiers MIG_EVENT_PRECOPY_CPR_SETUP > > > vhost_cpr_notifier > > > vhost_reset_device - must be after stop vm > > > - and before new qemu inits devices > > > cpr_state_save > > > unblocks new qemu which inits devices and calls vhost_set_owner > > > > > > Thus config commands must be sent to the target during the guest pause interval :( > > > > I can understand it needs VM stopped, but it can still happen after > > cpr_save(), am I right (IOW, fd wont change in the notifier)? I meant > > below sequence: > > > > - src: cpr_save(), when running, NONE->SETUP_CPR, all fds synced > > > > - [whatever happens..] > > > > - src: finally decide to switchover, vm stop > > > > - vhost notifier invoked. PS: it doesn't require to be named SETUP_CPR > > notifiers here, but something else.. > > The problem is that the first step, cpr_save, causes the dest to finish cpr_load_state > and proceed to initialize devices in qemu_create_late_backends -> net_init_clients. > This calls ioctl VHOST_SET_OWNER which fails because the device is still owned by src qemu. > > src qemu releases ownership via VHOST_RESET_OWNER in the vhost notifier. I think the block drives have similar issue before on ownership when disk is shared on both sides, and that ownership was only passed over to dest until switchover, rather than dest qemu init. In the CPR routines it'll be also during switchover rather than cpr_save(). Maybe it's just harder for vhost, as I assume vhost was never designed to work with using in shared mode. Otherwise logically the net_init_clients() could do the rest initialization, but provide a facility to SET_OWNER at a later point. I'm not sure if it's possible. For block it could be easier, IIRC it was mostly about the file lock and who owns it (e.g. on a NFS share, to make sure no concurrent writters to corrupt the file). > > Thus the guest must be paused while config commands are sent to the target. > We could avoid that with any of: > * do not issue config commands > * precreate phase > * cpr-exec mode > * only pause if vhost is present. (eg no pause for vfio). OK. I hope precreate will work out if that can solve this too. Thanks,
On 10/25/2024 9:55 AM, Peter Xu wrote: > On Thu, Oct 24, 2024 at 05:12:05PM -0400, Steven Sistare wrote: >> On 10/10/2024 5:23 PM, Peter Xu wrote: >>> On Thu, Oct 10, 2024 at 04:06:13PM -0400, Steven Sistare wrote: >>>> vhost requires us to stop the vm early: >>>> qmp_migrate >>>> stop vm >>>> migration_call_notifiers MIG_EVENT_PRECOPY_CPR_SETUP >>>> vhost_cpr_notifier >>>> vhost_reset_device - must be after stop vm >>>> - and before new qemu inits devices >>>> cpr_state_save >>>> unblocks new qemu which inits devices and calls vhost_set_owner >>>> >>>> Thus config commands must be sent to the target during the guest pause interval :( >>> >>> I can understand it needs VM stopped, but it can still happen after >>> cpr_save(), am I right (IOW, fd wont change in the notifier)? I meant >>> below sequence: >>> >>> - src: cpr_save(), when running, NONE->SETUP_CPR, all fds synced >>> >>> - [whatever happens..] >>> >>> - src: finally decide to switchover, vm stop >>> >>> - vhost notifier invoked. PS: it doesn't require to be named SETUP_CPR >>> notifiers here, but something else.. >> >> The problem is that the first step, cpr_save, causes the dest to finish cpr_load_state >> and proceed to initialize devices in qemu_create_late_backends -> net_init_clients. >> This calls ioctl VHOST_SET_OWNER which fails because the device is still owned by src qemu. >> >> src qemu releases ownership via VHOST_RESET_OWNER in the vhost notifier. > > I think the block drives have similar issue before on ownership when disk > is shared on both sides, and that ownership was only passed over to dest > until switchover, rather than dest qemu init. In the CPR routines it'll be > also during switchover rather than cpr_save(). > > Maybe it's just harder for vhost, as I assume vhost was never designed to > work with using in shared mode. Otherwise logically the net_init_clients() > could do the rest initialization, but provide a facility to SET_OWNER at a > later point. I'm not sure if it's possible. net_init_clients cannot do any initialization that issues vhost ioctls, because the dest process does not yet own the vhost device. - Steve > For block it could be easier, IIRC it was mostly about the file lock and > who owns it (e.g. on a NFS share, to make sure no concurrent writters to > corrupt the file). > >> >> Thus the guest must be paused while config commands are sent to the target. >> We could avoid that with any of: >> * do not issue config commands >> * precreate phase >> * cpr-exec mode >> * only pause if vhost is present. (eg no pause for vfio). > > OK. I hope precreate will work out if that can solve this too. > > Thanks, >
diff --git a/include/migration/cpr.h b/include/migration/cpr.h index e886c98..5cd373f 100644 --- a/include/migration/cpr.h +++ b/include/migration/cpr.h @@ -30,6 +30,7 @@ int cpr_state_save(Error **errp); int cpr_state_load(Error **errp); void cpr_state_close(void); struct QIOChannel *cpr_state_ioc(void); +bool cpr_needed_for_reuse(void *opaque); QEMUFile *cpr_transfer_output(const char *uri, Error **errp); QEMUFile *cpr_transfer_input(const char *uri, Error **errp); diff --git a/migration/cpr.c b/migration/cpr.c index 86f66c1..911b556 100644 --- a/migration/cpr.c +++ b/migration/cpr.c @@ -9,6 +9,7 @@ #include "qapi/error.h" #include "migration/cpr.h" #include "migration/misc.h" +#include "migration/options.h" #include "migration/qemu-file.h" #include "migration/savevm.h" #include "migration/vmstate.h" @@ -57,7 +58,7 @@ static const VMStateDescription vmstate_cpr_fd = { VMSTATE_UINT32(namelen, CprFd), VMSTATE_VBUFFER_ALLOC_UINT32(name, CprFd, 0, NULL, namelen), VMSTATE_INT32(id, CprFd), - VMSTATE_INT32(fd, CprFd), + VMSTATE_FD(fd, CprFd), VMSTATE_END_OF_LIST() } }; @@ -174,9 +175,16 @@ int cpr_state_save(Error **errp) { int ret; QEMUFile *f; + MigMode mode = migrate_mode(); - /* set f based on mode in a later patch in this series */ - return 0; + if (mode == MIG_MODE_CPR_TRANSFER) { + f = cpr_transfer_output(migrate_cpr_uri(), errp); + } else { + return 0; + } + if (!f) { + return -1; + } qemu_put_be32(f, QEMU_CPR_FILE_MAGIC); qemu_put_be32(f, QEMU_CPR_FILE_VERSION); @@ -205,8 +213,18 @@ int cpr_state_load(Error **errp) uint32_t v; QEMUFile *f; - /* set f based on mode in a later patch in this series */ - return 0; + /* + * Mode will be loaded in CPR state, so cannot use it to decide which + * form of state to load. + */ + if (cpr_uri) { + f = cpr_transfer_input(cpr_uri, errp); + } else { + return 0; + } + if (!f) { + return -1; + } v = qemu_get_be32(f); if (v != QEMU_CPR_FILE_MAGIC) { @@ -243,3 +261,9 @@ void cpr_state_close(void) cpr_state_file = NULL; } } + +bool cpr_needed_for_reuse(void *opaque) +{ + MigMode mode = migrate_mode(); + return mode == MIG_MODE_CPR_TRANSFER; +} diff --git a/migration/migration.c b/migration/migration.c index 3301583..73b85aa 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -76,6 +76,7 @@ static NotifierWithReturnList migration_state_notifiers[] = { NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL), NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT), + NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_TRANSFER), }; /* Messages sent on the return path from destination to source */ @@ -109,6 +110,7 @@ static int migration_maybe_pause(MigrationState *s, static void migrate_fd_cancel(MigrationState *s); static bool close_return_path_on_source(MigrationState *s); static void migration_completion_end(MigrationState *s); +static void migrate_hup_delete(MigrationState *s); static void migration_downtime_start(MigrationState *s) { @@ -204,6 +206,12 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, return false; } + if (migrate_mode() == MIG_MODE_CPR_TRANSFER && + addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { + error_setg(errp, "Migration requires streamable transport (eg unix)"); + return false; + } + return true; } @@ -316,6 +324,7 @@ void migration_cancel(const Error *error) qmp_cancel_vcpu_dirty_limit(false, -1, NULL); } migrate_fd_cancel(current_migration); + migrate_hup_delete(current_migration); } void migration_shutdown(void) @@ -718,6 +727,9 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, } else { error_setg(errp, "unknown migration protocol: %s", uri); } + + /* Close cpr socket to tell source that we are listening */ + cpr_state_close(); } static void process_incoming_migration_bh(void *opaque) @@ -1414,6 +1426,8 @@ static void migrate_fd_cleanup(MigrationState *s) s->vmdesc = NULL; qemu_savevm_state_cleanup(); + cpr_state_close(); + migrate_hup_delete(s); close_return_path_on_source(s); @@ -1698,7 +1712,9 @@ bool migration_thread_is_self(void) bool migrate_mode_is_cpr(MigrationState *s) { - return s->parameters.mode == MIG_MODE_CPR_REBOOT; + MigMode mode = s->parameters.mode; + return mode == MIG_MODE_CPR_REBOOT || + mode == MIG_MODE_CPR_TRANSFER; } int migrate_init(MigrationState *s, Error **errp) @@ -2033,6 +2049,12 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) return false; } + if (migrate_mode() == MIG_MODE_CPR_TRANSFER && + !s->parameters.cpr_uri) { + error_setg(errp, "cpr-transfer mode requires setting cpr-uri"); + return false; + } + if (migration_is_blocked(errp)) { return false; } @@ -2076,6 +2098,37 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) static void qmp_migrate_finish(MigrationAddress *addr, bool resume_requested, Error **errp); +static void migrate_hup_add(MigrationState *s, QIOChannel *ioc, GSourceFunc cb, + void *opaque) +{ + s->hup_source = qio_channel_create_watch(ioc, G_IO_HUP); + g_source_set_callback(s->hup_source, cb, opaque, NULL); + g_source_attach(s->hup_source, NULL); +} + +static void migrate_hup_delete(MigrationState *s) +{ + if (s->hup_source) { + g_source_destroy(s->hup_source); + g_source_unref(s->hup_source); + s->hup_source = NULL; + } +} + +static gboolean qmp_migrate_finish_cb(QIOChannel *channel, + GIOCondition cond, + void *opaque) +{ + MigrationAddress *addr = opaque; + + qmp_migrate_finish(addr, false, NULL); + + cpr_state_close(); + migrate_hup_delete(migrate_get_current()); + qapi_free_MigrationAddress(addr); + return G_SOURCE_REMOVE; +} + void qmp_migrate(const char *uri, bool has_channels, MigrationChannelList *channels, bool has_detach, bool detach, bool has_resume, bool resume, Error **errp) @@ -2136,7 +2189,19 @@ void qmp_migrate(const char *uri, bool has_channels, goto out; } - qmp_migrate_finish(addr, resume_requested, errp); + /* + * For cpr-transfer, the target may not be listening yet on the migration + * channel, because first it must finish cpr_load_state. The target tells + * us it is listening by closing the cpr-state socket. Wait for that HUP + * event before connecting in qmp_migrate_finish. + */ + if (s->parameters.mode == MIG_MODE_CPR_TRANSFER) { + migrate_hup_add(s, cpr_state_ioc(), (GSourceFunc)qmp_migrate_finish_cb, + QAPI_CLONE(MigrationAddress, addr)); + + } else { + qmp_migrate_finish(addr, resume_requested, errp); + } out: if (local_err) { diff --git a/migration/migration.h b/migration/migration.h index 38aa140..74c167b 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -457,6 +457,8 @@ struct MigrationState { bool switchover_acked; /* Is this a rdma migration */ bool rdma_migration; + + GSource *hup_source; }; void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, diff --git a/migration/ram.c b/migration/ram.c index 81eda27..e2cef50 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -216,7 +216,9 @@ static bool postcopy_preempt_active(void) bool migrate_ram_is_ignored(RAMBlock *block) { + MigMode mode = migrate_mode(); return !qemu_ram_is_migratable(block) || + mode == MIG_MODE_CPR_TRANSFER || (migrate_ignore_shared() && qemu_ram_is_shared(block) && qemu_ram_is_named_file(block)); } diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c index 6e45a4a..b5a55b8 100644 --- a/migration/vmstate-types.c +++ b/migration/vmstate-types.c @@ -15,6 +15,7 @@ #include "qemu-file.h" #include "migration.h" #include "migration/vmstate.h" +#include "migration/client-options.h" #include "qemu/error-report.h" #include "qemu/queue.h" #include "trace.h" @@ -321,7 +322,7 @@ static int get_fd(QEMUFile *f, void *pv, size_t size, { int32_t *v = pv; qemu_get_sbe32s(f, v); - if (*v < 0) { + if (*v < 0 || migrate_mode() != MIG_MODE_CPR_TRANSFER) { return 0; } *v = qemu_file_get_fd(f); @@ -334,7 +335,7 @@ static int put_fd(QEMUFile *f, void *pv, size_t size, int32_t *v = pv; qemu_put_sbe32s(f, v); - if (*v < 0) { + if (*v < 0 || migrate_mode() != MIG_MODE_CPR_TRANSFER) { return 0; } return qemu_file_put_fd(f, *v); diff --git a/qapi/migration.json b/qapi/migration.json index c0d8bcc..f51b4cb 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -611,9 +611,34 @@ # or COLO. # # (since 8.2) +# +# @cpr-transfer: This mode allows the user to transfer a guest to a +# new QEMU instance on the same host with minimal guest pause +# time, by preserving guest RAM in place, albeit with new virtual +# addresses in new QEMU. +# +# The user starts new QEMU on the same host as old QEMU, with the +# the same arguments as old QEMU, plus the -incoming option. The +# user issues the migrate command to old QEMU, which stops the VM, +# saves state to the migration channels, and enters the +# postmigrate state. Execution resumes in new QEMU. Guest RAM is +# preserved in place, albeit with new virtual addresses in new +# QEMU. The incoming migration channel cannot be a file type. +# +# This mode requires a second migration channel, specified by the +# cpr-uri migration property on the outgoing side, and by +# the cpr-uri QEMU command-line option on the incoming +# side. The channel must be a type, such as unix socket, that +# supports SCM_RIGHTS. +# +# Memory-backend objects must have the share=on attribute, but +# memory-backend-epc is not supported. The VM must be started +# with the '-machine anon-alloc=memfd' option. +# +# (since 9.2) ## { 'enum': 'MigMode', - 'data': [ 'normal', 'cpr-reboot' ] } + 'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] } ## # @ZeroPageDetection: diff --git a/stubs/vmstate.c b/stubs/vmstate.c index 8513d92..c190762 100644 --- a/stubs/vmstate.c +++ b/stubs/vmstate.c @@ -1,5 +1,7 @@ #include "qemu/osdep.h" #include "migration/vmstate.h" +#include "qapi/qapi-types-migration.h" +#include "migration/client-options.h" int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id, @@ -21,3 +23,8 @@ bool vmstate_check_only_migratable(const VMStateDescription *vmsd) { return true; } + +MigMode migrate_mode(void) +{ + return MIG_MODE_NORMAL; +}
Add the cpr-transfer migration mode. Usage: qemu-system-$arch -machine anon-alloc=memfd ... start new QEMU with "-incoming <uri-1> -cpr-uri <uri-2>" Issue commands to old QEMU: migrate_set_parameter mode cpr-transfer migrate_set_parameter cpr-uri <uri-2> migrate -d <uri-1> The migrate command stops the VM, saves CPR state to uri-2, saves normal migration state to uri-1, and old QEMU enters the postmigrate state. The user starts new QEMU on the same host as old QEMU, with the same arguments as old QEMU, plus the -incoming option. Guest RAM is preserved in place, albeit with new virtual addresses in new QEMU. This mode requires a second migration channel, specified by the cpr-uri migration property on the outgoing side, and by the cpr-uri QEMU command-line option on the incoming side. The channel must be a type, such as unix socket, that supports SCM_RIGHTS. Memory-backend objects must have the share=on attribute, but memory-backend-epc is not supported. The VM must be started with the '-machine anon-alloc=memfd' option, which allows anonymous memory to be transferred in place to the new process. The memfds are kept open by sending the descriptors to new QEMU via the cpr-uri, which must support SCM_RIGHTS, and they are mmap'd in new QEMU. Signed-off-by: Steve Sistare <steven.sistare@oracle.com> --- include/migration/cpr.h | 1 + migration/cpr.c | 34 +++++++++++++++++++---- migration/migration.c | 69 +++++++++++++++++++++++++++++++++++++++++++++-- migration/migration.h | 2 ++ migration/ram.c | 2 ++ migration/vmstate-types.c | 5 ++-- qapi/migration.json | 27 ++++++++++++++++++- stubs/vmstate.c | 7 +++++ 8 files changed, 137 insertions(+), 10 deletions(-)