Message ID | 20210121152458.193248-5-andrey.gruzdev@virtuozzo.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | UFFD write-tracking migration/snapshots | expand |
* Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: > Introducing implementation of 'background' snapshot thread > which in overall follows the logic of precopy migration > while internally utilizes completely different mechanism > to 'freeze' vmstate at the start of snapshot creation. > > This mechanism is based on userfault_fd with wr-protection > support and is Linux-specific. > > Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> > Acked-by: Peter Xu <peterx@redhat.com> > Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> Thanks for fixing the thread name; I've just noticed something though: > --- > migration/migration.c | 263 ++++++++++++++++++++++++++++++++++++++++-- > migration/migration.h | 3 + > migration/savevm.c | 1 - > migration/savevm.h | 2 + > 4 files changed, 258 insertions(+), 11 deletions(-) > > diff --git a/migration/migration.c b/migration/migration.c > index f018337fbc..869afa7a86 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -1130,7 +1130,6 @@ static void fill_source_migration_info(MigrationInfo *info) > info->status = s->state; > } > > -#ifdef CONFIG_LINUX > typedef enum WriteTrackingSupport { > WT_SUPPORT_UNKNOWN = 0, > WT_SUPPORT_ABSENT, > @@ -1155,7 +1154,6 @@ WriteTrackingSupport migrate_query_write_tracking(void) > > return WT_SUPPORT_COMPATIBLE; > } > -#endif /* CONFIG_LINUX */ Why is this 4/5 patch removing these ifdef's (including the one below); since it's the ram_write_tracking_available that lets you know if you can do it, and that's got ifdef's, why do these lines go in at all? 
Dave > /** > * @migration_caps_check - check capability validity > @@ -1219,7 +1217,6 @@ static bool migrate_caps_check(bool *cap_list, > } > > if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) { > -#ifdef CONFIG_LINUX > WriteTrackingSupport wt_support; > int idx; > /* > @@ -1250,11 +1247,6 @@ static bool migrate_caps_check(bool *cap_list, > return false; > } > } > -#else > - error_setg(errp, > - "Background-snapshot is not supported on non-Linux hosts"); > - return false; > -#endif > } > > return true; > @@ -2016,6 +2008,7 @@ void migrate_init(MigrationState *s) > * locks. > */ > s->cleanup_bh = 0; > + s->vm_start_bh = 0; > s->to_dst_file = NULL; > s->state = MIGRATION_STATUS_NONE; > s->rp_state.from_dst_file = NULL; > @@ -3233,6 +3226,50 @@ fail: > MIGRATION_STATUS_FAILED); > } > > +/** > + * bg_migration_completion: Used by bg_migration_thread when after all the > + * RAM has been saved. The caller 'breaks' the loop when this returns. > + * > + * @s: Current migration state > + */ > +static void bg_migration_completion(MigrationState *s) > +{ > + int current_active_state = s->state; > + > + /* > + * Stop tracking RAM writes - un-protect memory, un-register UFFD > + * memory ranges, flush kernel wait queues and wake up threads > + * waiting for write fault to be resolved. > + */ > + ram_write_tracking_stop(); > + > + if (s->state == MIGRATION_STATUS_ACTIVE) { > + /* > + * By this moment we have RAM content saved into the migration stream. > + * The next step is to flush the non-RAM content (device state) > + * right after the ram content. The device state has been stored into > + * the temporary buffer before RAM saving started. 
> + */ > + qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage); > + qemu_fflush(s->to_dst_file); > + } else if (s->state == MIGRATION_STATUS_CANCELLING) { > + goto fail; > + } > + > + if (qemu_file_get_error(s->to_dst_file)) { > + trace_migration_completion_file_err(); > + goto fail; > + } > + > + migrate_set_state(&s->state, current_active_state, > + MIGRATION_STATUS_COMPLETED); > + return; > + > +fail: > + migrate_set_state(&s->state, current_active_state, > + MIGRATION_STATUS_FAILED); > +} > + > bool migrate_colo_enabled(void) > { > MigrationState *s = migrate_get_current(); > @@ -3573,6 +3610,47 @@ static void migration_iteration_finish(MigrationState *s) > qemu_mutex_unlock_iothread(); > } > > +static void bg_migration_iteration_finish(MigrationState *s) > +{ > + qemu_mutex_lock_iothread(); > + switch (s->state) { > + case MIGRATION_STATUS_COMPLETED: > + migration_calculate_complete(s); > + break; > + > + case MIGRATION_STATUS_ACTIVE: > + case MIGRATION_STATUS_FAILED: > + case MIGRATION_STATUS_CANCELLED: > + case MIGRATION_STATUS_CANCELLING: > + break; > + > + default: > + /* Should not reach here, but if so, forgive the VM. */ > + error_report("%s: Unknown ending state %d", __func__, s->state); > + break; > + } > + > + migrate_fd_cleanup_schedule(s); > + qemu_mutex_unlock_iothread(); > +} > + > +/* > + * Return true if continue to the next iteration directly, false > + * otherwise. 
> + */ > +static MigIterateState bg_migration_iteration_run(MigrationState *s) > +{ > + int res; > + > + res = qemu_savevm_state_iterate(s->to_dst_file, false); > + if (res > 0) { > + bg_migration_completion(s); > + return MIG_ITERATE_BREAK; > + } > + > + return MIG_ITERATE_RESUME; > +} > + > void migration_make_urgent_request(void) > { > qemu_sem_post(&migrate_get_current()->rate_limit_sem); > @@ -3720,6 +3798,165 @@ static void *migration_thread(void *opaque) > return NULL; > } > > +static void bg_migration_vm_start_bh(void *opaque) > +{ > + MigrationState *s = opaque; > + > + qemu_bh_delete(s->vm_start_bh); > + s->vm_start_bh = NULL; > + > + vm_start(); > + s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start; > +} > + > +/** > + * Background snapshot thread, based on live migration code. > + * This is an alternative implementation of live migration mechanism > + * introduced specifically to support background snapshots. > + * > + * It takes advantage of userfault_fd write protection mechanism introduced > + * in v5.7 kernel. Compared to existing dirty page logging migration much > + * lesser stream traffic is produced resulting in smaller snapshot images, > + * simply cause of no page duplicates can get into the stream. > + * > + * Another key point is that generated vmstate stream reflects machine state > + * 'frozen' at the beginning of snapshot creation compared to dirty page logging > + * mechanism, which effectively results in that saved snapshot is the state of VM > + * at the end of the process. 
> + */ > +static void *bg_migration_thread(void *opaque) > +{ > + MigrationState *s = opaque; > + int64_t setup_start; > + MigThrError thr_error; > + QEMUFile *fb; > + bool early_fail = true; > + > + rcu_register_thread(); > + object_ref(OBJECT(s)); > + > + qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); > + > + setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); > + /* > + * We want to save vmstate for the moment when migration has been > + * initiated but also we want to save RAM content while VM is running. > + * The RAM content should appear first in the vmstate. So, we first > + * stash the non-RAM part of the vmstate to the temporary buffer, > + * then write RAM part of the vmstate to the migration stream > + * with vCPUs running and, finally, write stashed non-RAM part of > + * the vmstate from the buffer to the migration stream. > + */ > + s->bioc = qio_channel_buffer_new(128 * 1024); > + qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer"); > + fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc)); > + object_unref(OBJECT(s->bioc)); > + > + update_iteration_initial_status(s); > + > + qemu_savevm_state_header(s->to_dst_file); > + qemu_savevm_state_setup(s->to_dst_file); > + > + if (qemu_savevm_state_guest_unplug_pending()) { > + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, > + MIGRATION_STATUS_WAIT_UNPLUG); > + > + while (s->state == MIGRATION_STATUS_WAIT_UNPLUG && > + qemu_savevm_state_guest_unplug_pending()) { > + qemu_sem_timedwait(&s->wait_unplug_sem, 250); > + } > + > + migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, > + MIGRATION_STATUS_ACTIVE); > + } else { > + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, > + MIGRATION_STATUS_ACTIVE); > + } > + s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; > + > + trace_migration_thread_setup_complete(); > + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > + > + qemu_mutex_lock_iothread(); > + > + /* > + * If VM is currently in suspended state, 
then, to make a valid runstate > + * transition in vm_stop_force_state() we need to wakeup it up. > + */ > + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); > + s->vm_was_running = runstate_is_running(); > + > + if (global_state_store()) { > + goto fail; > + } > + /* Forcibly stop VM before saving state of vCPUs and devices */ > + if (vm_stop_force_state(RUN_STATE_PAUSED)) { > + goto fail; > + } > + /* > + * Put vCPUs in sync with shadow context structures, then > + * save their state to channel-buffer along with devices. > + */ > + cpu_synchronize_all_states(); > + if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) { > + goto fail; > + } > + /* Now initialize UFFD context and start tracking RAM writes */ > + if (ram_write_tracking_start()) { > + goto fail; > + } > + early_fail = false; > + > + /* > + * Start VM from BH handler to avoid write-fault lock here. > + * UFFD-WP protection for the whole RAM is already enabled so > + * calling VM state change notifiers from vm_start() would initiate > + * writes to virtio VQs memory which is in write-protected region. > + */ > + s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s); > + qemu_bh_schedule(s->vm_start_bh); > + > + qemu_mutex_unlock_iothread(); > + > + while (migration_is_active(s)) { > + MigIterateState iter_state = bg_migration_iteration_run(s); > + if (iter_state == MIG_ITERATE_SKIP) { > + continue; > + } else if (iter_state == MIG_ITERATE_BREAK) { > + break; > + } > + > + /* > + * Try to detect any kind of failures, and see whether we > + * should stop the migration now. 
> + */ > + thr_error = migration_detect_error(s); > + if (thr_error == MIG_THR_ERR_FATAL) { > + /* Stop migration */ > + break; > + } > + > + migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); > + } > + > + trace_migration_thread_after_loop(); > + > +fail: > + if (early_fail) { > + migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, > + MIGRATION_STATUS_FAILED); > + qemu_mutex_unlock_iothread(); > + } > + > + bg_migration_iteration_finish(s); > + > + qemu_fclose(fb); > + object_unref(OBJECT(s)); > + rcu_unregister_thread(); > + > + return NULL; > +} > + > void migrate_fd_connect(MigrationState *s, Error *error_in) > { > Error *local_err = NULL; > @@ -3783,8 +4020,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) > migrate_fd_cleanup(s); > return; > } > - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, > - QEMU_THREAD_JOINABLE); > + > + if (migrate_background_snapshot()) { > + qemu_thread_create(&s->thread, "bg_snapshot", > + bg_migration_thread, s, QEMU_THREAD_JOINABLE); > + } else { > + qemu_thread_create(&s->thread, "live_migration", > + migration_thread, s, QEMU_THREAD_JOINABLE); > + } > s->migration_thread_running = true; > } > > diff --git a/migration/migration.h b/migration/migration.h > index f40338cfbf..0723955cd7 100644 > --- a/migration/migration.h > +++ b/migration/migration.h > @@ -20,6 +20,7 @@ > #include "qemu/thread.h" > #include "qemu/coroutine_int.h" > #include "io/channel.h" > +#include "io/channel-buffer.h" > #include "net/announce.h" > #include "qom/object.h" > > @@ -147,8 +148,10 @@ struct MigrationState { > > /*< public >*/ > QemuThread thread; > + QEMUBH *vm_start_bh; > QEMUBH *cleanup_bh; > QEMUFile *to_dst_file; > + QIOChannelBuffer *bioc; > /* > * Protects to_dst_file pointer. 
We need to make sure we won't > * yield or hang during the critical section, since this lock will > diff --git a/migration/savevm.c b/migration/savevm.c > index 4f3b69ecfc..9f8ad5e0f5 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -1355,7 +1355,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) > return 0; > } > > -static > int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, > bool in_postcopy, > bool inactivate_disks) > diff --git a/migration/savevm.h b/migration/savevm.h > index ba64a7e271..aaee2528ed 100644 > --- a/migration/savevm.h > +++ b/migration/savevm.h > @@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f); > void qemu_loadvm_state_cleanup(void); > int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); > int qemu_load_device_state(QEMUFile *f); > +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, > + bool in_postcopy, bool inactivate_disks); > > #endif > -- > 2.25.1 >
On 28.01.2021 21:29, Dr. David Alan Gilbert wrote: > * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote: >> Introducing implementation of 'background' snapshot thread >> which in overall follows the logic of precopy migration >> while internally utilizes completely different mechanism >> to 'freeze' vmstate at the start of snapshot creation. >> >> This mechanism is based on userfault_fd with wr-protection >> support and is Linux-specific. >> >> Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> >> Acked-by: Peter Xu <peterx@redhat.com> >> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> > Thanks for fixing the thread name; I've just noticed something though: > >> --- >> migration/migration.c | 263 ++++++++++++++++++++++++++++++++++++++++-- >> migration/migration.h | 3 + >> migration/savevm.c | 1 - >> migration/savevm.h | 2 + >> 4 files changed, 258 insertions(+), 11 deletions(-) >> >> diff --git a/migration/migration.c b/migration/migration.c >> index f018337fbc..869afa7a86 100644 >> --- a/migration/migration.c >> +++ b/migration/migration.c >> @@ -1130,7 +1130,6 @@ static void fill_source_migration_info(MigrationInfo *info) >> info->status = s->state; >> } >> >> -#ifdef CONFIG_LINUX >> typedef enum WriteTrackingSupport { >> WT_SUPPORT_UNKNOWN = 0, >> WT_SUPPORT_ABSENT, >> @@ -1155,7 +1154,6 @@ WriteTrackingSupport migrate_query_write_tracking(void) >> >> return WT_SUPPORT_COMPATIBLE; >> } >> -#endif /* CONFIG_LINUX */ > Why is this 4/5 patch removing these ifdef's (including the one below); > since it's the ram_write_tracking_available that lets you know if you > can do it, and that's got ifdef's, why do these lines go in at all? > > Dave > I think I just missed removing the #ifdef's from the first patch; that's the reason. I'll fix it. 
Andrey >> /** >> * @migration_caps_check - check capability validity >> @@ -1219,7 +1217,6 @@ static bool migrate_caps_check(bool *cap_list, >> } >> >> if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) { >> -#ifdef CONFIG_LINUX >> WriteTrackingSupport wt_support; >> int idx; >> /* >> @@ -1250,11 +1247,6 @@ static bool migrate_caps_check(bool *cap_list, >> return false; >> } >> } >> -#else >> - error_setg(errp, >> - "Background-snapshot is not supported on non-Linux hosts"); >> - return false; >> -#endif >> } >> >> return true; >> @@ -2016,6 +2008,7 @@ void migrate_init(MigrationState *s) >> * locks. >> */ >> s->cleanup_bh = 0; >> + s->vm_start_bh = 0; >> s->to_dst_file = NULL; >> s->state = MIGRATION_STATUS_NONE; >> s->rp_state.from_dst_file = NULL; >> @@ -3233,6 +3226,50 @@ fail: >> MIGRATION_STATUS_FAILED); >> } >> >> +/** >> + * bg_migration_completion: Used by bg_migration_thread when after all the >> + * RAM has been saved. The caller 'breaks' the loop when this returns. >> + * >> + * @s: Current migration state >> + */ >> +static void bg_migration_completion(MigrationState *s) >> +{ >> + int current_active_state = s->state; >> + >> + /* >> + * Stop tracking RAM writes - un-protect memory, un-register UFFD >> + * memory ranges, flush kernel wait queues and wake up threads >> + * waiting for write fault to be resolved. >> + */ >> + ram_write_tracking_stop(); >> + >> + if (s->state == MIGRATION_STATUS_ACTIVE) { >> + /* >> + * By this moment we have RAM content saved into the migration stream. >> + * The next step is to flush the non-RAM content (device state) >> + * right after the ram content. The device state has been stored into >> + * the temporary buffer before RAM saving started. 
>> + */ >> + qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage); >> + qemu_fflush(s->to_dst_file); >> + } else if (s->state == MIGRATION_STATUS_CANCELLING) { >> + goto fail; >> + } >> + >> + if (qemu_file_get_error(s->to_dst_file)) { >> + trace_migration_completion_file_err(); >> + goto fail; >> + } >> + >> + migrate_set_state(&s->state, current_active_state, >> + MIGRATION_STATUS_COMPLETED); >> + return; >> + >> +fail: >> + migrate_set_state(&s->state, current_active_state, >> + MIGRATION_STATUS_FAILED); >> +} >> + >> bool migrate_colo_enabled(void) >> { >> MigrationState *s = migrate_get_current(); >> @@ -3573,6 +3610,47 @@ static void migration_iteration_finish(MigrationState *s) >> qemu_mutex_unlock_iothread(); >> } >> >> +static void bg_migration_iteration_finish(MigrationState *s) >> +{ >> + qemu_mutex_lock_iothread(); >> + switch (s->state) { >> + case MIGRATION_STATUS_COMPLETED: >> + migration_calculate_complete(s); >> + break; >> + >> + case MIGRATION_STATUS_ACTIVE: >> + case MIGRATION_STATUS_FAILED: >> + case MIGRATION_STATUS_CANCELLED: >> + case MIGRATION_STATUS_CANCELLING: >> + break; >> + >> + default: >> + /* Should not reach here, but if so, forgive the VM. */ >> + error_report("%s: Unknown ending state %d", __func__, s->state); >> + break; >> + } >> + >> + migrate_fd_cleanup_schedule(s); >> + qemu_mutex_unlock_iothread(); >> +} >> + >> +/* >> + * Return true if continue to the next iteration directly, false >> + * otherwise. 
>> + */ >> +static MigIterateState bg_migration_iteration_run(MigrationState *s) >> +{ >> + int res; >> + >> + res = qemu_savevm_state_iterate(s->to_dst_file, false); >> + if (res > 0) { >> + bg_migration_completion(s); >> + return MIG_ITERATE_BREAK; >> + } >> + >> + return MIG_ITERATE_RESUME; >> +} >> + >> void migration_make_urgent_request(void) >> { >> qemu_sem_post(&migrate_get_current()->rate_limit_sem); >> @@ -3720,6 +3798,165 @@ static void *migration_thread(void *opaque) >> return NULL; >> } >> >> +static void bg_migration_vm_start_bh(void *opaque) >> +{ >> + MigrationState *s = opaque; >> + >> + qemu_bh_delete(s->vm_start_bh); >> + s->vm_start_bh = NULL; >> + >> + vm_start(); >> + s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start; >> +} >> + >> +/** >> + * Background snapshot thread, based on live migration code. >> + * This is an alternative implementation of live migration mechanism >> + * introduced specifically to support background snapshots. >> + * >> + * It takes advantage of userfault_fd write protection mechanism introduced >> + * in v5.7 kernel. Compared to existing dirty page logging migration much >> + * lesser stream traffic is produced resulting in smaller snapshot images, >> + * simply cause of no page duplicates can get into the stream. >> + * >> + * Another key point is that generated vmstate stream reflects machine state >> + * 'frozen' at the beginning of snapshot creation compared to dirty page logging >> + * mechanism, which effectively results in that saved snapshot is the state of VM >> + * at the end of the process. 
>> + */ >> +static void *bg_migration_thread(void *opaque) >> +{ >> + MigrationState *s = opaque; >> + int64_t setup_start; >> + MigThrError thr_error; >> + QEMUFile *fb; >> + bool early_fail = true; >> + >> + rcu_register_thread(); >> + object_ref(OBJECT(s)); >> + >> + qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); >> + >> + setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); >> + /* >> + * We want to save vmstate for the moment when migration has been >> + * initiated but also we want to save RAM content while VM is running. >> + * The RAM content should appear first in the vmstate. So, we first >> + * stash the non-RAM part of the vmstate to the temporary buffer, >> + * then write RAM part of the vmstate to the migration stream >> + * with vCPUs running and, finally, write stashed non-RAM part of >> + * the vmstate from the buffer to the migration stream. >> + */ >> + s->bioc = qio_channel_buffer_new(128 * 1024); >> + qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer"); >> + fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc)); >> + object_unref(OBJECT(s->bioc)); >> + >> + update_iteration_initial_status(s); >> + >> + qemu_savevm_state_header(s->to_dst_file); >> + qemu_savevm_state_setup(s->to_dst_file); >> + >> + if (qemu_savevm_state_guest_unplug_pending()) { >> + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, >> + MIGRATION_STATUS_WAIT_UNPLUG); >> + >> + while (s->state == MIGRATION_STATUS_WAIT_UNPLUG && >> + qemu_savevm_state_guest_unplug_pending()) { >> + qemu_sem_timedwait(&s->wait_unplug_sem, 250); >> + } >> + >> + migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, >> + MIGRATION_STATUS_ACTIVE); >> + } else { >> + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, >> + MIGRATION_STATUS_ACTIVE); >> + } >> + s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; >> + >> + trace_migration_thread_setup_complete(); >> + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); >> + >> + qemu_mutex_lock_iothread(); >> + 
>> + /* >> + * If VM is currently in suspended state, then, to make a valid runstate >> + * transition in vm_stop_force_state() we need to wakeup it up. >> + */ >> + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); >> + s->vm_was_running = runstate_is_running(); >> + >> + if (global_state_store()) { >> + goto fail; >> + } >> + /* Forcibly stop VM before saving state of vCPUs and devices */ >> + if (vm_stop_force_state(RUN_STATE_PAUSED)) { >> + goto fail; >> + } >> + /* >> + * Put vCPUs in sync with shadow context structures, then >> + * save their state to channel-buffer along with devices. >> + */ >> + cpu_synchronize_all_states(); >> + if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) { >> + goto fail; >> + } >> + /* Now initialize UFFD context and start tracking RAM writes */ >> + if (ram_write_tracking_start()) { >> + goto fail; >> + } >> + early_fail = false; >> + >> + /* >> + * Start VM from BH handler to avoid write-fault lock here. >> + * UFFD-WP protection for the whole RAM is already enabled so >> + * calling VM state change notifiers from vm_start() would initiate >> + * writes to virtio VQs memory which is in write-protected region. >> + */ >> + s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s); >> + qemu_bh_schedule(s->vm_start_bh); >> + >> + qemu_mutex_unlock_iothread(); >> + >> + while (migration_is_active(s)) { >> + MigIterateState iter_state = bg_migration_iteration_run(s); >> + if (iter_state == MIG_ITERATE_SKIP) { >> + continue; >> + } else if (iter_state == MIG_ITERATE_BREAK) { >> + break; >> + } >> + >> + /* >> + * Try to detect any kind of failures, and see whether we >> + * should stop the migration now. 
>> + */ >> + thr_error = migration_detect_error(s); >> + if (thr_error == MIG_THR_ERR_FATAL) { >> + /* Stop migration */ >> + break; >> + } >> + >> + migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); >> + } >> + >> + trace_migration_thread_after_loop(); >> + >> +fail: >> + if (early_fail) { >> + migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, >> + MIGRATION_STATUS_FAILED); >> + qemu_mutex_unlock_iothread(); >> + } >> + >> + bg_migration_iteration_finish(s); >> + >> + qemu_fclose(fb); >> + object_unref(OBJECT(s)); >> + rcu_unregister_thread(); >> + >> + return NULL; >> +} >> + >> void migrate_fd_connect(MigrationState *s, Error *error_in) >> { >> Error *local_err = NULL; >> @@ -3783,8 +4020,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) >> migrate_fd_cleanup(s); >> return; >> } >> - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, >> - QEMU_THREAD_JOINABLE); >> + >> + if (migrate_background_snapshot()) { >> + qemu_thread_create(&s->thread, "bg_snapshot", >> + bg_migration_thread, s, QEMU_THREAD_JOINABLE); >> + } else { >> + qemu_thread_create(&s->thread, "live_migration", >> + migration_thread, s, QEMU_THREAD_JOINABLE); >> + } >> s->migration_thread_running = true; >> } >> >> diff --git a/migration/migration.h b/migration/migration.h >> index f40338cfbf..0723955cd7 100644 >> --- a/migration/migration.h >> +++ b/migration/migration.h >> @@ -20,6 +20,7 @@ >> #include "qemu/thread.h" >> #include "qemu/coroutine_int.h" >> #include "io/channel.h" >> +#include "io/channel-buffer.h" >> #include "net/announce.h" >> #include "qom/object.h" >> >> @@ -147,8 +148,10 @@ struct MigrationState { >> >> /*< public >*/ >> QemuThread thread; >> + QEMUBH *vm_start_bh; >> QEMUBH *cleanup_bh; >> QEMUFile *to_dst_file; >> + QIOChannelBuffer *bioc; >> /* >> * Protects to_dst_file pointer. 
We need to make sure we won't >> * yield or hang during the critical section, since this lock will >> diff --git a/migration/savevm.c b/migration/savevm.c >> index 4f3b69ecfc..9f8ad5e0f5 100644 >> --- a/migration/savevm.c >> +++ b/migration/savevm.c >> @@ -1355,7 +1355,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) >> return 0; >> } >> >> -static >> int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, >> bool in_postcopy, >> bool inactivate_disks) >> diff --git a/migration/savevm.h b/migration/savevm.h >> index ba64a7e271..aaee2528ed 100644 >> --- a/migration/savevm.h >> +++ b/migration/savevm.h >> @@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f); >> void qemu_loadvm_state_cleanup(void); >> int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); >> int qemu_load_device_state(QEMUFile *f); >> +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, >> + bool in_postcopy, bool inactivate_disks); >> >> #endif >> -- >> 2.25.1 >>
diff --git a/migration/migration.c b/migration/migration.c index f018337fbc..869afa7a86 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1130,7 +1130,6 @@ static void fill_source_migration_info(MigrationInfo *info) info->status = s->state; } -#ifdef CONFIG_LINUX typedef enum WriteTrackingSupport { WT_SUPPORT_UNKNOWN = 0, WT_SUPPORT_ABSENT, @@ -1155,7 +1154,6 @@ WriteTrackingSupport migrate_query_write_tracking(void) return WT_SUPPORT_COMPATIBLE; } -#endif /* CONFIG_LINUX */ /** * @migration_caps_check - check capability validity @@ -1219,7 +1217,6 @@ static bool migrate_caps_check(bool *cap_list, } if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) { -#ifdef CONFIG_LINUX WriteTrackingSupport wt_support; int idx; /* @@ -1250,11 +1247,6 @@ static bool migrate_caps_check(bool *cap_list, return false; } } -#else - error_setg(errp, - "Background-snapshot is not supported on non-Linux hosts"); - return false; -#endif } return true; @@ -2016,6 +2008,7 @@ void migrate_init(MigrationState *s) * locks. */ s->cleanup_bh = 0; + s->vm_start_bh = 0; s->to_dst_file = NULL; s->state = MIGRATION_STATUS_NONE; s->rp_state.from_dst_file = NULL; @@ -3233,6 +3226,50 @@ fail: MIGRATION_STATUS_FAILED); } +/** + * bg_migration_completion: Used by bg_migration_thread when after all the + * RAM has been saved. The caller 'breaks' the loop when this returns. + * + * @s: Current migration state + */ +static void bg_migration_completion(MigrationState *s) +{ + int current_active_state = s->state; + + /* + * Stop tracking RAM writes - un-protect memory, un-register UFFD + * memory ranges, flush kernel wait queues and wake up threads + * waiting for write fault to be resolved. + */ + ram_write_tracking_stop(); + + if (s->state == MIGRATION_STATUS_ACTIVE) { + /* + * By this moment we have RAM content saved into the migration stream. + * The next step is to flush the non-RAM content (device state) + * right after the ram content. 
The device state has been stored into + * the temporary buffer before RAM saving started. + */ + qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage); + qemu_fflush(s->to_dst_file); + } else if (s->state == MIGRATION_STATUS_CANCELLING) { + goto fail; + } + + if (qemu_file_get_error(s->to_dst_file)) { + trace_migration_completion_file_err(); + goto fail; + } + + migrate_set_state(&s->state, current_active_state, + MIGRATION_STATUS_COMPLETED); + return; + +fail: + migrate_set_state(&s->state, current_active_state, + MIGRATION_STATUS_FAILED); +} + bool migrate_colo_enabled(void) { MigrationState *s = migrate_get_current(); @@ -3573,6 +3610,47 @@ static void migration_iteration_finish(MigrationState *s) qemu_mutex_unlock_iothread(); } +static void bg_migration_iteration_finish(MigrationState *s) +{ + qemu_mutex_lock_iothread(); + switch (s->state) { + case MIGRATION_STATUS_COMPLETED: + migration_calculate_complete(s); + break; + + case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_FAILED: + case MIGRATION_STATUS_CANCELLED: + case MIGRATION_STATUS_CANCELLING: + break; + + default: + /* Should not reach here, but if so, forgive the VM. */ + error_report("%s: Unknown ending state %d", __func__, s->state); + break; + } + + migrate_fd_cleanup_schedule(s); + qemu_mutex_unlock_iothread(); +} + +/* + * Return true if continue to the next iteration directly, false + * otherwise. 
+ */ +static MigIterateState bg_migration_iteration_run(MigrationState *s) +{ + int res; + + res = qemu_savevm_state_iterate(s->to_dst_file, false); + if (res > 0) { + bg_migration_completion(s); + return MIG_ITERATE_BREAK; + } + + return MIG_ITERATE_RESUME; +} + void migration_make_urgent_request(void) { qemu_sem_post(&migrate_get_current()->rate_limit_sem); @@ -3720,6 +3798,165 @@ static void *migration_thread(void *opaque) return NULL; } +static void bg_migration_vm_start_bh(void *opaque) +{ + MigrationState *s = opaque; + + qemu_bh_delete(s->vm_start_bh); + s->vm_start_bh = NULL; + + vm_start(); + s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start; +} + +/** + * Background snapshot thread, based on live migration code. + * This is an alternative implementation of live migration mechanism + * introduced specifically to support background snapshots. + * + * It takes advantage of userfault_fd write protection mechanism introduced + * in v5.7 kernel. Compared to existing dirty page logging migration much + * lesser stream traffic is produced resulting in smaller snapshot images, + * simply cause of no page duplicates can get into the stream. + * + * Another key point is that generated vmstate stream reflects machine state + * 'frozen' at the beginning of snapshot creation compared to dirty page logging + * mechanism, which effectively results in that saved snapshot is the state of VM + * at the end of the process. + */ +static void *bg_migration_thread(void *opaque) +{ + MigrationState *s = opaque; + int64_t setup_start; + MigThrError thr_error; + QEMUFile *fb; + bool early_fail = true; + + rcu_register_thread(); + object_ref(OBJECT(s)); + + qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); + + setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); + /* + * We want to save vmstate for the moment when migration has been + * initiated but also we want to save RAM content while VM is running. + * The RAM content should appear first in the vmstate. 
So, we first + * stash the non-RAM part of the vmstate to the temporary buffer, + * then write RAM part of the vmstate to the migration stream + * with vCPUs running and, finally, write stashed non-RAM part of + * the vmstate from the buffer to the migration stream. + */ + s->bioc = qio_channel_buffer_new(128 * 1024); + qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer"); + fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc)); + object_unref(OBJECT(s->bioc)); + + update_iteration_initial_status(s); + + qemu_savevm_state_header(s->to_dst_file); + qemu_savevm_state_setup(s->to_dst_file); + + if (qemu_savevm_state_guest_unplug_pending()) { + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_WAIT_UNPLUG); + + while (s->state == MIGRATION_STATUS_WAIT_UNPLUG && + qemu_savevm_state_guest_unplug_pending()) { + qemu_sem_timedwait(&s->wait_unplug_sem, 250); + } + + migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, + MIGRATION_STATUS_ACTIVE); + } else { + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_ACTIVE); + } + s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; + + trace_migration_thread_setup_complete(); + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + + qemu_mutex_lock_iothread(); + + /* + * If VM is currently in suspended state, then, to make a valid runstate + * transition in vm_stop_force_state() we need to wakeup it up. + */ + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); + s->vm_was_running = runstate_is_running(); + + if (global_state_store()) { + goto fail; + } + /* Forcibly stop VM before saving state of vCPUs and devices */ + if (vm_stop_force_state(RUN_STATE_PAUSED)) { + goto fail; + } + /* + * Put vCPUs in sync with shadow context structures, then + * save their state to channel-buffer along with devices. 
+ */ + cpu_synchronize_all_states(); + if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) { + goto fail; + } + /* Now initialize UFFD context and start tracking RAM writes */ + if (ram_write_tracking_start()) { + goto fail; + } + early_fail = false; + + /* + * Start VM from BH handler to avoid write-fault lock here. + * UFFD-WP protection for the whole RAM is already enabled so + * calling VM state change notifiers from vm_start() would initiate + * writes to virtio VQs memory which is in write-protected region. + */ + s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s); + qemu_bh_schedule(s->vm_start_bh); + + qemu_mutex_unlock_iothread(); + + while (migration_is_active(s)) { + MigIterateState iter_state = bg_migration_iteration_run(s); + if (iter_state == MIG_ITERATE_SKIP) { + continue; + } else if (iter_state == MIG_ITERATE_BREAK) { + break; + } + + /* + * Try to detect any kind of failures, and see whether we + * should stop the migration now. + */ + thr_error = migration_detect_error(s); + if (thr_error == MIG_THR_ERR_FATAL) { + /* Stop migration */ + break; + } + + migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); + } + + trace_migration_thread_after_loop(); + +fail: + if (early_fail) { + migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, + MIGRATION_STATUS_FAILED); + qemu_mutex_unlock_iothread(); + } + + bg_migration_iteration_finish(s); + + qemu_fclose(fb); + object_unref(OBJECT(s)); + rcu_unregister_thread(); + + return NULL; +} + void migrate_fd_connect(MigrationState *s, Error *error_in) { Error *local_err = NULL; @@ -3783,8 +4020,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) migrate_fd_cleanup(s); return; } - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, - QEMU_THREAD_JOINABLE); + + if (migrate_background_snapshot()) { + qemu_thread_create(&s->thread, "bg_snapshot", + bg_migration_thread, s, QEMU_THREAD_JOINABLE); + } else { + qemu_thread_create(&s->thread, 
"live_migration", + migration_thread, s, QEMU_THREAD_JOINABLE); + } s->migration_thread_running = true; } diff --git a/migration/migration.h b/migration/migration.h index f40338cfbf..0723955cd7 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -20,6 +20,7 @@ #include "qemu/thread.h" #include "qemu/coroutine_int.h" #include "io/channel.h" +#include "io/channel-buffer.h" #include "net/announce.h" #include "qom/object.h" @@ -147,8 +148,10 @@ struct MigrationState { /*< public >*/ QemuThread thread; + QEMUBH *vm_start_bh; QEMUBH *cleanup_bh; QEMUFile *to_dst_file; + QIOChannelBuffer *bioc; /* * Protects to_dst_file pointer. We need to make sure we won't * yield or hang during the critical section, since this lock will diff --git a/migration/savevm.c b/migration/savevm.c index 4f3b69ecfc..9f8ad5e0f5 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1355,7 +1355,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) return 0; } -static int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, bool in_postcopy, bool inactivate_disks) diff --git a/migration/savevm.h b/migration/savevm.h index ba64a7e271..aaee2528ed 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f); void qemu_loadvm_state_cleanup(void); int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); int qemu_load_device_state(QEMUFile *f); +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, + bool in_postcopy, bool inactivate_disks); #endif