@@ -2013,6 +2013,7 @@ void migrate_init(MigrationState *s)
* locks.
*/
s->cleanup_bh = 0;
+ s->wt_vm_start_bh = 0;
s->to_dst_file = NULL;
s->state = MIGRATION_STATUS_NONE;
s->rp_state.from_dst_file = NULL;
@@ -3551,6 +3552,21 @@ static void migration_iteration_finish(MigrationState *s)
qemu_mutex_unlock_iothread();
}
+static void wt_migration_iteration_finish(MigrationState *s)
+{
+ /* TODO: implement */
+}
+
+/*
+ * Return true if continue to the next iteration directly, false
+ * otherwise.
+ */
+static MigIterateState wt_migration_iteration_run(MigrationState *s)
+{
+ /* TODO: implement */
+ return MIG_ITERATE_RESUME;
+}
+
void migration_make_urgent_request(void)
{
qemu_sem_post(&migrate_get_current()->rate_limit_sem);
@@ -3698,6 +3714,154 @@ static void *migration_thread(void *opaque)
return NULL;
}
+static void wt_migration_vm_start_bh(void *opaque)
+{
+ /* TODO: implement */
+}
+
+/*
+ * Master migration thread on the source VM.
+ * This is an alternative implementation of live migration
+ * which uses userfault_fd write protection mechanism introduced in
+ * 5.7 kernel. Compared to existing dirty page logging migration
+ * it produces much lesser traffic and smaller snapshot images since
+ * no page duplicates can get into the stream. Another the key point
+ * is that generated vmstate stream reflects machine state 'frozen'
+ * at the beginning of migration compared to dirty page logging
+ * mechanism, which effectively results in that saved snapshot is the
+ * state at the end of migration process.
+ */
+static void *wt_migration_thread(void *opaque)
+{
+ MigrationState *s = opaque;
+ int64_t setup_start;
+ MigThrError thr_error;
+ QEMUFile *fb;
+
+ rcu_register_thread();
+ object_ref(OBJECT(s));
+
+ qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
+
+ setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+ /*
+ * We want to save vmstate for the moment when migration has been
+ * initiated but also we want to save RAM content while VM is running.
+ * The RAM content should appear first in the vmstate. So, we first
+ * stash the non-RAM part of the vmstate to the temporary buffer,
+ * then write RAM part of the vmstate to the migration stream
+ * with vCPUs running and, finally, write stashed non-RAM part of
+ * the vmstate from the buffer to the migration stream.
+ */
+ s->bioc = qio_channel_buffer_new(128 * 1024);
+ qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
+ fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc));
+ object_unref(OBJECT(s->bioc));
+
+ update_iteration_initial_status(s);
+
+ qemu_savevm_state_header(s->to_dst_file);
+ qemu_savevm_state_setup(s->to_dst_file);
+
+ if (qemu_savevm_state_guest_unplug_pending()) {
+ migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
+ MIGRATION_STATUS_WAIT_UNPLUG);
+
+ while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
+ qemu_savevm_state_guest_unplug_pending()) {
+ qemu_sem_timedwait(&s->wait_unplug_sem, 250);
+ }
+
+ migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG,
+ MIGRATION_STATUS_ACTIVE);
+ }
+ s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
+
+ migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
+ MIGRATION_STATUS_ACTIVE);
+ trace_migration_thread_setup_complete();
+ s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+ qemu_mutex_lock_iothread();
+
+ bool fail_early = false;
+ do {
+ if (global_state_store()) {
+ fail_early = true;
+ break;
+ }
+ /* Forcibly stop VM before saving state of vCPUs and devices */
+ if (vm_stop_force_state(RUN_STATE_PAUSED)) {
+ fail_early = true;
+ break;
+ }
+ /*
+ * Put vCPUs in sync with shadow context structures, then
+ * save their state to channel-buffer along with devices.
+ */
+ cpu_synchronize_all_states();
+ if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
+ fail_early = true;
+ break;
+ }
+ /* Now initialize UFFD context and start tracking RAM writes */
+ if (ram_write_tracking_start()) {
+ fail_early = true;
+ }
+ } while (false);
+
+ if (fail_early) {
+ migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
+ MIGRATION_STATUS_FAILED);
+
+ qemu_mutex_unlock_iothread();
+ goto fail;
+ }
+
+ /*
+ * Start VM from BH handler to avoid write-fault lock here.
+ * UFFD-WP protection for the whole RAM is already enabled so
+ * calling VM state change notifiers from vm_start() would initiate
+ * writes to virtio VQs memory which is in write-protected region.
+ */
+ s->wt_vm_start_bh = qemu_bh_new(wt_migration_vm_start_bh, s);
+ qemu_bh_schedule(s->wt_vm_start_bh);
+
+ qemu_mutex_unlock_iothread();
+
+ while (migration_is_active(s)) {
+ MigIterateState iter_state = wt_migration_iteration_run(s);
+ if (iter_state == MIG_ITERATE_SKIP) {
+ continue;
+ } else if (iter_state == MIG_ITERATE_BREAK) {
+ break;
+ }
+
+ /*
+ * Try to detect any kind of failures, and see whether we
+ * should stop the migration now.
+ */
+ thr_error = migration_detect_error(s);
+ if (thr_error == MIG_THR_ERR_FATAL) {
+ /* Stop migration */
+ break;
+ }
+
+ migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
+ }
+
+ trace_migration_thread_after_loop();
+
+fail:
+ wt_migration_iteration_finish(s);
+
+ qemu_fclose(fb);
+ object_unref(OBJECT(s));
+ rcu_unregister_thread();
+
+ return NULL;
+}
+
void migrate_fd_connect(MigrationState *s, Error *error_in)
{
Error *local_err = NULL;
@@ -3761,8 +3925,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
migrate_fd_cleanup(s);
return;
}
- qemu_thread_create(&s->thread, "live_migration", migration_thread, s,
- QEMU_THREAD_JOINABLE);
+
+ if (migrate_track_writes_ram()) {
+ qemu_thread_create(&s->thread, "wt_live_migration",
+ wt_migration_thread, s, QEMU_THREAD_JOINABLE);
+ } else {
+ qemu_thread_create(&s->thread, "live_migration",
+ migration_thread, s, QEMU_THREAD_JOINABLE);
+ }
s->migration_thread_running = true;
}
@@ -20,6 +20,7 @@
#include "qemu/thread.h"
#include "qemu/coroutine_int.h"
#include "io/channel.h"
+#include "io/channel-buffer.h"
#include "net/announce.h"
#include "qom/object.h"
@@ -147,8 +148,10 @@ struct MigrationState {
/*< public >*/
QemuThread thread;
+ QEMUBH *wt_vm_start_bh;
QEMUBH *cleanup_bh;
QEMUFile *to_dst_file;
+ QIOChannelBuffer *bioc;
/*
* Protects to_dst_file pointer. We need to make sure we won't
* yield or hang during the critical section, since this lock will
@@ -1352,7 +1352,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
return 0;
}
-static
int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
bool in_postcopy,
bool inactivate_disks)
@@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f);
void qemu_loadvm_state_cleanup(void);
int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
int qemu_load_device_state(QEMUFile *f);
+int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
+ bool in_postcopy, bool inactivate_disks);
#endif
Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> --- migration/migration.c | 174 +++++++++++++++++++++++++++++++++++++++++- migration/migration.h | 3 + migration/savevm.c | 1 - migration/savevm.h | 2 + 4 files changed, 177 insertions(+), 3 deletions(-)