@@ -894,23 +894,25 @@ ETEXI
{
.name = "migrate",
- .args_type = "detach:-d,blk:-b,inc:-i,uri:s",
- .params = "[-d] [-b] [-i] uri",
+ .args_type = "detach:-d,recover:-r,blk:-b,inc:-i,uri:s",
+ .params = "[-d] [-r] [-b] [-i] uri",
.help = "migrate to URI (using -d to not wait for completion)"
- "\n\t\t\t -b for migration without shared storage with"
- " full copy of disk\n\t\t\t -i for migration without "
- "shared storage with incremental copy of disk "
- "(base image shared between src and destination)",
+ "\n\t\t\t -r to recover from a broken migration\n\t\t\t"
+ " -b for migration without shared storage with"
+ " full copy of disk\n\t\t\t -i for migration without "
+ "shared storage with incremental copy of disk "
+ "(base image shared between src and destination)",
.mhandler.cmd = hmp_migrate,
},
STEXI
-@item migrate [-d] [-b] [-i] @var{uri}
+@item migrate [-d] [-r] [-b] [-i] @var{uri}
@findex migrate
Migrate to @var{uri} (using -d to not wait for completion).
- -b for migration with full copy of disk
- -i for migration with incremental copy of disk (base image is shared)
+ -r to recover from a broken migration
+ -b for migration with full copy of disk
+ -i for migration with incremental copy of disk (base image is shared)
ETEXI
{
@@ -1563,12 +1563,14 @@ static void hmp_migrate_status_cb(void *opaque)
void hmp_migrate(Monitor *mon, const QDict *qdict)
{
bool detach = qdict_get_try_bool(qdict, "detach", false);
+ bool recover = qdict_get_try_bool(qdict, "recover", false);
bool blk = qdict_get_try_bool(qdict, "blk", false);
bool inc = qdict_get_try_bool(qdict, "inc", false);
const char *uri = qdict_get_str(qdict, "uri");
Error *err = NULL;
- qmp_migrate(uri, !!blk, blk, !!inc, inc, false, false, &err);
+ qmp_migrate(uri, !!recover, recover, !!blk, blk, !!inc, inc, false, false,
+ &err);
if (err) {
error_report_err(err);
return;
@@ -142,6 +142,7 @@ struct MigrationState
int state;
/* Old style params from 'migrate' command */
MigrationParams params;
+ bool in_recovery;
/* State related to return path */
struct {
@@ -351,6 +352,9 @@ void flush_page_queue(MigrationState *ms);
int ram_save_queue_pages(MigrationState *ms, const char *rbname,
ram_addr_t start, ram_addr_t len);
+int qemu_migrate_postcopy_outgoing_recovery(MigrationState *ms);
+int qemu_migrate_postcopy_incoming_recovery(QEMUFile **f,MigrationIncomingState* mis);
+
PostcopyState postcopy_state_get(void);
/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state);
@@ -709,6 +709,33 @@ MigrationInfo *qmp_query_migrate(Error **errp)
case MIGRATION_STATUS_CANCELLED:
info->has_status = true;
break;
+ case MIGRATION_STATUS_POSTCOPY_RECOVERY:
+ info->has_status = true;
+ info->has_total_time = true;
+ info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+ info->has_ram = true;
+ info->ram = g_malloc0(sizeof(*info->ram));
+ info->ram->transferred = ram_bytes_transferred();
+ info->ram->remaining = ram_bytes_remaining();
+ info->ram->total = ram_bytes_total();
+ info->ram->duplicate = dup_mig_pages_transferred();
+ info->ram->skipped = skipped_mig_pages_transferred();
+ info->ram->normal = norm_mig_pages_transferred();
+ info->ram->normal_bytes = norm_mig_bytes_transferred();
+ info->ram->dirty_pages_rate = s->dirty_pages_rate;
+ info->ram->mbps = s->mbps;
+ info->ram->dirty_sync_count = s->dirty_sync_count;
+
+ if (blk_mig_active()) {
+ info->has_disk = true;
+ info->disk = g_malloc0(sizeof(*info->disk));
+ info->disk->transferred = blk_mig_bytes_transferred();
+ info->disk->remaining = blk_mig_bytes_remaining();
+ info->disk->total = blk_mig_bytes_total();
+ }
+
+ get_xbzrle_cache_stats(info);
}
info->status = s->state;
@@ -993,6 +1020,7 @@ MigrationState *migrate_init(const MigrationParams *params)
s->xfer_limit = 0;
s->cleanup_bh = 0;
s->to_dst_file = NULL;
+ s->in_recovery = false;
s->state = MIGRATION_STATUS_NONE;
s->params = *params;
s->rp_state.from_dst_file = NULL;
@@ -1069,13 +1097,14 @@ bool migration_is_blocked(Error **errp)
return false;
}
-void qmp_migrate(const char *uri, bool has_blk, bool blk,
- bool has_inc, bool inc, bool has_detach, bool detach,
+void qmp_migrate(const char *uri, bool in_recover, bool recover, bool has_blk,
+ bool blk, bool has_inc, bool inc, bool has_detach, bool detach,
Error **errp)
{
Error *local_err = NULL;
MigrationState *s = migrate_get_current();
MigrationParams params;
+ bool recovery = in_recover && recover;
const char *p;
params.blk = has_blk && blk;
@@ -1095,7 +1124,39 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
return;
}
- s = migrate_init(&params);
+ if (recovery ^ atomic_mb_read(&s->in_recovery)) {
+ if (recovery) {
+ /* No VM is waiting for recovery and
+ * recovery option was set
+ */
+
+ error_setg(errp, "No VM to recover");
+ return;
+ } else {
+ /* A VM is waiting for recovery and
+ * no recovery option is set
+ */
+
+ error_setg(errp, "A migration is in recovery state");
+ return;
+ }
+ } else {
+ if (!recovery) {
+ /* No VM is waiting for recovery and
+ * no recovery option is set
+ */
+ s = migrate_init(&params);
+ } else {
+ /* A VM is waiting for recovery and
+ * recovery option was set
+ */
+ s->to_dst_file = NULL;
+ if (s->rp_state.from_dst_file) {
+ /* shut down the rp socket, causing the rp thread to shut down */
+ qemu_file_shutdown(s->rp_state.from_dst_file);
+ }
+ }
+ }
if (strstart(uri, "tcp:", &p)) {
tcp_start_outgoing_migration(s, p, &local_err);
@@ -1336,6 +1397,8 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
*/
static void *source_return_path_thread(void *opaque)
{
+ fprintf(stderr, "Return path started on source\n");
+
MigrationState *ms = opaque;
QEMUFile *rp = ms->rp_state.from_dst_file;
uint16_t header_len, header_type;
@@ -1439,8 +1502,8 @@ static void *source_return_path_thread(void *opaque)
trace_source_return_path_thread_end();
out:
- ms->rp_state.from_dst_file = NULL;
qemu_fclose(rp);
+ fprintf(stderr, "Return path failed on source\n");
return NULL;
}
@@ -1714,6 +1777,7 @@ static void *migration_thread(void *opaque)
bool entered_postcopy = false;
/* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
+ int ret;
rcu_register_thread();
@@ -1781,7 +1845,26 @@ static void *migration_thread(void *opaque)
}
}
- if (qemu_file_get_error(s->to_dst_file)) {
+ if ((ret = qemu_file_get_error(s->to_dst_file))) {
+ /* This check is based on how the error is set during the network
+ * recv(). When recv() returns 0 (i.e. the peer has closed the
+ * connection), the error is set to -EIO. For all other network
+ * errors, it is set according to the return value received.
+ */
+ if (ret == -EIO && s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+ /* Network Failure during postcopy */
+
+ current_active_state = MIGRATION_STATUS_POSTCOPY_RECOVERY;
+ runstate_set(RUN_STATE_POSTMIGRATE_RECOVERY);
+ ret = qemu_migrate_postcopy_outgoing_recovery(s);
+ if(ret == 0) {
+ current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE;
+ runstate_set(RUN_STATE_FINISH_MIGRATE);
+ qemu_file_clear_error(s->to_dst_file);
+ continue;
+ }
+
+ }
migrate_set_state(&s->state, current_active_state,
MIGRATION_STATUS_FAILED);
trace_migration_thread_file_err();
@@ -1852,17 +1935,6 @@ static void *migration_thread(void *opaque)
void migrate_fd_connect(MigrationState *s)
{
- /* This is a best 1st approximation. ns to ms */
- s->expected_downtime = max_downtime/1000000;
- s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
-
- qemu_file_set_blocking(s->to_dst_file, true);
- qemu_file_set_rate_limit(s->to_dst_file,
- s->bandwidth_limit / XFER_LIMIT_RATIO);
-
- /* Notify before starting migration thread */
- notifier_list_notify(&migration_state_notifiers, s);
-
/*
* Open the return path; currently for postcopy but other things might
* also want it.
@@ -1877,12 +1949,61 @@ void migrate_fd_connect(MigrationState *s)
}
}
+ qemu_file_set_blocking(s->to_dst_file, true);
+ qemu_file_set_rate_limit(s->to_dst_file,
+ s->bandwidth_limit / XFER_LIMIT_RATIO);
+
+ if (atomic_mb_read(&s->in_recovery)) {
+ qemu_mutex_lock(&migration_recovery_mutex);
+ atomic_mb_set(&s->in_recovery, false);
+ qemu_cond_signal(&migration_recovery_cond);
+ qemu_mutex_unlock(&migration_recovery_mutex);
+
+ fprintf(stderr, "recovered\n");
+ return;
+ }
+
+ /* This is a best 1st approximation. ns to ms */
+ s->expected_downtime = max_downtime/1000000;
+ s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
+
+
+ /* Notify before starting migration thread */
+ notifier_list_notify(&migration_state_notifiers, s);
+
migrate_compress_threads_create();
qemu_thread_create(&s->thread, "migration", migration_thread, s,
QEMU_THREAD_JOINABLE);
s->migration_thread_running = true;
}
+int qemu_migrate_postcopy_outgoing_recovery(MigrationState* ms)
+{ /* Park the caller in POSTCOPY_RECOVERY until in_recovery is cleared; returns 0 if to_dst_file is then non-NULL, -1 otherwise. */
+ migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+ MIGRATION_STATUS_POSTCOPY_RECOVERY);
+ /* in_recovery is the flag qmp_migrate() and migrate_fd_connect() test to detect a pending recovery. */
+ atomic_mb_set(&ms->in_recovery, true);
+ /* TODO: code for network recovery to be added here; for now just wait to be woken up. */
+ qemu_mutex_lock(&migration_recovery_mutex);
+ while(atomic_mb_read(&ms->in_recovery) == true) {
+ fprintf(stderr, "Under recovery, not letting it fail %p\n", ms->to_dst_file);
+ qemu_cond_wait(&migration_recovery_cond, &migration_recovery_mutex);
+ }
+ qemu_mutex_unlock(&migration_recovery_mutex);
+ /* migrate_fd_connect() clears in_recovery and signals the condition while holding this mutex. */
+ if(ms->to_dst_file != NULL) {
+ /* Recovery successful: request the switch back to POSTCOPY_ACTIVE, then call qemu_savevm_send_open_return_path() on the stream. */
+ migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_RECOVERY,
+ MIGRATION_STATUS_POSTCOPY_ACTIVE);
+
+ qemu_savevm_send_open_return_path(ms->to_dst_file);
+ return 0;
+ }
+ /* No destination stream after wake-up: report failure to the caller. */
+ return -1;
+
+}
+
PostcopyState postcopy_state_get(void)
{
return atomic_mb_read(&incoming_postcopy_state);
@@ -154,12 +154,15 @@
# @watchdog: the watchdog action is configured to pause and has been triggered
#
# @guest-panicked: guest has been panicked as a result of guest OS panic
+#
+# @postmigrate-recovery: guest is paused for recovery after a network failure
+# (since 2.7)
##
{ 'enum': 'RunState',
'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused',
'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm',
'running', 'save-vm', 'shutdown', 'suspended', 'watchdog',
- 'guest-panicked' ] }
+ 'guest-panicked', 'postmigrate-recovery' ] }
##
# @StatusInfo:
@@ -438,12 +441,15 @@
#
# @failed: some error occurred during migration process.
#
+# @postcopy-recovery: in recovery mode, after a network failure. (since 2.7)
+#
# Since: 2.3
#
##
{ 'enum': 'MigrationStatus',
'data': [ 'none', 'setup', 'cancelling', 'cancelled',
- 'active', 'postcopy-active', 'completed', 'failed' ] }
+ 'active', 'postcopy-active', 'completed', 'failed',
+ 'postcopy-recovery' ] }
##
# @MigrationInfo
@@ -2119,6 +2125,8 @@
#
# @uri: the Uniform Resource Identifier of the destination VM
#
+# @recover: #optional recover from a broken migration (since 2.7)
+#
# @blk: #optional do block migration (full disk copy)
#
# @inc: #optional incremental disk copy migration
@@ -2131,7 +2139,7 @@
# Since: 0.14.0
##
{ 'command': 'migrate',
- 'data': {'uri': 'str', '*blk': 'bool', '*inc': 'bool', '*detach': 'bool' } }
+ 'data': {'uri': 'str', '*recover': 'bool', '*blk': 'bool', '*inc': 'bool', '*detach': 'bool' } }
##
# @migrate-incoming
@@ -2142,6 +2150,8 @@
# @uri: The Uniform Resource Identifier identifying the source or
# address to listen on
#
+# @recover: #optional recover from a broken migration (since 2.7)
+#
# Returns: nothing on success
#
# Since: 2.3
@@ -639,7 +639,7 @@ EQMP
{
.name = "migrate",
- .args_type = "detach:-d,blk:-b,inc:-i,uri:s",
+ .args_type = "detach:-d,recover:-r,blk:-b,inc:-i,uri:s",
.mhandler.cmd_new = qmp_marshal_migrate,
},
@@ -651,6 +651,7 @@ Migrate to URI.
Arguments:
+- "recover": recover from a broken migration (json-bool, optional)
- "blk": block migration, full disk copy (json-bool, optional)
- "inc": incremental disk copy (json-bool, optional)
- "uri": Destination URI (json-string)
@@ -597,6 +597,10 @@ static const RunStateTransition runstate_transitions_def[] = {
{ RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
{ RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
{ RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH },
+ { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE_RECOVERY },
+
+ { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_FINISH_MIGRATE },
+ { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_SHUTDOWN },
{ RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
{ RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
Signed-off-by: Md Haris Iqbal <haris.phnx@gmail.com>
---
 hmp-commands.hx               |  20 +++---
 hmp.c                         |   4 +-
 include/migration/migration.h |   4 ++
 migration/migration.c         | 153 +++++++++++++++++++++++++++++++++++++-----
 qapi-schema.json              |  16 ++++-
 qmp-commands.hx               |   3 +-
 vl.c                          |   4 ++
 7 files changed, 174 insertions(+), 30 deletions(-)