Message ID | 20240621143221.198784-3-elena.ufimtseva@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | migration: introduce strict SLA | expand |
On Fri, Jun 21, 2024 at 07:32:21AM -0700, Elena Ufimtseva wrote: > During live migration, receive current downtime from source > and start a downtime timer. When the destination dowtime > and added source downtime exceeds downtime limit for more > than switchover limit, abort live migration on destination. > > Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com> > --- > migration/migration.c | 2 ++ > migration/migration.h | 15 ++++++++++ > migration/savevm.c | 68 ++++++++++++++++++++++++++++++++++++++++++ > migration/savevm.h | 2 ++ > migration/trace-events | 3 ++ > 5 files changed, 90 insertions(+) > > diff --git a/migration/migration.c b/migration/migration.c > index 5cc304d2db..64d7290997 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -240,6 +240,8 @@ void migration_object_init(void) > current_incoming->page_requested = g_tree_new(page_request_addr_cmp); > > current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR; > + /* Downtime will start when source sends its current downtime. */ > + current_incoming->downtime_start = 0; > > migration_object_check(current_migration, &error_fatal); > > diff --git a/migration/migration.h b/migration/migration.h > index aa56b70795..06f4ebe214 100644 > --- a/migration/migration.h > +++ b/migration/migration.h > @@ -230,6 +230,21 @@ struct MigrationIncomingState { > > /* Do exit on incoming migration failure */ > bool exit_on_error; > + > + /* Initial downtime on destination set by MIG_CMD_SEND_SRC_DOWNTIME */ > + uint64_t downtime_start; > + /* > + * Current donwtime on destination that initially set equal to source by > + * MIG_CMD_SEND_SRC_DOWNTIME, then updated by destination itself. > + */ > + uint64_t downtime_now; Is this needed? > + /* > + * Abort live migration on destination when current destination downtime > + * exceeds the abort_limit. abort_limit is being set by > + * MIG_CMD_SEND_SRC_DOWNTIME sent from source. 
> + */ > + uint64_t abort_limit; If we assume both QEMUs will be applied with the same cap/params, we may not need this and we can directly read the parameter on dest. > + uint64_t src_downtime; > }; > > MigrationIncomingState *migration_incoming_get_current(void); > diff --git a/migration/savevm.c b/migration/savevm.c > index 031ab03915..f3b5ea98bf 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -90,6 +90,7 @@ enum qemu_vm_cmd { > MIG_CMD_ENABLE_COLO, /* Enable COLO */ > MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */ > MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */ > + MIG_CMD_SEND_SRC_DOWNTIME, /* Send current downtime to dst */ > MIG_CMD_MAX > }; > > @@ -109,6 +110,7 @@ static struct mig_cmd_args { > [MIG_CMD_POSTCOPY_RESUME] = { .len = 0, .name = "POSTCOPY_RESUME" }, > [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" }, > [MIG_CMD_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" }, > + [MIG_CMD_SEND_SRC_DOWNTIME] = { .len = -1, .name = "SEND_SRC_DOWNTIME" }, > [MIG_CMD_MAX] = { .len = -1, .name = "MAX" }, > }; > > @@ -1218,6 +1220,18 @@ void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name) > qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf); > } > > +void qemu_savevm_send_downtime(QEMUFile *f, int64_t abort_limit_ms, > + int64_t source_downtime) > +{ > + uint64_t tmp[2]; > + tmp[0] = cpu_to_be64(abort_limit_ms); > + tmp[1] = cpu_to_be64(source_downtime); > + > + trace_qemu_savevm_send_downtime(abort_limit_ms, source_downtime); > + qemu_savevm_command_send(f, MIG_CMD_SEND_SRC_DOWNTIME, > + 16, (uint8_t *)tmp); > +} > + > bool qemu_savevm_state_blocked(Error **errp) > { > SaveStateEntry *se; > @@ -1635,6 +1649,14 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only, > } > } > > + if (migrate_switchover_abort()) { > + MigrationState *s = migrate_get_current(); > + uint64_t abort_limit_ms = > + s->parameters.downtime_limit + s->parameters.switchover_limit; > + 
qemu_savevm_send_downtime(f, abort_limit_ms, > + migration_get_current_downtime(s)); > + } > + > if (iterable_only) { > goto flush; > } > @@ -1919,6 +1941,20 @@ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis, > return 0; > } > > +static int loadvm_handle_src_downtime(MigrationIncomingState *mis, > + uint16_t len) > +{ > + uint64_t src_abort_limit = qemu_get_be64(mis->from_src_file); > + uint64_t src_current_downtime = qemu_get_be64(mis->from_src_file); > + > + mis->abort_limit = src_abort_limit; > + mis->src_downtime = src_current_downtime; > + mis->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); I guess this didn't count in the delay of sending this packet. Since the whole point of this feature will be "making sure downtime is less than xxx or cancel migration", I think this might cause the real downtime still more than expected. Not sure how big a concern this is. E.g., have you measured how a packet can be delayed when the socket pipeline is mostly full, then queue this SRC_DOWNTIME message after all that? I think that's very possible the case here at switchover where src is trying to dump as fast as possible. I'm not sure whether it's easy to measure, either.. > + > + trace_loadvm_handle_src_downtime(src_abort_limit, src_current_downtime); > + return 0; > +} > + > /* After postcopy we will be told to throw some pages away since they're > * dirty and will have to be demand fetched. Must happen before CPU is > * started. 
> @@ -2540,6 +2576,9 @@ static int loadvm_process_command(QEMUFile *f) > > case MIG_CMD_ENABLE_COLO: > return loadvm_process_enable_colo(mis); > + > + case MIG_CMD_SEND_SRC_DOWNTIME: > + return loadvm_handle_src_downtime(mis, len); > } > > return 0; > @@ -2659,6 +2698,18 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis, > trace_vmstate_downtime_load("non-iterable", se->idstr, > se->instance_id, end_ts - start_ts); > } > + if (migrate_switchover_abort() && type == QEMU_VM_SECTION_FULL && > + mis->downtime_start) { > + mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > + uint64_t dst_downtime = mis->downtime_now - mis->downtime_start; > + if (mis->src_downtime + dst_downtime >= mis->abort_limit) { > + error_report("Shutdown destination migration, migration abort_limit" > + "(%lu ms) was reached.", mis->abort_limit); > + trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime, > + mis->src_downtime); > + return -EINVAL; > + } > + } > > if (!check_section_footer(f, se)) { > return -EINVAL; > @@ -2714,6 +2765,19 @@ qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis, > se->instance_id, end_ts - start_ts); > } > > + if (migrate_switchover_abort() && type == QEMU_VM_SECTION_END && > + mis->downtime_start) { > + mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > + uint64_t dst_downtime = mis->downtime_now - mis->downtime_start; > + if (mis->src_downtime + dst_downtime >= mis->abort_limit) { > + error_report("Shutdown destination migration, migration abort_limit (%lu ms)" > + "was reached.", mis->abort_limit); > + trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime, > + mis->src_downtime); > + return -EINVAL; > + } > + } So this traps both iteration and non-iteration phase. What if the downtime was caused by after these, or unluckily at the last non-iterator device state? After trying to think about it, I figured this is not easy to do right. 
Also, I start to doubt whether it's definitely a good idea on having this in the first place. Firstly, I'm wondering how we should treat this new feature v.s. downtime_limit. It's about how the user should set both. If this is about "cancel migration if downtime more than xxx", then.. AFAICT that's exactly what downtime_limit was "designed" to be.. It's just that downtime_limit says the other way round, as: "don't switchover if the downtime will be more than xxx". Then, if the user has option to set both these parameters, what would be the right thing to do? Shouldn't they simply always set both parameters to be the same value already? But if so, what's the point on having two? This caused me to think whether the 2nd parameter is needed at all, instead whether we should simply make downtime_limit more accurate, so that it will start to account more things than before. It won't be accurate, but the same case here: making this new feature "accurate" can also be very hard. Thanks,
On 24/06/2024 20:41, Peter Xu wrote: > On Fri, Jun 21, 2024 at 07:32:21AM -0700, Elena Ufimtseva wrote: >> @@ -2659,6 +2698,18 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis, >> if (!check_section_footer(f, se)) { >> return -EINVAL; >> @@ -2714,6 +2765,19 @@ qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis, >> se->instance_id, end_ts - start_ts); >> } >> >> + if (migrate_switchover_abort() && type == QEMU_VM_SECTION_END && >> + mis->downtime_start) { >> + mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); >> + uint64_t dst_downtime = mis->downtime_now - mis->downtime_start; >> + if (mis->src_downtime + dst_downtime >= mis->abort_limit) { >> + error_report("Shutdown destination migration, migration abort_limit (%lu ms)" >> + "was reached.", mis->abort_limit); >> + trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime, >> + mis->src_downtime); >> + return -EINVAL; >> + } >> + } > > So this traps both iteration and non-iteration phase. What if the downtime > was caused by after these, or unluckily at the last non-iterator device > state? > > After trying to think about it, I figured this is not easy to do right. > Also, I start to doubt whether it's definitely a good idea on having this > in the first place. > > Firstly, I'm wondering how we should treat this new feature > v.s. downtime_limit. It's about how the user should set both. > > If this is about "cancel migration if downtime more than xxx", > then.. AFAICT that's exactly what downtime_limit was "designed" to be.. > It's just that downtime_limit says the other way round, as: "don't > switchover if the downtime will be more than xxx". > > Then, if the user has option to set both these parameters, what would be > the right thing to do? Shouldn't they simply always set both parameters to > be the same value already? But if so, what's the point on having two? 
> > This caused me to think whether the 2nd parameter is needed at all, instead > whether we should simply make downtime_limit more accurate, so that it will > start to account more things than before. It won't be accurate, but the > same case here: making this new feature "accurate" can also be very hard. > The way I think about it is that current downtime-limit captures nicely the data part as the only calculations it cares about is how much outstanding data it sends to the destination (be it VF device state or RAM). This second parameter captures what is *not* related to data, i.e. costs of hypervisor quiescing the VM or added latencies in hypervisor *in addition* to sending outstanding data to destination. If we were to merge this all into a single parameter (aka downtime-limit) we are possibly artificially increasing the downtime thanks to relaxing the outstanding data part, as opposed to trying to capture the switchover cost (hence the name of the parameter) that can't be reflected in the former equation. So I feel like this needs two parameters whereby this second new parameter covers 'switchover cost' (hence the name) with current migration algorithm. Joao
On Tue, Jun 25, 2024 at 12:38:50PM +0100, Joao Martins wrote: > On 24/06/2024 20:41, Peter Xu wrote: > > On Fri, Jun 21, 2024 at 07:32:21AM -0700, Elena Ufimtseva wrote: > >> @@ -2659,6 +2698,18 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis, > >> if (!check_section_footer(f, se)) { > >> return -EINVAL; > >> @@ -2714,6 +2765,19 @@ qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis, > >> se->instance_id, end_ts - start_ts); > >> } > >> > >> + if (migrate_switchover_abort() && type == QEMU_VM_SECTION_END && > >> + mis->downtime_start) { > >> + mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > >> + uint64_t dst_downtime = mis->downtime_now - mis->downtime_start; > >> + if (mis->src_downtime + dst_downtime >= mis->abort_limit) { > >> + error_report("Shutdown destination migration, migration abort_limit (%lu ms)" > >> + "was reached.", mis->abort_limit); > >> + trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime, > >> + mis->src_downtime); > >> + return -EINVAL; > >> + } > >> + } > > > > So this traps both iteration and non-iteration phase. What if the downtime > > was caused by after these, or unluckily at the last non-iterator device > > state? > > > > After trying to think about it, I figured this is not easy to do right. > > Also, I start to doubt whether it's definitely a good idea on having this > > in the first place. > > > > Firstly, I'm wondering how we should treat this new feature > > v.s. downtime_limit. It's about how the user should set both. > > > > If this is about "cancel migration if downtime more than xxx", > > then.. AFAICT that's exactly what downtime_limit was "designed" to be.. > > It's just that downtime_limit says the other way round, as: "don't > > switchover if the downtime will be more than xxx". > > > > Then, if the user has option to set both these parameters, what would be > > the right thing to do? 
Shouldn't they simply always set both parameters to > > be the same value already? But if so, what's the point on having two? > > > > This caused me to think whether the 2nd parameter is needed at all, instead > > whether we should simply make downtime_limit more accurate, so that it will > > start to account more things than before. It won't be accurate, but the > > same case here: making this new feature "accurate" can also be very hard. > > > > The way I think about it is that current downtime-limit captures nicely the data > part as the only calculations it cares about is how much outstanding data it > sends to the destination (be it VF device state or RAM). This second parameter > captures what is *not* related to data, i.e. costs of hypervisor quiescing the > VM or added latencies in hypervisor *in addition* to sending outstanding data to > destination. > > If we were to merge this all into a single parameter (aka downtime-limit) we are > possibility artificially increasing the downtime thanks to relaxing the > oustanding data part, as opposed to trying to capture the switchover cost (hence > the name the parameter) that can't be reflected in the former equation. > > So I feel like this needs two parameters whereby this second new parameter > covers 'switchover cost' (hence the name) with current migration algorithm. Then the question is how should we suggest the user to specify these two parameters. The cover letter used: migrate_set_parameter downtime-limit 300 migrate_set_parameter switchover-limit 10 But it does look like a testing sample only and not valid in real world setup, as logically the new limit should be larger than the old one, afaict. If the new limit is larger, how larger should it be? So far I can understand how it works intenrally, but even with that I don't know how I should set this parameter, e.g., when downtime_limit used to be 300ms, I'd say 500ms could be a suitable value, but is it? 
In that case, perhaps the 500ms is the "real" limit that I don't want a VM to be halted for more than 500ms, but in that case the user should have already setup downtime_limit to 500ms. I also don't know how should an user understand all these details and figure out how to set these. Note that currently we definitely allow the user to specify downtime_limit and it's so far understandable, even though the user may assume that contains the whole blackout phase (that's also what we wish it can achieve..), rather than the fact that it only takes ram/vfio/... reported remaining data into account v.s. the bandwidth. Maybe we could consider introducing the parameters/caps in some other form, so that we can keep the downtime_limit to be "the total downtime", instead of a new definition of downtime. E.g., it's at least not fair indeed to take the whole "downtime_limit" for data transfers, so maybe some ratio parameter can be introduced to say how much portion of that downtime can be used for data transfer, and then it might be ok to work with the new cap introduced so that when total downtime is over the limit it will abort the migration (but without a new "max downtime" definition which might be confusing). Said that, I also wonder whether you have thought about improving downtime_limit itself. It'll be definitely better if we simply don't switchover at all if that won't make it. IOW, I wonder whether there can be case where user specifies these parameters, migration simply keeps switching over and keep aborting, requiring a retry. That's pretty unwanted. We may simply doesn't allow that switchover to happen at all. Thanks,
On 25/06/2024 15:53, Peter Xu wrote: > On Tue, Jun 25, 2024 at 12:38:50PM +0100, Joao Martins wrote: >> On 24/06/2024 20:41, Peter Xu wrote: >>> On Fri, Jun 21, 2024 at 07:32:21AM -0700, Elena Ufimtseva wrote: >>>> @@ -2659,6 +2698,18 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis, >> >> if (!check_section_footer(f, se)) { >>>> return -EINVAL; >>>> @@ -2714,6 +2765,19 @@ qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis, >>>> se->instance_id, end_ts - start_ts); >>>> } >>>> >>>> + if (migrate_switchover_abort() && type == QEMU_VM_SECTION_END && >>>> + mis->downtime_start) { >>>> + mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); >>>> + uint64_t dst_downtime = mis->downtime_now - mis->downtime_start; >>>> + if (mis->src_downtime + dst_downtime >= mis->abort_limit) { >>>> + error_report("Shutdown destination migration, migration abort_limit (%lu ms)" >>>> + "was reached.", mis->abort_limit); >>>> + trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime, >>>> + mis->src_downtime); >>>> + return -EINVAL; >>>> + } >>>> + } >>> >>> So this traps both iteration and non-iteration phase. What if the downtime >>> was caused by after these, or unluckily at the last non-iterator device >>> state? >>> >>> After trying to think about it, I figured this is not easy to do right. >>> Also, I start to doubt whether it's definitely a good idea on having this >>> in the first place. >>> >>> Firstly, I'm wondering how we should treat this new feature >>> v.s. downtime_limit. It's about how the user should set both. >>> >>> If this is about "cancel migration if downtime more than xxx", >>> then.. AFAICT that's exactly what downtime_limit was "designed" to be.. >>> It's just that downtime_limit says the other way round, as: "don't >>> switchover if the downtime will be more than xxx". >>> >>> Then, if the user has option to set both these parameters, what would be >>> the right thing to do? 
Shouldn't they simply always set both parameters to >>> be the same value already? But if so, what's the point on having two? >>> >>> This caused me to think whether the 2nd parameter is needed at all, instead >>> whether we should simply make downtime_limit more accurate, so that it will >>> start to account more things than before. It won't be accurate, but the >>> same case here: making this new feature "accurate" can also be very hard. >>> >> >> The way I think about it is that current downtime-limit captures nicely the data >> part as the only calculations it cares about is how much outstanding data it >> sends to the destination (be it VF device state or RAM). This second parameter >> captures what is *not* related to data, i.e. costs of hypervisor quiescing the >> VM or added latencies in hypervisor *in addition* to sending outstanding data to >> destination. >> >> If we were to merge this all into a single parameter (aka downtime-limit) we are >> possibility artificially increasing the downtime thanks to relaxing the >> oustanding data part, as opposed to trying to capture the switchover cost (hence >> the name the parameter) that can't be reflected in the former equation. >> >> So I feel like this needs two parameters whereby this second new parameter >> covers 'switchover cost' (hence the name) with current migration algorithm. > > Then the question is how should we suggest the user to specify these two > parameters. > > The cover letter used: > > migrate_set_parameter downtime-limit 300 > migrate_set_parameter switchover-limit 10 > > But it does look like a testing sample only and not valid in real world > setup, as logically the new limit should be larger than the old one, > afaict. If the new limit is larger, how larger should it be? > > So far I can understand how it works intenrally, but even with that I don't > know how I should set this parameter, e.g., when downtime_limit used to be > 300ms, I'd say 500ms could be a suitable value, but is it? 
In that case, > perhaps the 500ms is the "real" limit that I don't want a VM to be halted > for more than 500ms, but in that case the user should have already setup > downtime_limit to 500ms. > Yeap -- I think you're right that it presents a UX confusion on what a user should set. > I also don't know how should an user understand all these details and > figure out how to set these. Note that currently we definitely allow the > user to specify downtime_limit and it's so far understandable, even though > the user may assume that contains the whole blackout phase (that's also > what we wish it can achieve..), rather than the fact that it only takes > ram/vfio/... reported remaining data into account v.s. the bandwidth. > > Maybe we could consider introducing the parameters/caps in some other form, > so that we can keep the downtime_limit to be "the total downtime", instead > of a new definition of downtime. E.g., it's at least not fair indeed to > take the whole "downtime_limit" for data transfers, so maybe some ratio > parameter can be introduced to say how much portion of that downtime can be > used for data transfer I like this idea -- it fixes another problem that downtime-limit (solely) makes the algorithm be too optimistic and just utilize the whole bandwidth (e.g. migrating after the first dirty sync depending on downtime-limit) e.g. "iterable-downtime" (or "precopy-downtime" for lack of a better idea) for such property Then when we set downtime-limit it can represent the whole thing including blackout as it is the case today and we configure it by minimizing iterable-downtime to give enough headroom for switchover. >, and then it might be ok to work with the new cap > introduced so that when total downtime is over the limit it will abort the > migration (but without a new "max downtime" definition which might be > confusing). 
> Yes, improving 'downtime-limit' with a sub parameter above for RAM/state downtime, this migration capability then becomes a boolean instead of a value where it's more like 'downtime-limit-strict' and going above downtime limit is enforced/aborted is also performed like these patches. To prevent essentially this: > IOW, I wonder whether there can > be case where user specifies these parameters, migration simply keeps > switching over and keep aborting, requiring a retry. That's pretty > unwanted. We may simply doesn't allow that switchover to happen at all. (...) > Said that, I also wonder whether you have thought about improving > downtime_limit itself. It'll be definitely better if we simply don't > switchover at all if that won't make it. The device-state multifd scaling is a take on improving switchover phase, and we will keep improving it whenever we find things... but the switchover itself can't be 'precomputed' into a downtime number equation ahead of time to encompass all possible latencies/costs. Part of the reason that at least we couldn't think of a way besides this proposal here, which at the core it's meant to bounds check switchover. Even without taking into account VFs/HW[0], it is simply not considered how long it might take and giving some sort of downtime buffer coupled with enforcement that can be enforced helps not violating migration SLAs. [0] https://lore.kernel.org/qemu-devel/20230317081904.24389-1-xuchuangxclwt@bytedance.com/
On Tue, Jun 25, 2024 at 10:53:41AM -0400, Peter Xu wrote: > Then the question is how should we suggest the user to specify these two > parameters. > > The cover letter used: > > migrate_set_parameter downtime-limit 300 > migrate_set_parameter switchover-limit 10 What this means is that in practice the total downtime limit is 310 ms, however, expressing this as two parameters is incredibly inflexible. If the actual RAM transfer downtime only took 50 ms, then why should the switchover downtime still be limited to 10ms, when we've still got a budget of 250 ms that was unused. IOW, if my VM tolerates a downtime of 310ms, then I want that 310ms spread across the RAM transfer downtime and switchover downtime in *any* ratio. All that matters is the overall completion time. IMHO exposing 2 distinct parameters is horrible from a usability POV. With regards, Daniel
On Tue, Jun 25, 2024 at 05:31:19PM +0100, Joao Martins wrote: > The device-state multifd scaling is a take on improving switchover phase, > and we will keep improving it whenever we find things... but the That'll be helpful, thanks. Just a quick note that "reducing downtime" is a separate issue comparing to "make downtime_limit accurate". > switchover itself can't be 'precomputed' into a downtime number equation > ahead of time to encompass all possible latencies/costs. Part of the > reason that at least we couldn't think of a way besides this proposal > here, which at the core it's meant to bounds check switchover. Even > without taking into account VFs/HW[0], it is simply not considered how > long it might take and giving some sort of downtime buffer coupled with > enforcement that can be enforced helps not violating migration SLAs. I agree such enforcement alone can be useful in general to be able to fallback. Said that, I think it would definitely be nice to attach more information on the downtime analysis when reposting this series, if there is any. For example, irrelevant of whether QEMU can do proper predictions at all, there can be data / results to show what is the major parts that are missing besides the current calculations, aka an expectation on when the fallback can trigger, and some justification on why they can't be predicted. IMHO the enforcement won't make much sense if it keeps triggering, in that case people will simply not use it as it stops migrations from happening. Ultimately the work will still be needed to make downtime_limit accurate. The fallback should only be an last fence to guard the promise which should be the "corner cases".
On Fri, Jun 21, 2024 at 07:32:21AM -0700, Elena Ufimtseva wrote: > During live migration, receive current downtime from source > and start a downtime timer. When the destination dowtime > and added source downtime exceeds downtime limit for more > than switchover limit, abort live migration on destination. > > Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com> > --- > migration/migration.c | 2 ++ > migration/migration.h | 15 ++++++++++ > migration/savevm.c | 68 ++++++++++++++++++++++++++++++++++++++++++ > migration/savevm.h | 2 ++ > migration/trace-events | 3 ++ > 5 files changed, 90 insertions(+) > > diff --git a/migration/migration.c b/migration/migration.c > index 5cc304d2db..64d7290997 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -240,6 +240,8 @@ void migration_object_init(void) > current_incoming->page_requested = g_tree_new(page_request_addr_cmp); > > current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR; > + /* Downtime will start when source sends its current downtime. */ > + current_incoming->downtime_start = 0; > > migration_object_check(current_migration, &error_fatal); > > diff --git a/migration/migration.h b/migration/migration.h > index aa56b70795..06f4ebe214 100644 > --- a/migration/migration.h > +++ b/migration/migration.h > @@ -230,6 +230,21 @@ struct MigrationIncomingState { > > /* Do exit on incoming migration failure */ > bool exit_on_error; > + > + /* Initial downtime on destination set by MIG_CMD_SEND_SRC_DOWNTIME */ > + uint64_t downtime_start; > + /* > + * Current donwtime on destination that initially set equal to source by > + * MIG_CMD_SEND_SRC_DOWNTIME, then updated by destination itself. > + */ > + uint64_t downtime_now; > + /* > + * Abort live migration on destination when current destination downtime > + * exceeds the abort_limit. abort_limit is being set by > + * MIG_CMD_SEND_SRC_DOWNTIME sent from source. 
> + */ > + uint64_t abort_limit; > + uint64_t src_downtime; > }; > > MigrationIncomingState *migration_incoming_get_current(void); > diff --git a/migration/savevm.c b/migration/savevm.c > index 031ab03915..f3b5ea98bf 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -90,6 +90,7 @@ enum qemu_vm_cmd { > MIG_CMD_ENABLE_COLO, /* Enable COLO */ > MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */ > MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */ > + MIG_CMD_SEND_SRC_DOWNTIME, /* Send current downtime to dst */ > MIG_CMD_MAX > }; > > @@ -109,6 +110,7 @@ static struct mig_cmd_args { > [MIG_CMD_POSTCOPY_RESUME] = { .len = 0, .name = "POSTCOPY_RESUME" }, > [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" }, > [MIG_CMD_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" }, > + [MIG_CMD_SEND_SRC_DOWNTIME] = { .len = -1, .name = "SEND_SRC_DOWNTIME" }, > [MIG_CMD_MAX] = { .len = -1, .name = "MAX" }, > }; > > @@ -1218,6 +1220,18 @@ void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name) > qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf); > } > > +void qemu_savevm_send_downtime(QEMUFile *f, int64_t abort_limit_ms, > + int64_t source_downtime) > +{ > + uint64_t tmp[2]; > + tmp[0] = cpu_to_be64(abort_limit_ms); > + tmp[1] = cpu_to_be64(source_downtime); > + > + trace_qemu_savevm_send_downtime(abort_limit_ms, source_downtime); > + qemu_savevm_command_send(f, MIG_CMD_SEND_SRC_DOWNTIME, > + 16, (uint8_t *)tmp); > +} Is having both the src and dst responsible to tracking downtime limit introducing extra failure points ? We know the src QEMU is reliably operating, but we don't know that the dst QEMU will do so. If we give the dst responsibility for checking its own downtime then that perhaps isn't robust aganist the dst deadlocking or live-locking itself. Is it better have the src exclusively responsible for tracking the downtime limit, and have it send an "abort" message to the dst when the limit is exceeded ? 
This process would require an explicit ack from the src before the dst is permitted to start its CPUs too. With regards, Daniel
On 25/06/2024 20:03, Peter Xu wrote: > On Tue, Jun 25, 2024 at 05:31:19PM +0100, Joao Martins wrote: >> The device-state multifd scaling is a take on improving switchover phase, >> and we will keep improving it whenever we find things... but the > > That'll be helpful, thanks. Just a quick note that "reducing downtime" is > a separate issue comparing to "make downtime_limit accurate". > I see those two separately too; it's just that right now that's the only work I know we can make it better is decreasing/optimizing it (other lines of work doing similar stuff inside vDPA too, by Si-Wei for example). Making downtime_limit accurate not so sure what it entails right now from your PoV. But it depends on what this question really was about, see at the end in case I am understanding you correctly. >> switchover itself can't be 'precomputed' into a downtime number equation >> ahead of time to encompass all possible latencies/costs. Part of the >> reason that at least we couldn't think of a way besides this proposal >> here, which at the core it's meant to bounds check switchover. Even >> without taking into account VFs/HW[0], it is simply not considered how >> long it might take and giving some sort of downtime buffer coupled with >> enforcement that can be enforced helps not violating migration SLAs. > > I agree such enforcement alone can be useful in general to be able to > fallback. Said that, I think it would definitely be nice to attach more > information on the downtime analysis when reposting this series, if there > is any. > > For example, irrelevant of whether QEMU can do proper predictions at all, > there can be data / results to show what is the major parts that are > missing besides the current calculations, aka an expectation on when the > fallback can trigger, and some justification on why they can't be > predicted. > /me nods -- I think this might be a gap in the current cover letter. 
I recall we have looked at quite a few downtime traces (thanks to the tracing improvements made in the last dev cycle!), but it's also easy to reproduce these problems with downtime-limit even without past data with relatively simple configs. > IMHO the enforcement won't make much sense if it keeps triggering, in that > case people will simply not use it as it stops migrations from happening. Right -- The enforcement *alone* damages more than it fixes. Meaning enforcing without having some way to give some headroom within downtime-limit for switchover to be accounted. The latter is what allows the enforcement to be placed, otherwise we would just be failing migrations left and right. > Ultimately the work will still be needed to make downtime_limit accurate. > The fallback should only be an last fence to guard the promise which should > be the "corner cases". > Are you thinking in something specifically? Many "variables" affect this from the point we decide switchover, and at the worst (likely) case it means having qemu subsystems declare empirical values on how long it takes to suspend/resume/transfer-state to migration expected downtime prediction equation. Part of the reason that having headroom within downtime-limit was a simple 'catch-all' (from our PoV) in terms of maintainability while giving user something to fallback for characterizing its SLA. Personally, I think there's a tiny bit disconnect between what the user desires when setting downtime-limit vs what it really does. downtime-limit right now looks to be best viewed as 'precopy-ram-downtime-limit' :) Unless the accuracy work you're thinking is just having a better migration algorithm at obtaining the best possible downtime for outstanding-data/RAM *even if* downtime-limit is set at a high limit, like giving 1) a grace period in the beginning of migration post first dirty sync or 2) a measured value with continually incrementing target downtime limit until max downtime-limit set by user hits ... 
before defaulting to the current behaviour of migrating as soon as expected downtime is within the downtime-limit. As discussed in the last response, this could create the 'downtime headroom' for getting the enforcement/SLA better honored. Is this maybe your line of thinking?
On 25/06/2024 19:37, Daniel P. Berrangé wrote: > On Tue, Jun 25, 2024 at 10:53:41AM -0400, Peter Xu wrote: >> Then the question is how should we suggest the user to specify these two >> parameters. >> >> The cover letter used: >> >> migrate_set_parameter downtime-limit 300 >> migrate_set_parameter switchover-limit 10 > > What this means is that in practice the total downtime limit > is 310 ms, however, expressing this as two parameters is > incredibly inflexible. > > If the actual RAM transfer downtime only took 50 ms, then why > should the switchover downtime still be limited to 10ms, when > we've still got a budget of 250 ms that was unused. > The downtime limit is 300, it's more than you are giving something *extra* 10ms when you switchover regardless of where that's spent. If it makes it easier to understand you could see this parameter as: 'downtime-limit-max-error' = 10 ms The name as proposed by the RFC was meant to honor what the error margin was meant for: to account for extra time during switchover. Adding this inside downtime-limit wouldn't work as it otherwise would be used solely for RAM transfer during precopy. > IOW, if my VM tolerates a downtime of 310ms, then I want that > 310ms spread across the RAM transfer downtime and switchover > downtime in *any* ratio. All that matters is the overall > completion time. > That still happens with these patches, no specific budget is given to each. Though implicitly if downtime-limit captures only RAM transfer, then in theory if you're migrating a busy guest that happens to meet the SLA say expected-downtime=290, then you have a total of 20 for switchover (thanks to the extra 10 used in switchover-limit/downtime-limit-max-error 10). But keep in mind that the migration prediction *does not* account for anything other than RAM transfer. 
It happens that maybe your configuration is cheap, or has been optimized enough over the years that you likely don't care or noticed that it /could/ hurt the user designated SLA even if by a little. Regards, Joao
On Wed, Jun 26, 2024 at 12:29:41PM +0100, Joao Martins wrote: > On 25/06/2024 19:37, Daniel P. Berrangé wrote: > > On Tue, Jun 25, 2024 at 10:53:41AM -0400, Peter Xu wrote: > >> Then the question is how should we suggest the user to specify these two > >> parameters. > >> > >> The cover letter used: > >> > >> migrate_set_parameter downtime-limit 300 > >> migrate_set_parameter switchover-limit 10 > > > > What this means is that in practice the total downtime limit > > is 310 ms, however, expressing this as two parameters is > > incredibly inflexible. > > > > If the actual RAM transfer downtime only took 50 ms, then why > > should the switchover downtime still be limited to 10ms, when > > we've still got a budget of 250 ms that was unused. > > > > The downtime limit is 300, it's more than you are giving something *extra* 10ms > when you switchover regardless of where that's spent. > > If it makes it easier to understand you could see this parameter as: > > 'downtime-limit-max-error' = 10 ms > > The name as proposed by the RFC was meant to honor what the error margin was > meant for: to account for extra time during switchover. Adding this inside > downtime-limit wouldn't work as it otherwise would be used solely for RAM > transfer during precopy. > > > IOW, if my VM tolerates a downtime of 310ms, then I want that > > 310ms spread across the RAM transfer downtime and switchover > > downtime in *any* ratio. ALl that matters is the overall > > completion time. > > > That still happens with this patches, no specific budget is given to each. If no specific budget is given to each, then IMHO adding the second parameter is pointless & misleading. With regards, Daniel
On 26/06/2024 12:34, Daniel P. Berrangé wrote: > On Wed, Jun 26, 2024 at 12:29:41PM +0100, Joao Martins wrote: >> On 25/06/2024 19:37, Daniel P. Berrangé wrote: >>> On Tue, Jun 25, 2024 at 10:53:41AM -0400, Peter Xu wrote: >>>> Then the question is how should we suggest the user to specify these two >>>> parameters. >>>> >>>> The cover letter used: >>>> >>>> migrate_set_parameter downtime-limit 300 >>>> migrate_set_parameter switchover-limit 10 >>> >>> What this means is that in practice the total downtime limit >>> is 310 ms, however, expressing this as two parameters is >>> incredibly inflexible. >>> >>> If the actual RAM transfer downtime only took 50 ms, then why >>> should the switchover downtime still be limited to 10ms, when >>> we've still got a budget of 250 ms that was unused. >>> >> >> The downtime limit is 300, it's more than you are giving something *extra* 10ms >> when you switchover regardless of where that's spent. >> >> If it makes it easier to understand you could see this parameter as: >> >> 'downtime-limit-max-error' = 10 ms >> >> The name as proposed by the RFC was meant to honor what the error margin was >> meant for: to account for extra time during switchover. Adding this inside >> downtime-limit wouldn't work as it otherwise would be used solely for RAM >> transfer during precopy. >> >>> IOW, if my VM tolerates a downtime of 310ms, then I want that >>> 310ms spread across the RAM transfer downtime and switchover >>> downtime in *any* ratio. ALl that matters is the overall >>> completion time. >>> >> That still happens with this patches, no specific budget is given to each. > > If no specific budget is given to each, then IMHO adding the second > parameter is pointless & misleading. That is contradictory with your earlier statement. 
You redacted the part where I describe how this works in *the worst case* if the entire downtime-limit is used for RAM transfer then the switchover-limit might *implicitly* act as a budget: | Though implicitly if downtime-limit captures only RAM transfer, then in theory | if you're migrating a busy guest that happens to meet the SLA say | expected-downtime=290, then you have a total of 20 for switchover (thanks to | the extra 10 used in switchover-limit/downtime-limit-max-error 10). I am confused with what to make here. If budget is bad because any ratio should be used if available, but then the added parameter doesn't care about ratios specifically but *can* act as switchover ratio when RAM dominates downtime-limit. But now having no budget associated is also bad ... then what's your middle ground from your point of view to tackle switchover downtime being somehow accounted for?
On Wed, Jun 26, 2024 at 01:12:15PM +0100, Joao Martins wrote: > On 26/06/2024 12:34, Daniel P. Berrangé wrote: > > On Wed, Jun 26, 2024 at 12:29:41PM +0100, Joao Martins wrote: > >> On 25/06/2024 19:37, Daniel P. Berrangé wrote: > >>> On Tue, Jun 25, 2024 at 10:53:41AM -0400, Peter Xu wrote: > >>>> Then the question is how should we suggest the user to specify these two > >>>> parameters. > >>>> > >>>> The cover letter used: > >>>> > >>>> migrate_set_parameter downtime-limit 300 > >>>> migrate_set_parameter switchover-limit 10 > >>> > >>> What this means is that in practice the total downtime limit > >>> is 310 ms, however, expressing this as two parameters is > >>> incredibly inflexible. > >>> > >>> If the actual RAM transfer downtime only took 50 ms, then why > >>> should the switchover downtime still be limited to 10ms, when > >>> we've still got a budget of 250 ms that was unused. > >>> > >> > >> The downtime limit is 300, it's more than you are giving something *extra* 10ms > >> when you switchover regardless of where that's spent. > >> > >> If it makes it easier to understand you could see this parameter as: > >> > >> 'downtime-limit-max-error' = 10 ms > >> > >> The name as proposed by the RFC was meant to honor what the error margin was > >> meant for: to account for extra time during switchover. Adding this inside > >> downtime-limit wouldn't work as it otherwise would be used solely for RAM > >> transfer during precopy. > >> > >>> IOW, if my VM tolerates a downtime of 310ms, then I want that > >>> 310ms spread across the RAM transfer downtime and switchover > >>> downtime in *any* ratio. ALl that matters is the overall > >>> completion time. > >>> > >> That still happens with this patches, no specific budget is given to each. > > > > If no specific budget is given to each, then IMHO adding the second > > parameter is pointless & misleading. > > That is contradictory with your earlier statement. I don't think it is. 
> You redacted the part where I describe how this works in *the worst case* if the > entire downtime-limit is used for RAM transfer then the switchover-limit might > *implicitly* act as an budget: > > | Though implicitly if downtime-limit captures only RAM transfer, then in theory > | if you're migrating a busy guest that happens to meet the SLA say > | expected-downtime=290, then you have a total of 20 for switchover (thanks to > | the extra 10 used in switchover-limit/downtime-limit-max-error 10). > > I am confused with what to make here. If budget is bad because any ratio should > be used if available, but then the added parameter doesn't care about ratios > specifically but *can* act as switchover ratio when RAM dominates > downtime-limit. But now no budget is associated is also bad ... then what's your > middle ground from your point of view to tackle switchover downtime being > somehow accounted? The pre-existing 'downtime-limit' value should apply to anything that happens between src CPUs stopping, and dst CPUs starting. If we were not correctly accounting for some parts, we just need to fix that accounting, without adding extra time parameters. With regards, Daniel
On Wed, Jun 26, 2024 at 12:04:43PM +0100, Joao Martins wrote: > Are you thinking in something specifically? Not really. I don't think I have any idea on how to make it better, unfortunately, but we did some measurement too quite some time ago and I can share some below. > > Many "variables" affect this from the point we decide switchover, and at the > worst (likely) case it means having qemu subsystems declare empirical values on > how long it takes to suspend/resume/transfer-state to migration expected > downtime prediction equation. Part of the reason that having headroom within > downtime-limit was a simple 'catch-all' (from our PoV) in terms of > maintainability while giving user something to fallback for characterizing its > SLA. Yes, I think this might be a way to go, by starting with something that can catch all. > Personally, I think there's a tiny bit disconnect between what the user > desires when setting downtime-limit vs what it really does. downtime-limit right > now looks to be best viewed as 'precopy-ram-downtime-limit' :) That's fair to say indeed.. QEMU can try to do better on this, it's just not yet straightforward to know how. > Unless the accuracy work you're thinking is just having a better > migration algorithm at obtaining the best possible downtime for > outstanding-data/RAM *even if* downtime-limit is set at a high limit, > like giving 1) a grace period in the beginning of migration post first > dirty sync Can you elaborate on this one a bit? > or 2) a measured value with continually incrementing target downtime > limit until max downtime-limit set by user hits ... before defaulting to > the current behaviour of migrating as soon as expected downtime is within > the downtime-limit. As discussed in the last response, this could create > the 'downtime headroom' for getting the enforcement/SLA better > honored. Is this maybe your line of thinking? 
Not what I was referring, but I think such logic existed for years, it was just not implemented in QEMU. I know at least OpenStack implemented exactly that, where instead of keeping an internal smaller downtime_limit and keep increasing that, OpenStack will keep adjusting downtime_limit parameter from time to time, starting with a relatively low value. That is also what I would suggest to most people who cares about downtime, because QEMU does treat it pretty simple: if QEMU thinks it can switchover within the downtime specified, QEMU will just do it, even if it's not the best it can do. Do you think such idea should be instead implemented in QEMU, too? Note that this will also be not about "making downtime accurate", but "reducing downtime", it may depend on how we define downtime_limit in the context, perhaps, where in OpenStack's case it simply won't directly feed that parameter with the real max downtime the user allows. Since that wasn't my original purpose, what I meant is simply see ways to make downtime_limit accurate, and by analyzing the current downtimes (as you mentioned, using the downtime tracepoints; and I'd say kudos to you on suggesting that in a formal patch). 
Here's something we collected by our QE team, for example, on a pretty loaded system of 384 cores + 12TB: Checkpoints analysis: downtime-start -> vm-stopped: 267635.2 (us) vm-stopped -> iterable-saved: 3558506.2 (us) iterable-saved -> non-iterable-saved: 270352.2 (us) non-iterable-saved -> downtime-end: 144264.2 (us) total downtime: 4240758.0 (us) Iterable device analysis: Device SAVE of ram: 0 took 3470420 (us) Non-iterable device analysis: Device SAVE of cpu:121 took 118090 (us) Device SAVE of apic:167 took 6899 (us) Device SAVE of cpu:296 took 3795 (us) Device SAVE of 0000:00:02.2:00.0/virtio-blk: 0 took 638 (us) Device SAVE of cpu:213 took 630 (us) Device SAVE of 0000:00:02.0:00.0/virtio-net: 0 took 534 (us) Device SAVE of cpu:374 took 517 (us) Device SAVE of cpu: 31 took 503 (us) Device SAVE of cpu:346 took 497 (us) Device SAVE of 0000:00:02.0:00.1/virtio-net: 0 took 492 (us) (1341 vmsd omitted) In this case we also see the SLA violations since we specified something much lower than 4.2sec as downtime_limit. This might not be a good example, as I think when capturing the traces we used to still have the issue on inaccurate bw estimations, and that was why I introduced switchover-bandwidth parameter, I wished after that the result can be closer to downtime_limit but we never tried to test again. I am not sure either on whether that's the best way to address this. 
But let's just ignore the iterable save() huge delays (which can be explained, and hopefully will still be covered by downtime_limit calculations when it can try to get closer to right), and we can also see at least a few things we didn't account for: - stop vm: 268ms - non-iterables: 270ms - dest load until complete: 144ms For the last one, we did see another outlier where it can only be seen from dest: Non-iterable device analysis: Device LOAD of kvm-tpr-opt: 0 took 123976 (us) <----- this one Device LOAD of 0000:00:02.0/pcie-root-port: 0 took 6362 (us) Device LOAD of 0000:00:02.0:00.0/virtio-net: 0 took 4583 (us) Device LOAD of 0000:00:02.0:00.1/virtio-net: 0 took 4440 (us) Device LOAD of 0000:00:01.0/vga: 0 took 3740 (us) Device LOAD of 0000:00:00.0/mch: 0 took 3557 (us) Device LOAD of 0000:00:02.2:00.0/virtio-blk: 0 took 3530 (us) Device LOAD of 0000:00:02.1:00.0/xhci: 0 took 2712 (us) Device LOAD of 0000:00:02.1/pcie-root-port: 0 took 2046 (us) Device LOAD of 0000:00:02.2/pcie-root-port: 0 took 1890 (us) So we found either cpu save() taking 100+ms, or kvm-tpr-opt load() taking 100+ms. None of them sounds normal, and I didn't look into them. Now with a global ratio perhaps start to reflect "how much ratio of downtime_limit should we account into data transfer", then we'll also need to answer how the user should set that ratio value, and maybe there's a sane way to calculate that by the VM setup? I'm not sure, but those questions may need to be answered together in the next post, so that such parameter can be consumable. The answer doesn't need to be accurate, but I hope that can be based on some similar analysis like above (where I didn't do it well; as I don't think I looked into any of the issues, and maybe they're fix-able). But just to show what I meant. It'll also be great if, when doing the analysis, we find issues that are fix-able; then it'll be great we fix the issues instead. That's the part when I mentioned "I still prefer fixing downtime_limit itself". Thanks,
On Wed, Jun 26, 2024 at 02:41:34PM -0400, Peter Xu wrote: > On Wed, Jun 26, 2024 at 12:04:43PM +0100, Joao Martins wrote: > > Are you thinking in something specifically? > > Not really. I don't think I have any idea on how to make it better, > unfortunately, but we did some measurement too quite some time ago and I > can share some below. Hello Peter I apologize for such a long delay with the reply. > > > > > Many "variables" affect this from the point we decide switchover, and at the > > worst (likely) case it means having qemu subsystems declare empirical values on > > how long it takes to suspend/resume/transfer-state to migration expected > > downtime prediction equation. Part of the reason that having headroom within > > downtime-limit was a simple 'catch-all' (from our PoV) in terms of > > maintainability while giving user something to fallback for characterizing its > > SLA. > > Yes, I think this might be a way to go, by starting with something that can > catch all. Possibly the title "strict SLA" is not the best choice of words as it creates impression that the guarantees will be met. But essentially this switchover limit is a safeguard against the unknowns that can contribute to the downtime during the stop-copy and can be not that easy to account for (or even impossible due to hardware implementation or other issues). To show what kind of statistics we see in our environments and what are the main contributors please see below. 
Example 1: host migration, default downtime set to 300: Checkpoints analysis: checkpoint=src-downtime-start -> checkpoint=src-vm-stopped: 74244 (us) checkpoint=src-vm-stopped -> checkpoint=src-iterable-saved: 154493 (us) checkpoint=src-iterable-saved -> checkpoint=src-non-iterable-saved: 4746 (us) checkpoint=src-non-iterable-saved -> checkpoint=dst-precopy-loadvm-completed: 224981 (us) checkpoint=dst-precopy-loadvm-completed -> checkpoint=dst-precopy-bh-enter: 36 (us) checkpoint=dst-precopy-bh-enter -> checkpoint=dst-precopy-bh-announced: 7859 (us) checkpoint=dst-precopy-bh-announced -> checkpoint=dst-precopy-bh-vm-started: 15995 (us) checkpoint=dst-precopy-bh-vm-started -> checkpoint=src-downtime-end: 236 (us) Iterable device analysis: Device SAVE of ram: 0 took 151054 (us) Device LOAD of ram: 0 took 146855 (us) Device SAVE of 0000:20:04.0:00.0:00.0/vfio: 0 took 2127 (us) Device LOAD of 0000:20:04.0:00.0:00.0/vfio: 0 took 144202 (us) Non-iterable device analysis: Device LOAD of 0000:20:04.0:00.0:00.0/vfio: 0 took 67470 (us) Device LOAD of 0000:00:01.0/vga: 0 took 7527 (us) Device LOAD of 0000:00:02.0/e1000e: 0 took 1715 (us) Device LOAD of kvm-tpr-opt: 0 took 1697 (us) Device LOAD of 0000:00:03.0/virtio-blk: 0 took 1340 (us) Device SAVE of 0000:00:02.0/e1000e: 0 took 1036 (us) Device LOAD of 0000:00:00.0/mch: 0 took 1035 (us) Device LOAD of 0000:20:04.0:00.0/pcie-root-port: 0 took 976 (us) Device LOAD of 0000:00:1f.0/ICH9LPC: 0 took 851 (us) Device LOAD of 0000:00:1f.2/ich9_ahci: 0 took 578 (us) (qemu) info migrate globals: store-global-state: on only-migratable: off send-configuration: on send-section-footer: on decompress-error-check: on clear-bitmap-shift: 18 Migration status: completed total time: 5927 ms downtime: 483 ms setup: 78 ms transferred ram: 883709 kbytes throughput: 1237.71 mbps remaining ram: 0 kbytes total ram: 33571656 kbytes duplicate: 8192488 pages skipped: 0 pages normal: 201300 pages normal bytes: 805200 kbytes dirty sync count: 3 page size: 
4 kbytes multifd bytes: 0 kbytes pages-per-second: 958776 precopy ram: 480464 kbytes downtime ram: 398313 kbytes vfio device transferred: 4496 kbytes Example 2: different system than above, live migration over 100Gbit/s connection and 2 vfio virtual functions (the guest has no workload and vfio devices are not engaged in VM and have same amount of data to live migrate). Displayed outliers that are larger than 3 us. Save: 252812@1721976657.700972:vmstate_downtime_checkpoint src-downtime-start 252812@1721976657.829180:vmstate_downtime_checkpoint src-vm-stopped 252812@1721976657.967987:vmstate_downtime_save type=iterable idstr=ram instance_id=0 downtime=138005 252812@1721976658.093218:vmstate_downtime_save type=iterable idstr=0000:00:02.0/vfio instance_id=0 downtime=125188 252812@1721976658.318101:vmstate_downtime_save type=iterable idstr=0000:00:03.0/vfio instance_id=0 downtime=224857 252812@1721976658.318125:vmstate_downtime_checkpoint src-iterable-saved ... Load: 353062@1721976488.995582:vmstate_downtime_load type=iterable idstr=ram instance_id=0 downtime=117294 353062@1721976489.235227:vmstate_downtime_load type=iterable idstr=0000:00:02.0/vfio instance_id=0 downtime=239586 353062@1721976489.449736:vmstate_downtime_load type=iterable idstr=0000:00:03.0/vfio instance_id=0 downtime=214462 353062@1721976489.463260:vmstate_downtime_load type=non-iterable idstr=0000:00:01.0/vga instance_id=0 downtime=7522 353062@1721976489.575383:vmstate_downtime_load type=non-iterable idstr=0000:00:02.0/vfio instance_id=0 downtime=112113 353062@1721976489.686961:vmstate_downtime_load type=non-iterable idstr=0000:00:03.0/vfio instance_id=0 downtime=111545 ... 
(qemu) info migrate globals: store-global-state: on only-migratable: off send-configuration: on send-section-footer: on decompress-error-check: on clear-bitmap-shift: 18 Migration status: completed total time: 23510 ms downtime: 1018 ms setup: 380 ms transferred ram: 5317587 kbytes throughput: 1883.34 mbps remaining ram: 0 kbytes total ram: 209732424 kbytes duplicate: 51628634 pages skipped: 0 pages normal: 1159697 pages normal bytes: 4638788 kbytes dirty sync count: 4 page size: 4 kbytes multifd bytes: 4653988 kbytes pages-per-second: 1150272 precopy ram: 453652 kbytes downtime ram: 118 kbytes vfio device transferred: 209431 kbytes As it can be seen from above, the downtime gets violated and the main contributors are vfio devices. Also it can vary depending on the firmware version. I have to note that in one of 10 tests, the ram downtime is being much larger and then becomes the outlier. This is being investigated currently. This switchover overshoot is usually being reported as timed out queries. And to comment on precision, yes, this overshoot safeguard is not precise. In fact, the current implementation is not precise and has a granularity of the savevm_ handlers as it would only check the downtime overshoot after it is being completed. Maybe this part can be improved by delegating to some of the known abusers to observe the downtime overshoot on its own. > > > Personally, I think there's a tiny bit disconnect between what the user > > desires when setting downtime-limit vs what it really does. downtime-limit right > > now looks to be best viewed as 'precopy-ram-downtime-limit' :) > > That's fair to say indeed.. QEMU can try to do better on this, it's just > not yet straightforward to know how. Could be that the better known part of the downtime (and predictable taken the bandwidth is accurate), i.e. 
currently what is downtime-limit, serve as a starting point and be named as ram-downtime-limit and everything else would have switchover allowance of downtime_limit - ram-downtime-limit? The correct value of ram-downtime-limit would take few iterations after dirty sync to get established. I think that is somewhat similar to what Joao is thinking below in 1) and 2)? Thanks! Elena > > > Unless the accuracy work you're thinking is just having a better > > migration algorithm at obtaining the best possible downtime for > > outstanding-data/RAM *even if* downtime-limit is set at a high limit, > > like giving 1) a grace period in the beginning of migration post first > > dirty sync > > Can you elaborate on this one a bit? > > > or 2) a measured value with continually incrementing target downtime > > limit until max downtime-limit set by user hits ... before defaulting to > > the current behaviour of migrating as soon as expected downtime is within > > the downtime-limit. As discussed in the last response, this could create > > the 'downtime headroom' for getting the enforcement/SLA better > > honored. Is this maybe your line of thinking? > > Not what I was referring, but I think such logic existed for years, it was > just not implemented in QEMU. I know at least OpenStack implemented > exactly that, where instead of keeping an internal smaller downtime_limit > and keep increasing that, OpenStack will keep adjusting downtime_limit > parameter from time to time, starting with a relatively low value. > > That is also what I would suggest to most people who cares about downtime, > because QEMU does treat it pretty simple: if QEMU thinks it can switchover > within the downtime specified, QEMU will just do it, even if it's not the > best it can do. > > Do you think such idea should be instead implemented in QEMU, too? 
Note > that this will also be not about "making downtime accurate", but "reducing > downtime", it may depend on how we define downtime_limit in the context, > perhaps, where in OpenStack's case it simply won't directly feed that > parameter with the real max downtime the user allows. > > Since that wasn't my original purpose, what I meant is simply see ways to > make downtime_limit accurate, and by analyzing the current downtimes (as > you mentioned, using the downtime tracepoints; and I'd say kudos to you on > suggesting that in a formal patch). > > Here's something we collected by our QE team, for example, on a pretty > loaded system of 384 cores + 12TB: > > Checkpoints analysis: > > downtime-start -> vm-stopped: 267635.2 (us) > vm-stopped -> iterable-saved: 3558506.2 (us) > iterable-saved -> non-iterable-saved: 270352.2 (us) > non-iterable-saved -> downtime-end: 144264.2 (us) > total downtime: 4240758.0 (us) > > Iterable device analysis: > > Device SAVE of ram: 0 took 3470420 (us) > > Non-iterable device analysis: > > Device SAVE of cpu:121 took 118090 (us) > Device SAVE of apic:167 took 6899 (us) > Device SAVE of cpu:296 took 3795 (us) > Device SAVE of 0000:00:02.2:00.0/virtio-blk: 0 took 638 (us) > Device SAVE of cpu:213 took 630 (us) > Device SAVE of 0000:00:02.0:00.0/virtio-net: 0 took 534 (us) > Device SAVE of cpu:374 took 517 (us) > Device SAVE of cpu: 31 took 503 (us) > Device SAVE of cpu:346 took 497 (us) > Device SAVE of 0000:00:02.0:00.1/virtio-net: 0 took 492 (us) > (1341 vmsd omitted) > > In this case we also see the SLA violations since we specified something > much lower than 4.2sec as downtime_limit. > > This might not be a good example, as I think when capturing the traces we > used to still have the issue on inaccurate bw estimations, and that was why > I introduced switchover-bandwidth parameter, I wished after that the result > can be closer to downtime_limit but we never tried to test again. 
I am not > sure either on whether that's the best way to address this. > > But let's just ignore the iterable save() huge delays (which can be > explained, and hopefully will still be covered by downtime_limit > calculations when it can try to get closer to right), and we can also see > at least a few things we didn't account: > > - stop vm: 268ms > - non-iterables: 270ms > - dest load until complete: 144ms > > For the last one, we did see another outlier where it can only be seen from > dest: > > Non-iterable device analysis: > > Device LOAD of kvm-tpr-opt: 0 took 123976 (us) <----- this one > Device LOAD of 0000:00:02.0/pcie-root-port: 0 took 6362 (us) > Device LOAD of 0000:00:02.0:00.0/virtio-net: 0 took 4583 (us) > Device LOAD of 0000:00:02.0:00.1/virtio-net: 0 took 4440 (us) > Device LOAD of 0000:00:01.0/vga: 0 took 3740 (us) > Device LOAD of 0000:00:00.0/mch: 0 took 3557 (us) > Device LOAD of 0000:00:02.2:00.0/virtio-blk: 0 took 3530 (us) > Device LOAD of 0000:00:02.1:00.0/xhci: 0 took 2712 (us) > Device LOAD of 0000:00:02.1/pcie-root-port: 0 took 2046 (us) > Device LOAD of 0000:00:02.2/pcie-root-port: 0 took 1890 (us) > > So we found either cpu save() taking 100+ms, or kvm-tpr-opt load() taking > 100+ms. None of them sounds normal, and I didn't look into them. > > Now with a global ratio perhaps start to reflect "how much ratio of > downtime_limit should we account into data transfer", then we'll also need > to answer how the user should set that ratio value, and maybe there's a > sane way to calculate that by the VM setup? I'm not sure, but those > questions may need to be answered together in the next post, so that such > parameter can be consumable. > > The answer doesn't need to be accurate, but I hope that can be based on > some similar analysis like above (where I didn't do it well; as I don't > think I looked into any of the issues, and maybe they're fix-able). But > just to show what I meant. 
It'll be also great when doing the analysis we > found issues fix-able, then it'll be great we fix the issues intead. > That's the part when I mentioned "I still prefer fixing downtime_limit > itself". > > Thanks, > > -- > Peter Xu >
On Fri, Jul 26, 2024 at 12:41:57AM -0700, Elena Ufimtseva wrote: > On Wed, Jun 26, 2024 at 02:41:34PM -0400, Peter Xu wrote: > > On Wed, Jun 26, 2024 at 12:04:43PM +0100, Joao Martins wrote: > > > Are you thinking in something specifically? > > > > Not really. I don't think I have any idea on how to make it better, > > unfortunately, but we did some measurement too quite some time ago and I > > can share some below. > > > Hello Peter > > I apologize for such a long delay with the reply. I apologize back.. :) > > > > > > > > > Many "variables" affect this from the point we decide switchover, and at the > > > worst (likely) case it means having qemu subsystems declare empirical values on > > > how long it takes to suspend/resume/transfer-state to migration expected > > > downtime prediction equation. Part of the reason that having headroom within > > > downtime-limit was a simple 'catch-all' (from our PoV) in terms of > > > maintainability while giving user something to fallback for characterizing its > > > SLA. > > > > Yes, I think this might be a way to go, by starting with something that can > > catch all. > > > Possibly the title "strict SLA" is not the best choice of > words as it creates impression that the guarantees will be met. > But essentially this switchover limit is a safeguard against the unknowns > that can contribute to the downtime during the stop-copy and can be not > that easy to account for (or even impossible due to hardware > implementation or other issues). > > To show what kind of statistics we see in our environments and what > are the main contributors please see below. 
> > Example 1: host migration, default downtime set to 300: > > Checkpoints analysis: > > checkpoint=src-downtime-start -> checkpoint=src-vm-stopped: 74244 (us) > checkpoint=src-vm-stopped -> checkpoint=src-iterable-saved: 154493 (us) > checkpoint=src-iterable-saved -> checkpoint=src-non-iterable-saved: 4746 (us) > checkpoint=src-non-iterable-saved -> checkpoint=dst-precopy-loadvm-completed: 224981 (us) This is the first cross-host time diff, I assume the two hosts are synchronized and the time diff is pretty low. I guess part like this is not accounted in the switchover decision by src QEMU, with the hope that after src QEMU sent everything with the estimated bandwidth the dest QEMU should have (mostly) loaded everything. It might be interesting to know when this can grow much larger than we expect. More below.. > checkpoint=dst-precopy-loadvm-completed -> checkpoint=dst-precopy-bh-enter: 36 (us) > checkpoint=dst-precopy-bh-enter -> checkpoint=dst-precopy-bh-announced: 7859 (us) > checkpoint=dst-precopy-bh-announced -> checkpoint=dst-precopy-bh-vm-started: 15995 (us) > checkpoint=dst-precopy-bh-vm-started -> checkpoint=src-downtime-end: 236 (us) > > Iterable device analysis: > > Device SAVE of ram: 0 took 151054 (us) > Device LOAD of ram: 0 took 146855 (us) > Device SAVE of 0000:20:04.0:00.0:00.0/vfio: 0 took 2127 (us) > Device LOAD of 0000:20:04.0:00.0:00.0/vfio: 0 took 144202 (us) > > Non-iterable device analysis: > > Device LOAD of 0000:20:04.0:00.0:00.0/vfio: 0 took 67470 (us) > Device LOAD of 0000:00:01.0/vga: 0 took 7527 (us) > Device LOAD of 0000:00:02.0/e1000e: 0 took 1715 (us) > Device LOAD of kvm-tpr-opt: 0 took 1697 (us) > Device LOAD of 0000:00:03.0/virtio-blk: 0 took 1340 (us) > Device SAVE of 0000:00:02.0/e1000e: 0 took 1036 (us) > Device LOAD of 0000:00:00.0/mch: 0 took 1035 (us) > Device LOAD of 0000:20:04.0:00.0/pcie-root-port: 0 took 976 (us) > Device LOAD of 0000:00:1f.0/ICH9LPC: 0 took 851 (us) > Device LOAD of 0000:00:1f.2/ich9_ahci: 0 took 
578 (us) > > (qemu) info migrate > globals: > store-global-state: on > only-migratable: off > send-configuration: on > send-section-footer: on > decompress-error-check: on > clear-bitmap-shift: 18 > Migration status: completed > total time: 5927 ms > downtime: 483 ms > setup: 78 ms > transferred ram: 883709 kbytes > throughput: 1237.71 mbps > remaining ram: 0 kbytes > total ram: 33571656 kbytes > duplicate: 8192488 pages > skipped: 0 pages > normal: 201300 pages > normal bytes: 805200 kbytes > dirty sync count: 3 > page size: 4 kbytes > multifd bytes: 0 kbytes > pages-per-second: 958776 > precopy ram: 480464 kbytes > downtime ram: 398313 kbytes > vfio device transferred: 4496 kbytes > > Example 2: different system than above, live migration over 100Gbit/s > connection and 2 vfio virtual functions (the guest has no workload and > vfio devices are not engaged in VM and have same amount of data to live > migrate). > > Displayed outliers that are larger than 3 us. > > Save: > 252812@1721976657.700972:vmstate_downtime_checkpoint src-downtime-start > 252812@1721976657.829180:vmstate_downtime_checkpoint src-vm-stopped > 252812@1721976657.967987:vmstate_downtime_save type=iterable idstr=ram instance_id=0 downtime=138005 > 252812@1721976658.093218:vmstate_downtime_save type=iterable idstr=0000:00:02.0/vfio instance_id=0 downtime=125188 > 252812@1721976658.318101:vmstate_downtime_save type=iterable idstr=0000:00:03.0/vfio instance_id=0 downtime=224857 > 252812@1721976658.318125:vmstate_downtime_checkpoint src-iterable-saved > ... 
> > Load: > 353062@1721976488.995582:vmstate_downtime_load type=iterable idstr=ram instance_id=0 downtime=117294 > 353062@1721976489.235227:vmstate_downtime_load type=iterable idstr=0000:00:02.0/vfio instance_id=0 downtime=239586 > 353062@1721976489.449736:vmstate_downtime_load type=iterable idstr=0000:00:03.0/vfio instance_id=0 downtime=214462 > 353062@1721976489.463260:vmstate_downtime_load type=non-iterable idstr=0000:00:01.0/vga instance_id=0 downtime=7522 > 353062@1721976489.575383:vmstate_downtime_load type=non-iterable idstr=0000:00:02.0/vfio instance_id=0 downtime=112113 > 353062@1721976489.686961:vmstate_downtime_load type=non-iterable idstr=0000:00:03.0/vfio instance_id=0 downtime=111545 > ... > > (qemu) info migrate > globals: > store-global-state: on > only-migratable: off > send-configuration: on > send-section-footer: on > decompress-error-check: on > clear-bitmap-shift: 18 > Migration status: completed > total time: 23510 ms > downtime: 1018 ms > setup: 380 ms > transferred ram: 5317587 kbytes > throughput: 1883.34 mbps > remaining ram: 0 kbytes > total ram: 209732424 kbytes > duplicate: 51628634 pages > skipped: 0 pages > normal: 1159697 pages > normal bytes: 4638788 kbytes > dirty sync count: 4 > page size: 4 kbytes > multifd bytes: 4653988 kbytes > pages-per-second: 1150272 > precopy ram: 453652 kbytes > downtime ram: 118 kbytes > vfio device transferred: 209431 kbytes > > As it can be seen from above, the downtime gets violated and the main > contributors are vfio devices. Also it can very depending on the > firmare version. So have you figured out how VFIO affected this? I thought VFIO will also report pending information just like RAM, in ->state_pending_exact(). However I don't think such reports involves things like read()/write() over the VFIO fds. I assume that can be a major issue that won't be captured by any form of pending data reports. 
It might be useful to first identify / verify this issue, then think about how to improve the estimation of downtimes, e.g. by reporting "extra time" (in this case, the time to do read() / write() from the device driver) needed for the final dump stage for either iterative/non-iterative data. > I have to note that in one of 10 tests, the ram downtime is being much > larger and becomes then the outlier. This is being investigated > currently. > > This switchover overshoot is usually being reported as timed out queries. > And to comment on precision, yes, this overshoot safeguard is not > precise. > In fact, the current implementation is not precise and has a granularity of the savevm_ > handlers as it would only check the downtime overshoot after it is being > completed. Maybe this part can be improved by delegating to some of the known > abusers to observe the downtime overshoot on its own. > > > > > > Personally, I think there's a tiny bit disconnect between what the user > > > desires when setting downtime-limit vs what it really does. downtime-limit right > > > now looks to be best viewed as 'precopy-ram-downtime-limit' :) > > > > That's fair to say indeed.. QEMU can try to do better on this, it's just > > not yet straightforward to know how. > > Could be that the better known part of the downtime (and predictable > taken the bandwidth is accurate), i.e. currently what is downtime-limit, serve > as a starting point and be named as ram-downtime-limit and everything > else would have switchover allowance of downtime_limit - ram-downtime-limit? > The correct value of ram-downtime-limit would take few iterations > after dirty sync to get established. > > I think that is somewhat similar to what Joao is thinking below in 1) and 2)? I'm personally curious on whether you can nail the current issues down to VFIO, and if so whether there can be something we can do already to improve downtime estimation when VFIO is involved, irrelevant of the rest. 
For a generic idea of improving downtime, I think as long as we stick with downtime_limit parameter untouched (reflecting the total & max downtime), then we can add some new ideas on top. Thanks, > > Thanks! > > Elena > > > > > Unless the accuracy work you're thinking is just having a better > > > migration algorithm at obtaining the best possible downtime for > > > outstanding-data/RAM *even if* downtime-limit is set at a high limit, > > > like giving 1) a grace period in the beginning of migration post first > > > dirty sync > > > > Can you elaborate on this one a bit? > > > > > > or 2) a measured value with continually incrementing target downtime > > > limit until max downtime-limit set by user hits ... before defaulting to > > > the current behaviour of migrating as soon as expected downtime is within > > > the downtime-limit. As discussed in the last response, this could create > > > the 'downtime headroom' for getting the enforcement/SLA better > > > honored. Is this maybe your line of thinking? > > > > > Not what I was referring, but I think such logic existed for years, it was > > just not implemented in QEMU. I know at least OpenStack implemented > > exactly that, where instead of keeping an internal smaller downtime_limit > > and keep increasing that, OpenStack will keep adjusting downtime_limit > > parameter from time to time, starting with a relatively low value. > > > > That is also what I would suggest to most people who cares about downtime, > > because QEMU does treat it pretty simple: if QEMU thinks it can switchover > > within the downtime specified, QEMU will just do it, even if it's not the > > best it can do. > > > > Do you think such idea should be instead implemented in QEMU, too? 
Note > > that this will also be not about "making downtime accurate", but "reducing > > downtime", it may depend on how we define downtime_limit in the context, > > perhaps, where in OpenStack's case it simply won't directly feed that > > parameter with the real max downtime the user allows. > > > > Since that wasn't my original purpose, what I meant is simply see ways to > > make downtime_limit accurate, and by analyzing the current downtimes (as > > you mentioned, using the downtime tracepoints; and I'd say kudos to you on > > suggesting that in a formal patch). > > > > Here's something we collected by our QE team, for example, on a pretty > > loaded system of 384 cores + 12TB: > > > > Checkpoints analysis: > > > > downtime-start -> vm-stopped: 267635.2 (us) > > vm-stopped -> iterable-saved: 3558506.2 (us) > > iterable-saved -> non-iterable-saved: 270352.2 (us) > > non-iterable-saved -> downtime-end: 144264.2 (us) > > total downtime: 4240758.0 (us) > > > > Iterable device analysis: > > > > Device SAVE of ram: 0 took 3470420 (us) > > > > Non-iterable device analysis: > > > > Device SAVE of cpu:121 took 118090 (us) > > Device SAVE of apic:167 took 6899 (us) > > Device SAVE of cpu:296 took 3795 (us) > > Device SAVE of 0000:00:02.2:00.0/virtio-blk: 0 took 638 (us) > > Device SAVE of cpu:213 took 630 (us) > > Device SAVE of 0000:00:02.0:00.0/virtio-net: 0 took 534 (us) > > Device SAVE of cpu:374 took 517 (us) > > Device SAVE of cpu: 31 took 503 (us) > > Device SAVE of cpu:346 took 497 (us) > > Device SAVE of 0000:00:02.0:00.1/virtio-net: 0 took 492 (us) > > (1341 vmsd omitted) > > > > In this case we also see the SLA violations since we specified something > > much lower than 4.2sec as downtime_limit. 
> > > > This might not be a good example, as I think when capturing the traces we > > used to still have the issue on inaccurate bw estimations, and that was why > > I introduced switchover-bandwidth parameter, I wished after that the result > > can be closer to downtime_limit but we never tried to test again. I am not > > sure either on whether that's the best way to address this. > > > > But let's just ignore the iterable save() huge delays (which can be > > explained, and hopefully will still be covered by downtime_limit > > calculations when it can try to get closer to right), and we can also see > > at least a few things we didn't account: > > > > - stop vm: 268ms > > - non-iterables: 270ms > > - dest load until complete: 144ms > > > > For the last one, we did see another outlier where it can only be seen from > > dest: > > > > Non-iterable device analysis: > > > > Device LOAD of kvm-tpr-opt: 0 took 123976 (us) <----- this one > > Device LOAD of 0000:00:02.0/pcie-root-port: 0 took 6362 (us) > > Device LOAD of 0000:00:02.0:00.0/virtio-net: 0 took 4583 (us) > > Device LOAD of 0000:00:02.0:00.1/virtio-net: 0 took 4440 (us) > > Device LOAD of 0000:00:01.0/vga: 0 took 3740 (us) > > Device LOAD of 0000:00:00.0/mch: 0 took 3557 (us) > > Device LOAD of 0000:00:02.2:00.0/virtio-blk: 0 took 3530 (us) > > Device LOAD of 0000:00:02.1:00.0/xhci: 0 took 2712 (us) > > Device LOAD of 0000:00:02.1/pcie-root-port: 0 took 2046 (us) > > Device LOAD of 0000:00:02.2/pcie-root-port: 0 took 1890 (us) > > > > So we found either cpu save() taking 100+ms, or kvm-tpr-opt load() taking > > 100+ms. None of them sounds normal, and I didn't look into them. > > > > > > > Now with a global ratio perhaps start to reflect "how much ratio of > > downtime_limit should we account into data transfer", then we'll also need > > to answer how the user should set that ratio value, and maybe there's a > > sane way to calculate that by the VM setup? 
I'm not sure, but those > > questions may need to be answered together in the next post, so that such > > parameter can be consumable. > > > > > The answer doesn't need to be accurate, but I hope that can be based on > > some similar analysis like above (where I didn't do it well; as I don't > > think I looked into any of the issues, and maybe they're fix-able). But > > just to show what I meant. It'll be also great when doing the analysis we > > found issues fix-able, then it'll be great we fix the issues intead. > > That's the part when I mentioned "I still prefer fixing downtime_limit > > itself". > > > > Thanks, > > > > -- > > Peter Xu > > >
diff --git a/migration/migration.c b/migration/migration.c index 5cc304d2db..64d7290997 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -240,6 +240,8 @@ void migration_object_init(void) current_incoming->page_requested = g_tree_new(page_request_addr_cmp); current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR; + /* Downtime will start when source sends its current downtime. */ + current_incoming->downtime_start = 0; migration_object_check(current_migration, &error_fatal); diff --git a/migration/migration.h b/migration/migration.h index aa56b70795..06f4ebe214 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -230,6 +230,21 @@ struct MigrationIncomingState { /* Do exit on incoming migration failure */ bool exit_on_error; + + /* Initial downtime on destination set by MIG_CMD_SEND_SRC_DOWNTIME */ + uint64_t downtime_start; + /* + * Current donwtime on destination that initially set equal to source by + * MIG_CMD_SEND_SRC_DOWNTIME, then updated by destination itself. + */ + uint64_t downtime_now; + /* + * Abort live migration on destination when current destination downtime + * exceeds the abort_limit. abort_limit is being set by + * MIG_CMD_SEND_SRC_DOWNTIME sent from source. 
+ */ + uint64_t abort_limit; + uint64_t src_downtime; }; MigrationIncomingState *migration_incoming_get_current(void); diff --git a/migration/savevm.c b/migration/savevm.c index 031ab03915..f3b5ea98bf 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -90,6 +90,7 @@ enum qemu_vm_cmd { MIG_CMD_ENABLE_COLO, /* Enable COLO */ MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */ MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */ + MIG_CMD_SEND_SRC_DOWNTIME, /* Send current downtime to dst */ MIG_CMD_MAX }; @@ -109,6 +110,7 @@ static struct mig_cmd_args { [MIG_CMD_POSTCOPY_RESUME] = { .len = 0, .name = "POSTCOPY_RESUME" }, [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" }, [MIG_CMD_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" }, + [MIG_CMD_SEND_SRC_DOWNTIME] = { .len = -1, .name = "SEND_SRC_DOWNTIME" }, [MIG_CMD_MAX] = { .len = -1, .name = "MAX" }, }; @@ -1218,6 +1220,18 @@ void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name) qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf); } +void qemu_savevm_send_downtime(QEMUFile *f, int64_t abort_limit_ms, + int64_t source_downtime) +{ + uint64_t tmp[2]; + tmp[0] = cpu_to_be64(abort_limit_ms); + tmp[1] = cpu_to_be64(source_downtime); + + trace_qemu_savevm_send_downtime(abort_limit_ms, source_downtime); + qemu_savevm_command_send(f, MIG_CMD_SEND_SRC_DOWNTIME, + 16, (uint8_t *)tmp); +} + bool qemu_savevm_state_blocked(Error **errp) { SaveStateEntry *se; @@ -1635,6 +1649,14 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only, } } + if (migrate_switchover_abort()) { + MigrationState *s = migrate_get_current(); + uint64_t abort_limit_ms = + s->parameters.downtime_limit + s->parameters.switchover_limit; + qemu_savevm_send_downtime(f, abort_limit_ms, + migration_get_current_downtime(s)); + } + if (iterable_only) { goto flush; } @@ -1919,6 +1941,20 @@ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis, return 0; } +static int 
loadvm_handle_src_downtime(MigrationIncomingState *mis, + uint16_t len) +{ + uint64_t src_abort_limit = qemu_get_be64(mis->from_src_file); + uint64_t src_current_downtime = qemu_get_be64(mis->from_src_file); + + mis->abort_limit = src_abort_limit; + mis->src_downtime = src_current_downtime; + mis->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + + trace_loadvm_handle_src_downtime(src_abort_limit, src_current_downtime); + return 0; +} + /* After postcopy we will be told to throw some pages away since they're * dirty and will have to be demand fetched. Must happen before CPU is * started. @@ -2540,6 +2576,9 @@ static int loadvm_process_command(QEMUFile *f) case MIG_CMD_ENABLE_COLO: return loadvm_process_enable_colo(mis); + + case MIG_CMD_SEND_SRC_DOWNTIME: + return loadvm_handle_src_downtime(mis, len); } return 0; @@ -2659,6 +2698,18 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis, trace_vmstate_downtime_load("non-iterable", se->idstr, se->instance_id, end_ts - start_ts); } + if (migrate_switchover_abort() && type == QEMU_VM_SECTION_FULL && + mis->downtime_start) { + mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + uint64_t dst_downtime = mis->downtime_now - mis->downtime_start; + if (mis->src_downtime + dst_downtime >= mis->abort_limit) { + error_report("Shutdown destination migration, migration abort_limit" + "(%lu ms) was reached.", mis->abort_limit); + trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime, + mis->src_downtime); + return -EINVAL; + } + } if (!check_section_footer(f, se)) { return -EINVAL; @@ -2714,6 +2765,19 @@ qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis, se->instance_id, end_ts - start_ts); } + if (migrate_switchover_abort() && type == QEMU_VM_SECTION_END && + mis->downtime_start) { + mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + uint64_t dst_downtime = mis->downtime_now - mis->downtime_start; + if (mis->src_downtime + dst_downtime >= 
mis->abort_limit) { + error_report("Shutdown destination migration, migration abort_limit (%lu ms)" + "was reached.", mis->abort_limit); + trace_qemu_loadvm_downtime_abort(mis->abort_limit, dst_downtime, + mis->src_downtime); + return -EINVAL; + } + } + if (!check_section_footer(f, se)) { return -EINVAL; } @@ -2901,6 +2965,10 @@ retry: } trace_qemu_loadvm_state_section(section_type); + /* Start destination timer after we receive the downtime from source. */ + if (mis->downtime_start) { + mis->downtime_now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + } switch (section_type) { case QEMU_VM_SECTION_START: case QEMU_VM_SECTION_FULL: diff --git a/migration/savevm.h b/migration/savevm.h index 9ec96a995c..1166c341f3 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -53,6 +53,8 @@ void qemu_savevm_send_postcopy_listen(QEMUFile *f); void qemu_savevm_send_postcopy_run(QEMUFile *f); void qemu_savevm_send_postcopy_resume(QEMUFile *f); void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name); +void qemu_savevm_send_downtime(QEMUFile *f, int64_t switchover_abort_ms, + int64_t source_downtime_ms); void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name, uint16_t len, diff --git a/migration/trace-events b/migration/trace-events index 0b7c3324fb..977988848a 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -54,6 +54,9 @@ postcopy_pause_incoming(void) "" postcopy_pause_incoming_continued(void) "" postcopy_page_req_sync(void *host_addr) "sync page req %p" +qemu_savevm_send_downtime(uint64_t src_abort_limit_ms, uint64_t source_downtime) "switchover abort limit=%"PRIi64", source downtime=%"PRIi64 +loadvm_handle_src_downtime(uint64_t src_abort_limit_ms, uint64_t source_downtime) "switchover abort limit=%"PRIi64", source downtime=%"PRIi64 +qemu_loadvm_downtime_abort(uint64_t abort_limit_ms, uint64_t dst_downtime, uint64_t source_downtime) "switchover abort limit=%"PRIi64", destination downtime=%"PRIi64", source downtime=%"PRIi64 # 
vmstate.c vmstate_load_field_error(const char *field, int ret) "field \"%s\" load failed, ret = %d" vmstate_load_state(const char *name, int version_id) "%s v%d"
During live migration, receive current downtime from source and start a downtime timer. When the destination downtime and added source downtime exceed downtime limit for more than switchover limit, abort live migration on destination. Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com> --- migration/migration.c | 2 ++ migration/migration.h | 15 ++++++++++ migration/savevm.c | 68 ++++++++++++++++++++++++++++++++++++++++++ migration/savevm.h | 2 ++ migration/trace-events | 3 ++ 5 files changed, 90 insertions(+)