Message ID | 20161104165933.GA3027@amt.cnet (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Marcelo Tosatti <mtosatti@redhat.com> wrote: > This patch, relative to pre-copy migration codepath, > measures the time between vm_stop() and pre_save(), > which includes copying the remaining RAM to destination, > and advances the clock by that amount. > > In a VM with 5 seconds downtime, this reduces the guest > clock difference on destination from 5s to 0.2s. > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com> > > --- > > v2: use subsection (Juan Quintela) > fix older machine types support > Reviewed-by: Juan Quintela <quintela@redhat.com>
* Marcelo Tosatti (mtosatti@redhat.com) wrote: > This patch, relative to pre-copy migration codepath, > measures the time between vm_stop() and pre_save(), > which includes copying the remaining RAM to destination, > and advances the clock by that amount. > > In a VM with 5 seconds downtime, this reduces the guest > clock difference on destination from 5s to 0.2s. > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. One thing that bothers me is that it's only this clock that's getting corrected; doesn't it cause things to get upset when one clock moves and the others dont? Shouldn't the pause delay be recorded somewhere architecturally independent and then be a thing that kvm-clock happens to use and other clocks might as well? Dave > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com> > > --- > > v2: use subsection (Juan Quintela) > fix older machine types support > > diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c > index 0f75dd3..a2a02ac 100644 > --- a/hw/i386/kvm/clock.c > +++ b/hw/i386/kvm/clock.c > @@ -22,9 +22,11 @@ > #include "kvm_i386.h" > #include "hw/sysbus.h" > #include "hw/kvm/clock.h" > +#include "migration/migration.h" > > #include <linux/kvm.h> > #include <linux/kvm_para.h> > +#include <time.h> > > #define TYPE_KVM_CLOCK "kvmclock" > #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK) > @@ -35,7 +37,13 @@ typedef struct KVMClockState { > /*< public >*/ > > uint64_t clock; > + uint64_t ns; > bool clock_valid; > + > + uint64_t advance_clock; > + struct timespec t_aftervmstop; > + > + bool adv_clock_enabled; > } KVMClockState; > > struct pvclock_vcpu_time_info { > @@ -100,6 +108,11 @@ static void kvmclock_vm_state_change(void *opaque, int running, > s->clock = time_at_migration; > } > > + if (s->advance_clock && s->clock + s->advance_clock > s->clock) { > + s->clock += s->advance_clock; > + s->advance_clock = 0; > + } > + > data.clock = s->clock; > ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data); > if (ret < 0) { > @@ -135,6 +148,18 @@ static void kvmclock_vm_state_change(void *opaque, int running, > abort(); > } > s->clock = data.clock; > + /* > + * Transition from VM-running to VM-stopped via migration? > + * Record when the VM was stopped. > + */ > + > + if (state == RUN_STATE_FINISH_MIGRATE && > + !migration_in_postcopy(migrate_get_current())) { > + clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop); > + } else { > + s->t_aftervmstop.tv_sec = 0; > + s->t_aftervmstop.tv_nsec = 0; > + } > > /* > * If the VM is stopped, declare the clock state valid to > @@ -152,6 +177,77 @@ static void kvmclock_realize(DeviceState *dev, Error **errp) > qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s); > } > > +static uint64_t clock_delta(struct timespec *before, struct timespec *after) > +{ > + if (before->tv_sec > after->tv_sec || > + (before->tv_sec == after->tv_sec && > + before->tv_nsec > after->tv_nsec)) { > + fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec)," > + "after=(%ld sec, %ld nsec)\n", before->tv_sec, > + before->tv_nsec, after->tv_sec, after->tv_nsec); > + abort(); > + } > + > + return (after->tv_sec - before->tv_sec) * 1000000000ULL + > + after->tv_nsec - before->tv_nsec; > +} > + > +static void kvmclock_pre_save(void *opaque) > +{ > + KVMClockState *s = opaque; > + struct timespec now; > + uint64_t ns; > + > + if (s->t_aftervmstop.tv_sec == 0) { > + return; > + } > + > + clock_gettime(CLOCK_MONOTONIC, &now); > + > + ns = clock_delta(&s->t_aftervmstop, &now); > + > + /* > + * Linux guests can overflow if time jumps > + * forward in large increments. > + * Cap maximum adjustment to 10 minutes. > + */ > + ns = MIN(ns, 600000000000ULL); > + > + if (s->clock + ns > s->clock) { > + s->ns = ns; > + } > +} > + > +static int kvmclock_post_load(void *opaque, int version_id) > +{ > + KVMClockState *s = opaque; > + > + /* save the value from incoming migration */ > + s->advance_clock = s->ns; > + > + return 0; > +} > + > +static bool kvmclock_ns_needed(void *opaque) > +{ > + KVMClockState *s = opaque; > + > + return s->adv_clock_enabled; > +} > + > +static const VMStateDescription kvmclock_advance_ns = { > + .name = "kvmclock/advance_ns", > + .version_id = 1, > + .minimum_version_id = 1, > + .needed = kvmclock_ns_needed, > + .pre_save = kvmclock_pre_save, > + .post_load = kvmclock_post_load, > + .fields = (VMStateField[]) { > + VMSTATE_UINT64(ns, KVMClockState), > + VMSTATE_END_OF_LIST() > + } > +}; > + > static const VMStateDescription kvmclock_vmsd = { > .name = "kvmclock", > .version_id = 1, > @@ -159,15 +255,25 @@ static const VMStateDescription kvmclock_vmsd = { > .fields = (VMStateField[]) { > VMSTATE_UINT64(clock, KVMClockState), > VMSTATE_END_OF_LIST() > + }, > + .subsections = (const VMStateDescription * []) { > + &kvmclock_advance_ns, > + NULL > } > }; > > +static Property kvmclock_properties[] = { > + DEFINE_PROP_BOOL("advance_clock", KVMClockState, adv_clock_enabled, true), > + DEFINE_PROP_END_OF_LIST(), > +}; > + > static void kvmclock_class_init(ObjectClass *klass, void *data) > { > DeviceClass *dc = DEVICE_CLASS(klass); > > dc->realize = kvmclock_realize; > dc->vmsd = &kvmclock_vmsd; > + dc->props = kvmclock_properties; > } > > static const TypeInfo kvmclock_info = { > diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h > index 98dc772..243352e 100644 > --- a/include/hw/i386/pc.h > +++ b/include/hw/i386/pc.h > @@ -370,6 +370,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); > #define PC_COMPAT_2_7 \ > HW_COMPAT_2_7 \ > {\ > + .driver = "kvmclock",\ > + .property = "advance_clock",\ > + .value = "off",\ > + },\ > + {\ > .driver = TYPE_X86_CPU,\ > .property = "l3-cache",\ > .value = "off",\ > -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote: > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > > This patch, relative to pre-copy migration codepath, > > measures the time between vm_stop() and pre_save(), > > which includes copying the remaining RAM to destination, > > and advances the clock by that amount. > > > > In a VM with 5 seconds downtime, this reduces the guest > > clock difference on destination from 5s to 0.2s. > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. > > One thing that bothers me is that it's only this clock that's > getting corrected; doesn't it cause things to get upset when > one clock moves and the others dont? If you are correlating the clocks, then yes. Older Linux guests get upset (marking the TSC clocksource unstable because the watchdog checks TSC vs kvmclock), but there is a workaround for it in newer guests (kvmclock interface to notify watchdog to not complain). Note marking TSC clocksource unstable on older guests is harmless because kvmclock is the standard clocksource. For Windows guests, i don't know that Windows correlates between different clocks. That is, there is relative control as to which software reads kvmclock or Windows TIMER MSR, so i don't see the need to advance every clock exposed. > Shouldn't the pause delay be recorded somewhere architecturally > independent and then be a thing that kvm-clock happens to use and > other clocks might as well? In theory, yes. In practice, i don't see the need for this...
* Marcelo Tosatti (mtosatti@redhat.com) wrote: > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote: > > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > > > This patch, relative to pre-copy migration codepath, > > > measures the time between vm_stop() and pre_save(), > > > which includes copying the remaining RAM to destination, > > > and advances the clock by that amount. > > > > > > In a VM with 5 seconds downtime, this reduces the guest > > > clock difference on destination from 5s to 0.2s. > > > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. > > > > One thing that bothers me is that it's only this clock that's > > getting corrected; doesn't it cause things to get upset when > > one clock moves and the others dont? > > If you are correlating the clocks, then yes. > > Older Linux guests get upset (marking the TSC clocksource unstable > because the watchdog checks TSC vs kvmclock), but there is a workaround for it > in newer guests > (kvmclock interface to notify watchdog to not complain). > > Note marking TSC clocksource unstable on older guests is harmless > because kvmclock is the standard clocksource. > > For Windows guests, i don't know that Windows correlates between different > clocks. > > That is, there is relative control as to which software reads kvmclock > or Windows TIMER MSR, so i don't see the need to advance every clock > exposed. > > > Shouldn't the pause delay be recorded somewhere architecturally > > independent and then be a thing that kvm-clock happens to use and > > other clocks might as well? > > In theory, yes. In practice, i don't see the need for this... It seems unlikely to me that x86 is the only one that will want to do something similar. Dave -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote: > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote: > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > > > > This patch, relative to pre-copy migration codepath, > > > > measures the time between vm_stop() and pre_save(), > > > > which includes copying the remaining RAM to destination, > > > > and advances the clock by that amount. > > > > > > > > In a VM with 5 seconds downtime, this reduces the guest > > > > clock difference on destination from 5s to 0.2s. > > > > > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. > > > > > > One thing that bothers me is that it's only this clock that's > > > getting corrected; doesn't it cause things to get upset when > > > one clock moves and the others dont? > > > > If you are correlating the clocks, then yes. > > > > Older Linux guests get upset (marking the TSC clocksource unstable > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it > > in newer guests > > (kvmclock interface to notify watchdog to not complain). > > > > Note marking TSC clocksource unstable on older guests is harmless > > because kvmclock is the standard clocksource. > > > > For Windows guests, i don't know that Windows correlates between different > > clocks. > > > > That is, there is relative control as to which software reads kvmclock > > or Windows TIMER MSR, so i don't see the need to advance every clock > > exposed. > > > > > Shouldn't the pause delay be recorded somewhere architecturally > > > independent and then be a thing that kvm-clock happens to use and > > > other clocks might as well? > > > > In theory, yes. In practice, i don't see the need for this... > > It seems unlikely to me that x86 is the only one that will want > to do something similar. Can't they copy what kvmclock is doing today?
* Marcelo Tosatti (mtosatti@redhat.com) wrote: > On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote: > > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote: > > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > > > > > This patch, relative to pre-copy migration codepath, > > > > > measures the time between vm_stop() and pre_save(), > > > > > which includes copying the remaining RAM to destination, > > > > > and advances the clock by that amount. > > > > > > > > > > In a VM with 5 seconds downtime, this reduces the guest > > > > > clock difference on destination from 5s to 0.2s. > > > > > > > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. > > > > > > > > One thing that bothers me is that it's only this clock that's > > > > getting corrected; doesn't it cause things to get upset when > > > > one clock moves and the others dont? > > > > > > If you are correlating the clocks, then yes. > > > > > > Older Linux guests get upset (marking the TSC clocksource unstable > > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it > > > in newer guests > > > (kvmclock interface to notify watchdog to not complain). > > > > > > Note marking TSC clocksource unstable on older guests is harmless > > > because kvmclock is the standard clocksource. > > > > > > For Windows guests, i don't know that Windows correlates between different > > > clocks. > > > > > > That is, there is relative control as to which software reads kvmclock > > > or Windows TIMER MSR, so i don't see the need to advance every clock > > > exposed. > > > > > > > Shouldn't the pause delay be recorded somewhere architecturally > > > > independent and then be a thing that kvm-clock happens to use and > > > > other clocks might as well? > > > > > > In theory, yes. In practice, i don't see the need for this... > > > > It seems unlikely to me that x86 is the only one that will want > > to do something similar. > > Can't they copy what kvmclock is doing today? We shouldn't have copies of code all over should we? Dave -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On Tue, Nov 08, 2016 at 10:22:56AM +0000, Dr. David Alan Gilbert wrote: > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > > On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote: > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > > > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote: > > > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > > > > > > This patch, relative to pre-copy migration codepath, > > > > > > measures the time between vm_stop() and pre_save(), > > > > > > which includes copying the remaining RAM to destination, > > > > > > and advances the clock by that amount. > > > > > > > > > > > > In a VM with 5 seconds downtime, this reduces the guest > > > > > > clock difference on destination from 5s to 0.2s. > > > > > > > > > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. > > > > > > > > > > One thing that bothers me is that it's only this clock that's > > > > > getting corrected; doesn't it cause things to get upset when > > > > > one clock moves and the others dont? > > > > > > > > If you are correlating the clocks, then yes. > > > > > > > > Older Linux guests get upset (marking the TSC clocksource unstable > > > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it > > > > in newer guests > > > > (kvmclock interface to notify watchdog to not complain). > > > > > > > > Note marking TSC clocksource unstable on older guests is harmless > > > > because kvmclock is the standard clocksource. > > > > > > > > For Windows guests, i don't know that Windows correlates between different > > > > clocks. > > > > > > > > That is, there is relative control as to which software reads kvmclock > > > > or Windows TIMER MSR, so i don't see the need to advance every clock > > > > exposed. > > > > > > > > > Shouldn't the pause delay be recorded somewhere architecturally > > > > > independent and then be a thing that kvm-clock happens to use and > > > > > other clocks might as well? > > > > > > > > In theory, yes. In practice, i don't see the need for this... > > > > > > It seems unlikely to me that x86 is the only one that will want > > > to do something similar. > > > > Can't they copy what kvmclock is doing today? > > We shouldn't have copies of code all over should we? > > Dave Fine i'll add a notifier.
On 08/11/2016 11:22, Dr. David Alan Gilbert wrote: > * Marcelo Tosatti (mtosatti@redhat.com) wrote: >> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote: >>> * Marcelo Tosatti (mtosatti@redhat.com) wrote: >>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote: >>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote: >>>>>> This patch, relative to pre-copy migration codepath, >>>>>> measures the time between vm_stop() and pre_save(), >>>>>> which includes copying the remaining RAM to destination, >>>>>> and advances the clock by that amount. >>>>>> >>>>>> In a VM with 5 seconds downtime, this reduces the guest >>>>>> clock difference on destination from 5s to 0.2s. >>>>>> >>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. >>>>> >>>>> One thing that bothers me is that it's only this clock that's >>>>> getting corrected; doesn't it cause things to get upset when >>>>> one clock moves and the others dont? >>>> >>>> If you are correlating the clocks, then yes. >>>> >>>> Older Linux guests get upset (marking the TSC clocksource unstable >>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it >>>> in newer guests >>>> (kvmclock interface to notify watchdog to not complain). >>>> >>>> Note marking TSC clocksource unstable on older guests is harmless >>>> because kvmclock is the standard clocksource. >>>> >>>> For Windows guests, i don't know that Windows correlates between different >>>> clocks. >>>> >>>> That is, there is relative control as to which software reads kvmclock >>>> or Windows TIMER MSR, so i don't see the need to advance every clock >>>> exposed. >>>> >>>>> Shouldn't the pause delay be recorded somewhere architecturally >>>>> independent and then be a thing that kvm-clock happens to use and >>>>> other clocks might as well? >>>> >>>> In theory, yes. In practice, i don't see the need for this... >>> >>> It seems unlikely to me that x86 is the only one that will want >>> to do something similar. >> >> Can't they copy what kvmclock is doing today? > > We shouldn't have copies of code all over should we? Let's cross the bridge when we get there. For now I'm more interested in having a version of the patch that is clean and doesn't need advance_clock. Paolo
* Paolo Bonzini (pbonzini@redhat.com) wrote: > > > On 08/11/2016 11:22, Dr. David Alan Gilbert wrote: > > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > >> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote: > >>> * Marcelo Tosatti (mtosatti@redhat.com) wrote: > >>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote: > >>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote: > >>>>>> This patch, relative to pre-copy migration codepath, > >>>>>> measures the time between vm_stop() and pre_save(), > >>>>>> which includes copying the remaining RAM to destination, > >>>>>> and advances the clock by that amount. > >>>>>> > >>>>>> In a VM with 5 seconds downtime, this reduces the guest > >>>>>> clock difference on destination from 5s to 0.2s. > >>>>>> > >>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. > >>>>> > >>>>> One thing that bothers me is that it's only this clock that's > >>>>> getting corrected; doesn't it cause things to get upset when > >>>>> one clock moves and the others dont? > >>>> > >>>> If you are correlating the clocks, then yes. > >>>> > >>>> Older Linux guests get upset (marking the TSC clocksource unstable > >>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it > >>>> in newer guests > >>>> (kvmclock interface to notify watchdog to not complain). > >>>> > >>>> Note marking TSC clocksource unstable on older guests is harmless > >>>> because kvmclock is the standard clocksource. > >>>> > >>>> For Windows guests, i don't know that Windows correlates between different > >>>> clocks. > >>>> > >>>> That is, there is relative control as to which software reads kvmclock > >>>> or Windows TIMER MSR, so i don't see the need to advance every clock > >>>> exposed. > >>>> > >>>>> Shouldn't the pause delay be recorded somewhere architecturally > >>>>> independent and then be a thing that kvm-clock happens to use and > >>>>> other clocks might as well? > >>>> > >>>> In theory, yes. In practice, i don't see the need for this... > >>> > >>> It seems unlikely to me that x86 is the only one that will want > >>> to do something similar. > >> > >> Can't they copy what kvmclock is doing today? > > > > We shouldn't have copies of code all over should we? > > Let's cross the bridge when we get there. That will mean it has the migration data in the wrong place and any other clocks that need to be incremented by the same offset will need a hook or be inconsistent with this calculation. Dave > For now I'm more interested in having a version of the patch that is > clean and doesn't need advance_clock. > > Paolo -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On 09/11/2016 17:28, Dr. David Alan Gilbert wrote: > * Paolo Bonzini (pbonzini@redhat.com) wrote: >> >> >> On 08/11/2016 11:22, Dr. David Alan Gilbert wrote: >>> * Marcelo Tosatti (mtosatti@redhat.com) wrote: >>>> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote: >>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote: >>>>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote: >>>>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote: >>>>>>>> This patch, relative to pre-copy migration codepath, >>>>>>>> measures the time between vm_stop() and pre_save(), >>>>>>>> which includes copying the remaining RAM to destination, >>>>>>>> and advances the clock by that amount. >>>>>>>> >>>>>>>> In a VM with 5 seconds downtime, this reduces the guest >>>>>>>> clock difference on destination from 5s to 0.2s. >>>>>>>> >>>>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. >>>>>>> >>>>>>> One thing that bothers me is that it's only this clock that's >>>>>>> getting corrected; doesn't it cause things to get upset when >>>>>>> one clock moves and the others dont? >>>>>> >>>>>> If you are correlating the clocks, then yes. >>>>>> >>>>>> Older Linux guests get upset (marking the TSC clocksource unstable >>>>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it >>>>>> in newer guests >>>>>> (kvmclock interface to notify watchdog to not complain). >>>>>> >>>>>> Note marking TSC clocksource unstable on older guests is harmless >>>>>> because kvmclock is the standard clocksource. >>>>>> >>>>>> For Windows guests, i don't know that Windows correlates between different >>>>>> clocks. >>>>>> >>>>>> That is, there is relative control as to which software reads kvmclock >>>>>> or Windows TIMER MSR, so i don't see the need to advance every clock >>>>>> exposed. >>>>>> >>>>>>> Shouldn't the pause delay be recorded somewhere architecturally >>>>>>> independent and then be a thing that kvm-clock happens to use and >>>>>>> other clocks might as well? >>>>>> >>>>>> In theory, yes. In practice, i don't see the need for this... >>>>> >>>>> It seems unlikely to me that x86 is the only one that will want >>>>> to do something similar. >>>> >>>> Can't they copy what kvmclock is doing today? >>> >>> We shouldn't have copies of code all over should we? >> >> Let's cross the bridge when we get there. > > That will mean it has the migration data in the wrong place > and any other clocks that need to be incremented by the same offset > will need a hook or be inconsistent with this calculation. No, there is no additional migration data that is needed. This is just a bug in how the pausing of CLOCK_MONOTONIC was implemented for the kvmclock clocksource. Right now, x86 is the only case where we have the problem, and x86 is using a single "backend" for both kvmclock and the Hyper-V TSC reference page. For everyone else, there is no clocksource paravirtualization going on (luckily, considering what a mess is kvmclock). They can just use QEMU_CLOCK_VIRTUAL if they want something that pauses during the VM. Now, QEMU_CLOCK_VIRTUAL actually has the same bug that Marcelo is fixing, so we may indeed want a common solution if possible. But again, let's see first what the code looks like for _one_ clocksource, before writing a generalized (and thus more complex) solution. Paolo
On Tue, Nov 08, 2016 at 11:32:30AM -0200, Marcelo Tosatti wrote: > On Tue, Nov 08, 2016 at 10:22:56AM +0000, Dr. David Alan Gilbert wrote: > > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > > > On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote: > > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > > > > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote: > > > > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > > > > > > > This patch, relative to pre-copy migration codepath, > > > > > > > measures the time between vm_stop() and pre_save(), > > > > > > > which includes copying the remaining RAM to destination, > > > > > > > and advances the clock by that amount. > > > > > > > > > > > > > > In a VM with 5 seconds downtime, this reduces the guest > > > > > > > clock difference on destination from 5s to 0.2s. > > > > > > > > > > > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. > > > > > > > > > > > > One thing that bothers me is that it's only this clock that's > > > > > > getting corrected; doesn't it cause things to get upset when > > > > > > one clock moves and the others dont? > > > > > > > > > > If you are correlating the clocks, then yes. > > > > > > > > > > Older Linux guests get upset (marking the TSC clocksource unstable > > > > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it > > > > > in newer guests > > > > > (kvmclock interface to notify watchdog to not complain). > > > > > > > > > > Note marking TSC clocksource unstable on older guests is harmless > > > > > because kvmclock is the standard clocksource. > > > > > > > > > > For Windows guests, i don't know that Windows correlates between different > > > > > clocks. > > > > > > > > > > That is, there is relative control as to which software reads kvmclock > > > > > or Windows TIMER MSR, so i don't see the need to advance every clock > > > > > exposed. > > > > > > > > > > > Shouldn't the pause delay be recorded somewhere architecturally > > > > > > independent and then be a thing that kvm-clock happens to use and > > > > > > other clocks might as well? > > > > > > > > > > In theory, yes. In practice, i don't see the need for this... > > > > > > > > It seems unlikely to me that x86 is the only one that will want > > > > to do something similar. > > > > > > Can't they copy what kvmclock is doing today? > > > > We shouldn't have copies of code all over should we? > > > > Dave > > Fine i'll add a notifier. > KVM Linux guests all have to: Make CLOCK_MONOTONIC not count during vmpause (because it mimicks behaviour under suspend/resume of baremetal). And because time overflows. Assuming the HW clock counts while the VM is paused, it means they have to hook into vmstate change notifiers to: event action vmstop KVM_GET_CLOCK vmstart KVM_SET_CLOCK (that earlier value) For x86 we want to start counting the time there because while the VM is running, the host TSC is keeping track of time. So you measure the amount of time between: -> Guest VM clock stops ticking. -> clock_gettime(CLOCK_MONOTONIC, &pointA); ... -> presave: clock_gettime(CLOCK_MONOTONIC, &pointB); I measured the additional time between presave and EOF: its about 5ms. On destination, there is an additional 30ms between EOF receival and restoration of TSC. Now, the clock difference is 130ms, and i am not sure where it comes from, trying to figure out. But the patch should give 35ms difference which is pretty good. Chasing that down... So in summary: i don't see the point of making the code "generic" without knowing what the other arches want to do.
GOn Wed, Nov 09, 2016 at 05:23:50PM +0100, Paolo Bonzini wrote: > > > On 08/11/2016 11:22, Dr. David Alan Gilbert wrote: > > * Marcelo Tosatti (mtosatti@redhat.com) wrote: > >> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote: > >>> * Marcelo Tosatti (mtosatti@redhat.com) wrote: > >>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote: > >>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote: > >>>>>> This patch, relative to pre-copy migration codepath, > >>>>>> measures the time between vm_stop() and pre_save(), > >>>>>> which includes copying the remaining RAM to destination, > >>>>>> and advances the clock by that amount. > >>>>>> > >>>>>> In a VM with 5 seconds downtime, this reduces the guest > >>>>>> clock difference on destination from 5s to 0.2s. > >>>>>> > >>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. > >>>>> > >>>>> One thing that bothers me is that it's only this clock that's > >>>>> getting corrected; doesn't it cause things to get upset when > >>>>> one clock moves and the others dont? > >>>> > >>>> If you are correlating the clocks, then yes. > >>>> > >>>> Older Linux guests get upset (marking the TSC clocksource unstable > >>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it > >>>> in newer guests > >>>> (kvmclock interface to notify watchdog to not complain). > >>>> > >>>> Note marking TSC clocksource unstable on older guests is harmless > >>>> because kvmclock is the standard clocksource. > >>>> > >>>> For Windows guests, i don't know that Windows correlates between different > >>>> clocks. > >>>> > >>>> That is, there is relative control as to which software reads kvmclock > >>>> or Windows TIMER MSR, so i don't see the need to advance every clock > >>>> exposed. > >>>> > >>>>> Shouldn't the pause delay be recorded somewhere architecturally > >>>>> independent and then be a thing that kvm-clock happens to use and > >>>>> other clocks might as well? > >>>> > >>>> In theory, yes. In practice, i don't see the need for this... > >>> > >>> It seems unlikely to me that x86 is the only one that will want > >>> to do something similar. > >> > >> Can't they copy what kvmclock is doing today? > > > > We shouldn't have copies of code all over should we? > > Let's cross the bridge when we get there. > > For now I'm more interested in having a version of the patch that is > clean and doesn't need advance_clock. > > Paolo Destination has to run the following logic: If (source has KVM_CAP_ADVANCE_CLOCK) use KVM_GET_CLOCK value Else read pvclock from guest To support migration from older QEMU versions which do not have KVM_CAP_ADVANCE_CLOCK (or new QEMU versions running on old hosts without KVM_CAP_ADVANCE_CLOCK). I don't see any clean way to give that information, except changing the migration format to pass "host: kvm_cap_advance_clock=true/false" information. Ideas?
On 10/11/2016 12:48, Marcelo Tosatti wrote: > Destination has to run the following logic: > > If (source has KVM_CAP_ADVANCE_CLOCK) > use KVM_GET_CLOCK value > Else > read pvclock from guest > > To support migration from older QEMU versions which do not have > KVM_CAP_ADVANCE_CLOCK (or new QEMU versions running on old > hosts without KVM_CAP_ADVANCE_CLOCK). > > I don't see any clean way to give that information, except changing > the migration format to pass "host: kvm_cap_advance_clock=true/false" > information. If you make it only affect new machine types, you could transmit a dummy clock value such as -1 if the source does not have KVM_CLOCK_TSC_STABLE. Paolo
On Thu, Nov 10, 2016 at 06:57:21PM +0100, Paolo Bonzini wrote: > > > On 10/11/2016 12:48, Marcelo Tosatti wrote: > > Destination has to run the following logic: > > > > If (source has KVM_CAP_ADVANCE_CLOCK) > > use KVM_GET_CLOCK value > > Else > > read pvclock from guest > > > > To support migration from older QEMU versions which do not have > > KVM_CAP_ADVANCE_CLOCK (or new QEMU versions running on old > > hosts without KVM_CAP_ADVANCE_CLOCK). > > > > I don't see any clean way to give that information, except changing > > the migration format to pass "host: kvm_cap_advance_clock=true/false" > > information. > > If you make it only affect new machine types, you could transmit a dummy > clock value such as -1 if the source does not have KVM_CLOCK_TSC_STABLE. > > Paolo Prefer a new subsection (which is fine since migration is broken anyway), because otherwise you have to deal with restoring s->clock from -1 to what was read at KVM_GET_CLOCK (in case migration fails).
2016-11-08 3:41 GMT+08:00 Marcelo Tosatti <mtosatti@redhat.com>: > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote: >> * Marcelo Tosatti (mtosatti@redhat.com) wrote: >> > This patch, relative to pre-copy migration codepath, >> > measures the time between vm_stop() and pre_save(), >> > which includes copying the remaining RAM to destination, >> > and advances the clock by that amount. >> > >> > In a VM with 5 seconds downtime, this reduces the guest >> > clock difference on destination from 5s to 0.2s. >> > >> > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. >> >> One thing that bothers me is that it's only this clock that's >> getting corrected; doesn't it cause things to get upset when >> one clock moves and the others dont? > > If you are correlating the clocks, then yes. > > Older Linux guests get upset (marking the TSC clocksource unstable > because the watchdog checks TSC vs kvmclock), but there is a workaround for it > in newer guests > (kvmclock interface to notify watchdog to not complain). Could you point out which interface? I didn't find it. Regards, Wanpeng Li > > Note marking TSC clocksource unstable on older guests is harmless > because kvmclock is the standard clocksource. > > For Windows guests, i don't know that Windows correlates between different > clocks. > > That is, there is relative control as to which software reads kvmclock > or Windows TIMER MSR, so i don't see the need to advance every clock > exposed. > >> Shouldn't the pause delay be recorded somewhere architecturally >> independent and then be a thing that kvm-clock happens to use and >> other clocks might as well? > > In theory, yes. In practice, i don't see the need for this... > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Feb 07, 2017 at 06:02:13PM +0800, Wanpeng Li wrote: > 2016-11-08 3:41 GMT+08:00 Marcelo Tosatti <mtosatti@redhat.com>: > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote: > >> * Marcelo Tosatti (mtosatti@redhat.com) wrote: > >> > This patch, relative to pre-copy migration codepath, > >> > measures the time between vm_stop() and pre_save(), > >> > which includes copying the remaining RAM to destination, > >> > and advances the clock by that amount. > >> > > >> > In a VM with 5 seconds downtime, this reduces the guest > >> > clock difference on destination from 5s to 0.2s. > >> > > >> > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. > >> > >> One thing that bothers me is that it's only this clock that's > >> getting corrected; doesn't it cause things to get upset when > >> one clock moves and the others dont? > > > > If you are correlating the clocks, then yes. > > > > Older Linux guests get upset (marking the TSC clocksource unstable > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it > > in newer guests > > (kvmclock interface to notify watchdog to not complain). > > Could you point out which interface? I didn't find it. > > Regards, > Wanpeng Li PVCLOCK_GUEST_STOPPED flag. > > > > > Note marking TSC clocksource unstable on older guests is harmless > > because kvmclock is the standard clocksource. > > > > For Windows guests, i don't know that Windows correlates between different > > clocks. > > > > That is, there is relative control as to which software reads kvmclock > > or Windows TIMER MSR, so i don't see the need to advance every clock > > exposed. > > > >> Shouldn't the pause delay be recorded somewhere architecturally > >> independent and then be a thing that kvm-clock happens to use and > >> other clocks might as well? > > > > In theory, yes. In practice, i don't see the need for this... > > > > -- > > To unsubscribe from this list: send the line "unsubscribe kvm" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index 0f75dd3..a2a02ac 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -22,9 +22,11 @@ #include "kvm_i386.h" #include "hw/sysbus.h" #include "hw/kvm/clock.h" +#include "migration/migration.h" #include <linux/kvm.h> #include <linux/kvm_para.h> +#include <time.h> #define TYPE_KVM_CLOCK "kvmclock" #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK) @@ -35,7 +37,13 @@ typedef struct KVMClockState { /*< public >*/ uint64_t clock; + uint64_t ns; bool clock_valid; + + uint64_t advance_clock; + struct timespec t_aftervmstop; + + bool adv_clock_enabled; } KVMClockState; struct pvclock_vcpu_time_info { @@ -100,6 +108,11 @@ static void kvmclock_vm_state_change(void *opaque, int running, s->clock = time_at_migration; } + if (s->advance_clock && s->clock + s->advance_clock > s->clock) { + s->clock += s->advance_clock; + s->advance_clock = 0; + } + data.clock = s->clock; ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data); if (ret < 0) { @@ -135,6 +148,18 @@ static void kvmclock_vm_state_change(void *opaque, int running, abort(); } s->clock = data.clock; + /* + * Transition from VM-running to VM-stopped via migration? + * Record when the VM was stopped. + */ + + if (state == RUN_STATE_FINISH_MIGRATE && + !migration_in_postcopy(migrate_get_current())) { + clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop); + } else { + s->t_aftervmstop.tv_sec = 0; + s->t_aftervmstop.tv_nsec = 0; + } /* * If the VM is stopped, declare the clock state valid to @@ -152,6 +177,77 @@ static void kvmclock_realize(DeviceState *dev, Error **errp) qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s); } +static uint64_t clock_delta(struct timespec *before, struct timespec *after) +{ + if (before->tv_sec > after->tv_sec || + (before->tv_sec == after->tv_sec && + before->tv_nsec > after->tv_nsec)) { + fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec)," + "after=(%ld sec, %ld nsec)\n", before->tv_sec, + before->tv_nsec, after->tv_sec, after->tv_nsec); + abort(); + } + + return (after->tv_sec - before->tv_sec) * 1000000000ULL + + after->tv_nsec - before->tv_nsec; +} + +static void kvmclock_pre_save(void *opaque) +{ + KVMClockState *s = opaque; + struct timespec now; + uint64_t ns; + + if (s->t_aftervmstop.tv_sec == 0) { + return; + } + + clock_gettime(CLOCK_MONOTONIC, &now); + + ns = clock_delta(&s->t_aftervmstop, &now); + + /* + * Linux guests can overflow if time jumps + * forward in large increments. + * Cap maximum adjustment to 10 minutes. + */ + ns = MIN(ns, 600000000000ULL); + + if (s->clock + ns > s->clock) { + s->ns = ns; + } +} + +static int kvmclock_post_load(void *opaque, int version_id) +{ + KVMClockState *s = opaque; + + /* save the value from incoming migration */ + s->advance_clock = s->ns; + + return 0; +} + +static bool kvmclock_ns_needed(void *opaque) +{ + KVMClockState *s = opaque; + + return s->adv_clock_enabled; +} + +static const VMStateDescription kvmclock_advance_ns = { + .name = "kvmclock/advance_ns", + .version_id = 1, + .minimum_version_id = 1, + .needed = kvmclock_ns_needed, + .pre_save = kvmclock_pre_save, + .post_load = kvmclock_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT64(ns, KVMClockState), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription kvmclock_vmsd = { .name = "kvmclock", .version_id = 1, @@ -159,15 +255,25 @@ static const VMStateDescription kvmclock_vmsd = { .fields = (VMStateField[]) { VMSTATE_UINT64(clock, KVMClockState), VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * []) { + &kvmclock_advance_ns, + NULL } }; +static Property kvmclock_properties[] = { + DEFINE_PROP_BOOL("advance_clock", KVMClockState, adv_clock_enabled, true), + DEFINE_PROP_END_OF_LIST(), +}; + static void kvmclock_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); dc->realize = kvmclock_realize; dc->vmsd = &kvmclock_vmsd; + dc->props = kvmclock_properties; } static const TypeInfo kvmclock_info = { diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 98dc772..243352e 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -370,6 +370,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); #define PC_COMPAT_2_7 \ HW_COMPAT_2_7 \ {\ + .driver = "kvmclock",\ + .property = "advance_clock",\ + .value = "off",\ + },\ + {\ .driver = TYPE_X86_CPU,\ .property = "l3-cache",\ .value = "off",\
This patch, relative to pre-copy migration codepath, measures the time between vm_stop() and pre_save(), which includes copying the remaining RAM to destination, and advances the clock by that amount. In a VM with 5 seconds downtime, this reduces the guest clock difference on destination from 5s to 0.2s. Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time. Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com> --- v2: use subsection (Juan Quintela) fix older machine types support