diff mbox

[RFC,v3,2/2] add support for Hyper-V partition reference time enlightenment

Message ID 1386502419-26614-3-git-send-email-vrozenfe@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Vadim Rozenfeld Dec. 8, 2013, 11:33 a.m. UTC
The following patch allows activating the partition reference
time enlightenment, which is based on the host platform's support
for an Invariant Time Stamp Counter (iTSC).

v2 -> v3
Handle TSC sequence, scale, and offset changes during migration.

---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 29 +++++++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

Comments

Paolo Bonzini Dec. 9, 2013, 2:32 p.m. UTC | #1
Il 08/12/2013 12:33, Vadim Rozenfeld ha scritto:
> +		tsc_ref.tsc_sequence =
> +			boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? 1 : 0;
> +		tsc_ref.tsc_scale =
> +			((10000LL << 32) / vcpu->arch.virtual_tsc_khz) << 32;
> +		tsc_ref.tsc_offset = 0;
>  		if (__copy_to_user((void __user *)addr, &tsc_ref, sizeof(tsc_ref)))
>  			return 1;
>  		mark_page_dirty(kvm, gfn);
>  		kvm->arch.hv_tsc_page = data;
> +		kvm->arch.hv_ref_count = 0;
>  		break;
>  	}
>  	default:
> @@ -3879,6 +3884,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  		local_irq_enable();
>  		kvm->arch.kvmclock_offset = delta;
>  		kvm_gen_update_masterclock(kvm);
> +
> +		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
> +			HV_REFERENCE_TSC_PAGE* tsc_ref;
> +			u64 curr_time;
> +			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm, 
> +				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> +			tsc_ref->tsc_sequence =
> +				boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? tsc_ref->tsc_sequence + 1 : 0;
> +			tsc_ref->tsc_scale = ((10000LL << 32) / __get_cpu_var(cpu_tsc_khz)) << 32;

Why shouldn't this be vcpu->arch.virtual_tsc_khz?

> +			curr_time = (((tsc_ref->tsc_scale >> 32) * native_read_tsc()) >> 32) + 
> +				tsc_ref->tsc_offset;
> +			tsc_ref->tsc_offset = kvm->arch.hv_ref_time - curr_time;
> +		}

The difference in setting tsc_ref->tsc_scale is the only important
change between the two occurrences.  If you can avoid that difference
and you move this to a separate function, you can reuse that new
function in set_msr_hyperv_pw as well.

Also, kvm_set_tsc_khz should recompute the reference page's values as
well, so you'd have three uses.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vadim Rozenfeld Dec. 10, 2013, 11:23 a.m. UTC | #2
On Mon, 2013-12-09 at 15:32 +0100, Paolo Bonzini wrote:
> Il 08/12/2013 12:33, Vadim Rozenfeld ha scritto:
> > +		tsc_ref.tsc_sequence =
> > +			boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? 1 : 0;
> > +		tsc_ref.tsc_scale =
> > +			((10000LL << 32) / vcpu->arch.virtual_tsc_khz) << 32;
> > +		tsc_ref.tsc_offset = 0;
> >  		if (__copy_to_user((void __user *)addr, &tsc_ref, sizeof(tsc_ref)))
> >  			return 1;
> >  		mark_page_dirty(kvm, gfn);
> >  		kvm->arch.hv_tsc_page = data;
> > +		kvm->arch.hv_ref_count = 0;
> >  		break;
> >  	}
> >  	default:
> > @@ -3879,6 +3884,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
> >  		local_irq_enable();
> >  		kvm->arch.kvmclock_offset = delta;
> >  		kvm_gen_update_masterclock(kvm);
> > +
> > +		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
> > +			HV_REFERENCE_TSC_PAGE* tsc_ref;
> > +			u64 curr_time;
> > +			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm, 
> > +				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> > +			tsc_ref->tsc_sequence =
> > +				boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? tsc_ref->tsc_sequence + 1 : 0;
> > +			tsc_ref->tsc_scale = ((10000LL << 32) / __get_cpu_var(cpu_tsc_khz)) << 32;
> 
> Why shouldn't this be vcpu->arch.virtual_tsc_khz?
Yeah, I was thinking about that, but we need a vcpu instance for this.

> 
> > +			curr_time = (((tsc_ref->tsc_scale >> 32) * native_read_tsc()) >> 32) + 
> > +				tsc_ref->tsc_offset;
> > +			tsc_ref->tsc_offset = kvm->arch.hv_ref_time - curr_time;
> > +		}
> 
> The difference in setting tsc_ref->tsc_scale is the only important
> change between the two occurrences.  If you can avoid that difference
> and you move this to a separate function, you can reuse that new
> function in set_msr_hyperv_pw as well.

Do you mean the difference between the HV_X64_MSR_REFERENCE_TSC write,
which happens at partition creation time, and KVM_SET_CLOCK, which happens
on resume after a partition pause? If so, there are several differences,
of which the offset calculation is probably the most important one.

Vadim.

> 
> Also, kvm_set_tsc_khz should recompute the reference page's values as
> well, so you'd have three uses.
> 
> Paolo


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Dec. 10, 2013, 4:52 p.m. UTC | #3
Il 10/12/2013 12:23, Vadim Rozenfeld ha scritto:
> > > +		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
> > > +			HV_REFERENCE_TSC_PAGE* tsc_ref;
> > > +			u64 curr_time;
> > > +			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm, 
> > > +				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> > > +			tsc_ref->tsc_sequence =
> > > +				boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? tsc_ref->tsc_sequence + 1 : 0;
> > > +			tsc_ref->tsc_scale = ((10000LL << 32) / __get_cpu_var(cpu_tsc_khz)) << 32;
> > 
> > Why shouldn't this be vcpu->arch.virtual_tsc_khz?
> 
> Yeah, I was thinking about that, but we need a vcpu instance for this.

You can perhaps store the value from vcpu->arch.virtual_tsc_khz to 
kvm->arch when the MSR is first written?

> Do you mean between HV_X64_MSR_REFERENCE_TSC which happens during
> partition creation time and KVM_SET_CLOCK which happens during resume 
> after partition pause? If so - there are several differences, where
> the offset calculation probably is the most important one.

The offset and frequency are the only differences.

+			curr_time = (((tsc_ref->tsc_scale >> 32) * native_read_tsc()) >> 32) + 
+				tsc_ref->tsc_offset;
+			tsc_ref->tsc_offset = kvm->arch.hv_ref_time - curr_time;

Why do you need kvm->arch.hv_ref_time at all?  Can you just use
"get_kernel_ns() + kvm->arch.kvmclock_offset - kvm->arch.hv_ref_count"?
Then the same code can set tsc_ref->tsc_offset in both cases.

In fact, it's not clear to me what hv_ref_time is for, and how it
is different from the expression above.

By the way, a small nit:

> 
> +		tsc_ref.tsc_sequence =
> +			boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? 1 : 0;
> +		tsc_ref.tsc_scale =
> +			((10000LL << 32) / vcpu->arch.virtual_tsc_khz) << 32;
> +		tsc_ref.tsc_offset = 0;
>  		if (__copy_to_user((void __user *)addr, &tsc_ref, sizeof(tsc_ref)))
>  			return 1;
>  		mark_page_dirty(kvm, gfn);
>  		kvm->arch.hv_tsc_page = data;
> +		kvm->arch.hv_ref_count = 0;
>  		break;

This setting of kvm->arch.hv_ref_count belongs in the previous patch.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vadim Rozenfeld Dec. 11, 2013, 10:58 a.m. UTC | #4
On Tue, 2013-12-10 at 17:52 +0100, Paolo Bonzini wrote:
> Il 10/12/2013 12:23, Vadim Rozenfeld ha scritto:
> > > > +		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
> > > > +			HV_REFERENCE_TSC_PAGE* tsc_ref;
> > > > +			u64 curr_time;
> > > > +			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm, 
> > > > +				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> > > > +			tsc_ref->tsc_sequence =
> > > > +				boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? tsc_ref->tsc_sequence + 1 : 0;
> > > > +			tsc_ref->tsc_scale = ((10000LL << 32) / __get_cpu_var(cpu_tsc_khz)) << 32;
> > > 
> > > Why shouldn't this be vcpu->arch.virtual_tsc_khz?
> > 
> > Yeah, I was thinking about that, but we need a vcpu instance for this.
> 
> You can perhaps store the value from vcpu->arch.virtual_tsc_khz to 
> kvm->arch when the MSR is first written?
> 
> > Do you mean between HV_X64_MSR_REFERENCE_TSC which happens during
> > partition creation time and KVM_SET_CLOCK which happens during resume 
> > after partition pause? If so - there are several differences, where
> > the offset calculation probably is the most important one.
> 
> The offset and frequence are the only differences.
> 
> +			curr_time = (((tsc_ref->tsc_scale >> 32) * native_read_tsc()) >> 32) + 
> +				tsc_ref->tsc_offset;
> +			tsc_ref->tsc_offset = kvm->arch.hv_ref_time - curr_time;
> 
> Why do you need kvm->arch.hv_ref_time at all?  Can you just use
> "get_kernel_ns() + kvm->arch.kvmclock_offset - kvm->arch.hv_ref_count"?
> Then the same code can set tsc_ref->tsc_offset in both cases.
> 
> In fact, it's not clear to me what hv_ref_time is for, and how it
> is different from 

OK, let me explain how it works.
Hyper-V allows a guest to use the invariant TSC provided by the host as a
time stamp source (KeQueryPerformanceCounter). The guest calls rdtsc and
normalizes the result to a 10MHz frequency, which is why we need "tsc_scale".
"tsc_offset" is needed for migration or pause/resume cycles.
When we pause a VM, we need to save the current vTSC value
("hv_ref_time"), which is rdtsc * tsc_scale + tsc_offset.
Then, during resume, we need to recalculate the new tsc_scale
as well as the new tsc_offset value:
tsc_offset = old (saved) vTSC - new vTSC

So maybe hv_ref_time is not a good name, but we use it
to keep the old vTSC value, saved before stopping the VM.

Vadim.

> 
> By the way, a small nit:
> 
> > 
> > +		tsc_ref.tsc_sequence =
> > +			boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? 1 : 0;
> > +		tsc_ref.tsc_scale =
> > +			((10000LL << 32) / vcpu->arch.virtual_tsc_khz) << 32;
> > +		tsc_ref.tsc_offset = 0;
> >  		if (__copy_to_user((void __user *)addr, &tsc_ref, sizeof(tsc_ref)))
> >  			return 1;
> >  		mark_page_dirty(kvm, gfn);
> >  		kvm->arch.hv_tsc_page = data;
> > +		kvm->arch.hv_ref_count = 0;
> >  		break;
> 
> This setting of kvm->arch.hv_ref_count belongs in the previous patch.
> 
> Paolo


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Dec. 11, 2013, 12:28 p.m. UTC | #5
Il 11/12/2013 11:58, Vadim Rozenfeld ha scritto:
>> > +			curr_time = (((tsc_ref->tsc_scale >> 32) * native_read_tsc()) >> 32) + 
>> > +				tsc_ref->tsc_offset;
>> > +			tsc_ref->tsc_offset = kvm->arch.hv_ref_time - curr_time;
>> > 
>> > Why do you need kvm->arch.hv_ref_time at all?  Can you just use
>> > "get_kernel_ns() + kvm->arch.kvmclock_offset - kvm->arch.hv_ref_count"?
>> > Then the same code can set tsc_ref->tsc_offset in both cases.
>> > 
>> > In fact, it's not clear to me what hv_ref_time is for, and how it
>> > is different from 
> OK, let me explain how it works.
> Hyper-V allows guest to use invariant TSC provided by host as a time
> stamp source (KeQueryPerformanceCounter). Guest is calling rdtsc and
> normalizing it to 10MHz frequency, it is why we need "tsc_scale".
> "tsc_offset" is needed for migration or pause/resume cycles.
> When we pause a VM, we need to save the current vTSC value
> ("hv_ref_time"), which is rdtsc * tsc_scale + tsc_offset.
> Then, during resume, we need to recalculate the new tsc_scale
> as well as the new tsc_offset value. 
> tsc_offset = old(saved) vTSC - new vTSC

In practice "save" means KVM_GET_CLOCK, and "restore" means
KVM_SET_CLOCK, right?

> So maybe hv_ref_time is not a good name, but we use it 
> for keeping the old vTSC value, saved before stopping VM.

Ok, this was roughly my understanding as well.

My understanding is also that (((tsc_ref->tsc_scale >> 32) *
native_read_tsc()) >> 32) + tsc_ref->tsc_offset returns exactly the same
value as HV_X64_MSR_TIME_REF_COUNT.  Thus we do not need
kvm->arch.hv_ref_time.  We can use the value of
HV_X64_MSR_TIME_REF_COUNT, which is "(get_kernel_ns() +
kvm->arch.kvmclock_offset - kvm->arch.hv_ref_count) / 100", to compute
tsc_offset, like this:

  curr_time = (((tsc_ref->tsc_scale >> 32) * native_read_tsc()) >> 32);
  tsc_ref->tsc_offset = get_hv_x64_msr_time_ref_count() - curr_time;

This code can be applied always: when the TSC page is initialized and
when KVM_SET_CLOCK is called.  You do not need to do anything for
KVM_GET_CLOCK.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Dec. 11, 2013, 7:27 p.m. UTC | #6
On Sun, Dec 08, 2013 at 10:33:39PM +1100, Vadim Rozenfeld wrote:
> The following patch allows to activate a partition reference
> time enlightenment that is based on the host platform's support
> for an Invariant Time Stamp Counter (iTSC).
> 
> v2 -> v3
> Handle TSC sequence, scale, and offest changing during migration.
> 
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/x86.c              | 29 +++++++++++++++++++++++++++--
>  2 files changed, 28 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 2fd0753..81fdff0 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -607,6 +607,7 @@ struct kvm_arch {
>  	u64 hv_hypercall;
>  	u64 hv_ref_count;
>  	u64 hv_tsc_page;
> +	u64 hv_ref_time;
>  
>  	#ifdef CONFIG_KVM_MMU_AUDIT
>  	int audit_point;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 5e4e495a..cb6766a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1882,14 +1882,19 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>  			break;
>  		}
>  		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
> -		addr = gfn_to_hva(kvm, data >>
> -			HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> +		addr = gfn_to_hva(kvm, gfn);
>  		if (kvm_is_error_hva(addr))
>  			return 1;
> +		tsc_ref.tsc_sequence =
> +			boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? 1 : 0;
> +		tsc_ref.tsc_scale =
> +			((10000LL << 32) / vcpu->arch.virtual_tsc_khz) << 32;
> +		tsc_ref.tsc_offset = 0;
>  		if (__copy_to_user((void __user *)addr, &tsc_ref, sizeof(tsc_ref)))
>  			return 1;
>  		mark_page_dirty(kvm, gfn);
>  		kvm->arch.hv_tsc_page = data;
> +		kvm->arch.hv_ref_count = 0;
>  		break;
>  	}
>  	default:
> @@ -3879,6 +3884,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  		local_irq_enable();
>  		kvm->arch.kvmclock_offset = delta;
>  		kvm_gen_update_masterclock(kvm);
> +
> +		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
> +			HV_REFERENCE_TSC_PAGE* tsc_ref;
> +			u64 curr_time;
> +			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm, 
> +				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> +			tsc_ref->tsc_sequence =
> +				boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? tsc_ref->tsc_sequence + 1 : 0;
> +			tsc_ref->tsc_scale = ((10000LL << 32) / __get_cpu_var(cpu_tsc_khz)) << 32;
> +			curr_time = (((tsc_ref->tsc_scale >> 32) * native_read_tsc()) >> 32) + 
> +				tsc_ref->tsc_offset;
> +			tsc_ref->tsc_offset = kvm->arch.hv_ref_time - curr_time;
> +		}
>  		break;
>  	}
>  	case KVM_GET_CLOCK: {
> @@ -3896,6 +3914,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  		if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
>  			goto out;
>  		r = 0;
> +		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
> +			HV_REFERENCE_TSC_PAGE* tsc_ref;
> +			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm,
> +				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);

kvm_read_guest_cached.

> +			kvm->arch.hv_ref_time = (((tsc_ref->tsc_scale >> 32) * 
> +				native_read_tsc()) >> 32) + tsc_ref->tsc_offset;

Why native_read_tsc and not ->read_l1_tsc?

It is easier to rely on the host to check the reliability of the TSC: if
it uses the TSC clocksource, then the TSCs are stable. So we could make
exposing the TSC ref page conditional on ka->use_master_clock=1 (see
kvm_guest_time_update), and hook into pvclock_gtod_notify.

So in addition to X86_FEATURE_CONSTANT_TSC, check
ka->use_master_clock=1


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Dec. 11, 2013, 7:28 p.m. UTC | #7
On Tue, Dec 10, 2013 at 10:23:17PM +1100, Vadim Rozenfeld wrote:
> On Mon, 2013-12-09 at 15:32 +0100, Paolo Bonzini wrote:
> > Il 08/12/2013 12:33, Vadim Rozenfeld ha scritto:
> > > +		tsc_ref.tsc_sequence =
> > > +			boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? 1 : 0;
> > > +		tsc_ref.tsc_scale =
> > > +			((10000LL << 32) / vcpu->arch.virtual_tsc_khz) << 32;
> > > +		tsc_ref.tsc_offset = 0;
> > >  		if (__copy_to_user((void __user *)addr, &tsc_ref, sizeof(tsc_ref)))
> > >  			return 1;
> > >  		mark_page_dirty(kvm, gfn);
> > >  		kvm->arch.hv_tsc_page = data;
> > > +		kvm->arch.hv_ref_count = 0;
> > >  		break;
> > >  	}
> > >  	default:
> > > @@ -3879,6 +3884,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
> > >  		local_irq_enable();
> > >  		kvm->arch.kvmclock_offset = delta;
> > >  		kvm_gen_update_masterclock(kvm);
> > > +
> > > +		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
> > > +			HV_REFERENCE_TSC_PAGE* tsc_ref;
> > > +			u64 curr_time;
> > > +			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm, 
> > > +				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> > > +			tsc_ref->tsc_sequence =
> > > +				boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? tsc_ref->tsc_sequence + 1 : 0;
> > > +			tsc_ref->tsc_scale = ((10000LL << 32) / __get_cpu_var(cpu_tsc_khz)) << 32;
> > 
> > Why shouldn't this be vcpu->arch.virtual_tsc_khz?
> Yeah, I was thinking about that, but we need a vcpu instance for this.

Move it to kvm_guest_time_update time (which is necessary anyway for the
pvclock gtod notifier changes etc).

> > > +			curr_time = (((tsc_ref->tsc_scale >> 32) * native_read_tsc()) >> 32) + 
> > > +				tsc_ref->tsc_offset;
> > > +			tsc_ref->tsc_offset = kvm->arch.hv_ref_time - curr_time;
> > > +		}
> > 
> > The difference in setting tsc_ref->tsc_scale is the only important
> > change between the two occurrences.  If you can avoid that difference
> > and you move this to a separate function, you can reuse that new
> > function in set_msr_hyperv_pw as well.
> 
> Do you mean between HV_X64_MSR_REFERENCE_TSC which happens during
> partition creation time and KVM_SET_CLOCK which happens during resume 
> after partition pause? If so - there are several differences, where
> the offset calculation probably is the most important one.
> 
> Vadim.
> 
> > 
> > Also, kvm_set_tsc_khz should recompute the reference page's values as
> > well, so you'd have three uses.
> > 
> > Paolo
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Dec. 12, 2013, 9:34 a.m. UTC | #8
Il 11/12/2013 20:27, Marcelo Tosatti ha scritto:
>> > +		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
>> > +			HV_REFERENCE_TSC_PAGE* tsc_ref;
>> > +			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm,
>> > +				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> kvm_read_guest_cached.
> 
>> > +			kvm->arch.hv_ref_time = (((tsc_ref->tsc_scale >> 32) * 
>> > +				native_read_tsc()) >> 32) + tsc_ref->tsc_offset;
> Why native_read_tsc and not ->read_l1_tsc?
> 
> It is easier to trust on the host to check reliability of the TSC: if
> it uses TSC clocksource, then the TSCs are stable. So could condition
> exposing the TSC ref page when ka->use_master_clock=1, see kvm_guest_time_update.
> And hook into pvclock_gtod_notify.
> 
> So in addition to X86_FEATURE_CONSTANT_TSC, check
> ka->use_master_clock=1

FWIW, I agree with all these comments from Marcelo.

Paolo

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vadim Rozenfeld Jan. 14, 2014, 4:11 a.m. UTC | #9
----- Original Message -----
From: "Marcelo Tosatti" <mtosatti@redhat.com>
To: "Vadim Rozenfeld" <vrozenfe@redhat.com>
Cc: kvm@vger.kernel.org, pl@dlhnet.de, pbonzini@redhat.com
Sent: Thursday, December 12, 2013 6:27:00 AM
Subject: Re: [RFC PATCH v3 2/2] add support for Hyper-V partition reference time enlightenment

On Sun, Dec 08, 2013 at 10:33:39PM +1100, Vadim Rozenfeld wrote:
> The following patch allows to activate a partition reference
> time enlightenment that is based on the host platform's support
> for an Invariant Time Stamp Counter (iTSC).
> 
> v2 -> v3
> Handle TSC sequence, scale, and offest changing during migration.
> 
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/x86.c              | 29 +++++++++++++++++++++++++++--
>  2 files changed, 28 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 2fd0753..81fdff0 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -607,6 +607,7 @@ struct kvm_arch {
>  	u64 hv_hypercall;
>  	u64 hv_ref_count;
>  	u64 hv_tsc_page;
> +	u64 hv_ref_time;
>  
>  	#ifdef CONFIG_KVM_MMU_AUDIT
>  	int audit_point;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 5e4e495a..cb6766a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1882,14 +1882,19 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>  			break;
>  		}
>  		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
> -		addr = gfn_to_hva(kvm, data >>
> -			HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> +		addr = gfn_to_hva(kvm, gfn);
>  		if (kvm_is_error_hva(addr))
>  			return 1;
> +		tsc_ref.tsc_sequence =
> +			boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? 1 : 0;
> +		tsc_ref.tsc_scale =
> +			((10000LL << 32) / vcpu->arch.virtual_tsc_khz) << 32;
> +		tsc_ref.tsc_offset = 0;
>  		if (__copy_to_user((void __user *)addr, &tsc_ref, sizeof(tsc_ref)))
>  			return 1;
>  		mark_page_dirty(kvm, gfn);
>  		kvm->arch.hv_tsc_page = data;
> +		kvm->arch.hv_ref_count = 0;
>  		break;
>  	}
>  	default:
> @@ -3879,6 +3884,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  		local_irq_enable();
>  		kvm->arch.kvmclock_offset = delta;
>  		kvm_gen_update_masterclock(kvm);
> +
> +		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
> +			HV_REFERENCE_TSC_PAGE* tsc_ref;
> +			u64 curr_time;
> +			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm, 
> +				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> +			tsc_ref->tsc_sequence =
> +				boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? tsc_ref->tsc_sequence + 1 : 0;
> +			tsc_ref->tsc_scale = ((10000LL << 32) / __get_cpu_var(cpu_tsc_khz)) << 32;
> +			curr_time = (((tsc_ref->tsc_scale >> 32) * native_read_tsc()) >> 32) + 
> +				tsc_ref->tsc_offset;
> +			tsc_ref->tsc_offset = kvm->arch.hv_ref_time - curr_time;
> +		}
>  		break;
>  	}
>  	case KVM_GET_CLOCK: {
> @@ -3896,6 +3914,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  		if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
>  			goto out;
>  		r = 0;
> +		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
> +			HV_REFERENCE_TSC_PAGE* tsc_ref;
> +			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm,
> +				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);

kvm_read_guest_cached.

> +			kvm->arch.hv_ref_time = (((tsc_ref->tsc_scale >> 32) * 
> +				native_read_tsc()) >> 32) + tsc_ref->tsc_offset;

Why native_read_tsc and not ->read_l1_tsc?

[VR]
Is it possible to get pointer to the vcpu instance at this point?
Thanks,
Vadim. 

It is easier to trust on the host to check reliability of the TSC: if
it uses TSC clocksource, then the TSCs are stable. So could condition
exposing the TSC ref page when ka->use_master_clock=1, see kvm_guest_time_update.
And hook into pvclock_gtod_notify.

So in addition to X86_FEATURE_CONSTANT_TSC, check
ka->use_master_clock=1


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Jan. 14, 2014, 1:54 p.m. UTC | #10
On Mon, Jan 13, 2014 at 11:11:40PM -0500, Vadim Rozenfeld wrote:
> 
> 
> ----- Original Message -----
> From: "Marcelo Tosatti" <mtosatti@redhat.com>
> To: "Vadim Rozenfeld" <vrozenfe@redhat.com>
> Cc: kvm@vger.kernel.org, pl@dlhnet.de, pbonzini@redhat.com
> Sent: Thursday, December 12, 2013 6:27:00 AM
> Subject: Re: [RFC PATCH v3 2/2] add support for Hyper-V partition reference time enlightenment
> 
> On Sun, Dec 08, 2013 at 10:33:39PM +1100, Vadim Rozenfeld wrote:
> > The following patch allows to activate a partition reference
> > time enlightenment that is based on the host platform's support
> > for an Invariant Time Stamp Counter (iTSC).
> > 
> > v2 -> v3
> > Handle TSC sequence, scale, and offest changing during migration.
> > 
> > ---
> >  arch/x86/include/asm/kvm_host.h |  1 +
> >  arch/x86/kvm/x86.c              | 29 +++++++++++++++++++++++++++--
> >  2 files changed, 28 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 2fd0753..81fdff0 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -607,6 +607,7 @@ struct kvm_arch {
> >  	u64 hv_hypercall;
> >  	u64 hv_ref_count;
> >  	u64 hv_tsc_page;
> > +	u64 hv_ref_time;
> >  
> >  	#ifdef CONFIG_KVM_MMU_AUDIT
> >  	int audit_point;
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 5e4e495a..cb6766a 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -1882,14 +1882,19 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> >  			break;
> >  		}
> >  		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
> > -		addr = gfn_to_hva(kvm, data >>
> > -			HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> > +		addr = gfn_to_hva(kvm, gfn);
> >  		if (kvm_is_error_hva(addr))
> >  			return 1;
> > +		tsc_ref.tsc_sequence =
> > +			boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? 1 : 0;
> > +		tsc_ref.tsc_scale =
> > +			((10000LL << 32) / vcpu->arch.virtual_tsc_khz) << 32;
> > +		tsc_ref.tsc_offset = 0;
> >  		if (__copy_to_user((void __user *)addr, &tsc_ref, sizeof(tsc_ref)))
> >  			return 1;
> >  		mark_page_dirty(kvm, gfn);
> >  		kvm->arch.hv_tsc_page = data;
> > +		kvm->arch.hv_ref_count = 0;
> >  		break;
> >  	}
> >  	default:
> > @@ -3879,6 +3884,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
> >  		local_irq_enable();
> >  		kvm->arch.kvmclock_offset = delta;
> >  		kvm_gen_update_masterclock(kvm);
> > +
> > +		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
> > +			HV_REFERENCE_TSC_PAGE* tsc_ref;
> > +			u64 curr_time;
> > +			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm, 
> > +				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> > +			tsc_ref->tsc_sequence =
> > +				boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? tsc_ref->tsc_sequence + 1 : 0;
> > +			tsc_ref->tsc_scale = ((10000LL << 32) / __get_cpu_var(cpu_tsc_khz)) << 32;
> > +			curr_time = (((tsc_ref->tsc_scale >> 32) * native_read_tsc()) >> 32) + 
> > +				tsc_ref->tsc_offset;
> > +			tsc_ref->tsc_offset = kvm->arch.hv_ref_time - curr_time;
> > +		}
> >  		break;
> >  	}
> >  	case KVM_GET_CLOCK: {
> > @@ -3896,6 +3914,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
> >  		if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
> >  			goto out;
> >  		r = 0;
> > +		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
> > +			HV_REFERENCE_TSC_PAGE* tsc_ref;
> > +			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm,
> > +				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
> 
> kvm_read_guest_cached.
> 
> > +			kvm->arch.hv_ref_time = (((tsc_ref->tsc_scale >> 32) * 
> > +				native_read_tsc()) >> 32) + tsc_ref->tsc_offset;
> 
> Why native_read_tsc and not ->read_l1_tsc?
> 
> [VR]
> Is it possible to get pointer to the vcpu instance at this point?

See the suggestion to move this code to kvm_guest_time_update.


> Thanks,
> Vadim. 
> 
> It is easier to trust on the host to check reliability of the TSC: if
> it uses TSC clocksource, then the TSCs are stable. So could condition
> exposing the TSC ref page when ka->use_master_clock=1, see kvm_guest_time_update.
> And hook into pvclock_gtod_notify.
> 
> So in addition to X86_FEATURE_CONSTANT_TSC, check
> ka->use_master_clock=1
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2fd0753..81fdff0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -607,6 +607,7 @@  struct kvm_arch {
 	u64 hv_hypercall;
 	u64 hv_ref_count;
 	u64 hv_tsc_page;
+	u64 hv_ref_time;
 
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5e4e495a..cb6766a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1882,14 +1882,19 @@  static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 			break;
 		}
 		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-		addr = gfn_to_hva(kvm, data >>
-			HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
+		addr = gfn_to_hva(kvm, gfn);
 		if (kvm_is_error_hva(addr))
 			return 1;
+		tsc_ref.tsc_sequence =
+			boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? 1 : 0;
+		tsc_ref.tsc_scale =
+			((10000LL << 32) / vcpu->arch.virtual_tsc_khz) << 32;
+		tsc_ref.tsc_offset = 0;
 		if (__copy_to_user((void __user *)addr, &tsc_ref, sizeof(tsc_ref)))
 			return 1;
 		mark_page_dirty(kvm, gfn);
 		kvm->arch.hv_tsc_page = data;
+		kvm->arch.hv_ref_count = 0;
 		break;
 	}
 	default:
@@ -3879,6 +3884,19 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		local_irq_enable();
 		kvm->arch.kvmclock_offset = delta;
 		kvm_gen_update_masterclock(kvm);
+
+		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
+			HV_REFERENCE_TSC_PAGE* tsc_ref;
+			u64 curr_time;
+			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm, 
+				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
+			tsc_ref->tsc_sequence =
+				boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ? tsc_ref->tsc_sequence + 1 : 0;
+			tsc_ref->tsc_scale = ((10000LL << 32) / __get_cpu_var(cpu_tsc_khz)) << 32;
+			curr_time = (((tsc_ref->tsc_scale >> 32) * native_read_tsc()) >> 32) + 
+				tsc_ref->tsc_offset;
+			tsc_ref->tsc_offset = kvm->arch.hv_ref_time - curr_time;
+		}
 		break;
 	}
 	case KVM_GET_CLOCK: {
@@ -3896,6 +3914,13 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
 			goto out;
 		r = 0;
+		if (kvm->arch.hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
+			HV_REFERENCE_TSC_PAGE* tsc_ref;
+			tsc_ref = (HV_REFERENCE_TSC_PAGE*)gfn_to_hva(kvm,
+				kvm->arch.hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT);
+			kvm->arch.hv_ref_time = (((tsc_ref->tsc_scale >> 32) * 
+				native_read_tsc()) >> 32) + tsc_ref->tsc_offset;
+		}
 		break;
 	}