[1/2] KVM: X86: TSCDEADLINE MSR emulation fastpath

Message ID 1587468026-15753-2-git-send-email-wanpengli@tencent.com (mailing list archive)
State New, archived
Series KVM: VMX: Tscdeadline timer emulation fastpath

Commit Message

Wanpeng Li April 21, 2020, 11:20 a.m. UTC
From: Wanpeng Li <wanpengli@tencent.com>

This patch implements a TSCDEADLINE MSR emulation fastpath: after a wrmsr
tscdeadline vmexit, handle it as soon as possible and re-enter the guest
immediately, skipping the various KVM checks when possible.
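
For illustration, the resulting flow on the VMX side looks roughly like this
(a condensed paraphrase of the patch below, not the literal diff):

	static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu)
	{
	continue_vmx_vcpu_run:
		/* ... VM-entry, run the guest, take the VM-exit ... */
		vmx_complete_interrupts(vmx);

		/* WRMSR MSR_IA32_TSCDEADLINE is handled here, with IRQs still off */
		exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
		if (exit_fastpath == EXIT_FASTPATH_CONT_RUN)
			/* the hv timer was already re-armed in the handler;
			 * re-enter the guest without going back through the
			 * outer vcpu_run() loop */
			goto continue_vmx_vcpu_run;

		return exit_fastpath;
	}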

Tested-by: Haiwei Li <lihaiwei@tencent.com>
Cc: Haiwei Li <lihaiwei@tencent.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
---
 arch/x86/include/asm/kvm_host.h |  3 +++
 arch/x86/kvm/lapic.c            | 20 +++-----------
 arch/x86/kvm/lapic.h            | 16 +++++++++++
 arch/x86/kvm/vmx/vmx.c          | 60 +++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/x86.c              | 55 ++++++++++++++++++++++++++++++++-----
 5 files changed, 126 insertions(+), 28 deletions(-)

Comments

Paolo Bonzini April 21, 2020, 11:37 a.m. UTC | #1
On 21/04/20 13:20, Wanpeng Li wrote:
> +	case MSR_IA32_TSCDEADLINE:
> +		if (!kvm_x86_ops.event_needs_reinjection(vcpu)) {
> +			data = kvm_read_edx_eax(vcpu);
> +			if (!handle_fastpath_set_tscdeadline(vcpu, data))
> +				ret = EXIT_FASTPATH_CONT_RUN;
> +		}
>  		break;

Can you explain the event_needs_reinjection case?  Also, does this break
AMD, which does not implement the callback?

> +
> +	reg = kvm_lapic_get_reg(apic, APIC_LVTT);
> +	if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
> +		vector = reg & APIC_VECTOR_MASK;
> +		kvm_lapic_clear_vector(vector, apic->regs + APIC_TMR);
> +
> +		if (vcpu->arch.apicv_active) {
> +			if (pi_test_and_set_pir(vector, &vmx->pi_desc))
> +				return;
> +
> +			if (pi_test_and_set_on(&vmx->pi_desc))
> +				return;
> +
> +			vmx_sync_pir_to_irr(vcpu);
> +		} else {
> +			kvm_lapic_set_irr(vector, apic);
> +			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
> +			vmx_inject_irq(vcpu);
> +		}
> +	}

This is mostly a copy of

               if (kvm_x86_ops.deliver_posted_interrupt(vcpu, vector)) {
                        kvm_lapic_set_irr(vector, apic);
                        kvm_make_request(KVM_REQ_EVENT, vcpu);
                        kvm_vcpu_kick(vcpu);
                }
                break;

(is it required to do vmx_sync_pir_to_irr?).  So you should not special-case
LVTT; move this code to lapic.c instead.  But even before that...

> 
> +
> +	if (kvm_start_hv_timer(apic)) {
> +		if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) {
> +			if (kvm_x86_ops.interrupt_allowed(vcpu)) {
> +				kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
> +				kvm_x86_ops.fast_deliver_interrupt(vcpu);
> +				atomic_set(&apic->lapic_timer.pending, 0);
> +				apic->lapic_timer.tscdeadline = 0;
> +				return 0;
> +			}
> +			return 1;


Is it actually common that the timer is set back in time and therefore
this code is executed?

Paolo
Wanpeng Li April 22, 2020, 12:48 a.m. UTC | #2
On Tue, 21 Apr 2020 at 19:37, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 21/04/20 13:20, Wanpeng Li wrote:
> > +     case MSR_IA32_TSCDEADLINE:
> > +             if (!kvm_x86_ops.event_needs_reinjection(vcpu)) {
> > +                     data = kvm_read_edx_eax(vcpu);
> > +                     if (!handle_fastpath_set_tscdeadline(vcpu, data))
> > +                             ret = EXIT_FASTPATH_CONT_RUN;
> > +             }
> >               break;
>
> Can you explain the event_needs_reinjection case?  Also, does this break

This is used to catch the case where the vmexit occurred while another event
was being delivered to guest software. I moved the
vmx_exit_handlers_fastpath() call after vmx_complete_interrupts(), which will
decode such an event and make kvm_event_needs_reinjection return true.

> AMD which does not implement the callback?

For now I only add the tscdeadline MSR emulation and VMX preemption timer
fastpath pair for the Intel platform.

>
> > +
> > +     reg = kvm_lapic_get_reg(apic, APIC_LVTT);
> > +     if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
> > +             vector = reg & APIC_VECTOR_MASK;
> > +             kvm_lapic_clear_vector(vector, apic->regs + APIC_TMR);
> > +
> > +             if (vcpu->arch.apicv_active) {
> > +                     if (pi_test_and_set_pir(vector, &vmx->pi_desc))
> > +                             return;
> > +
> > +                     if (pi_test_and_set_on(&vmx->pi_desc))
> > +                             return;
> > +
> > +                     vmx_sync_pir_to_irr(vcpu);
> > +             } else {
> > +                     kvm_lapic_set_irr(vector, apic);
> > +                     kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
> > +                     vmx_inject_irq(vcpu);
> > +             }
> > +     }
>
> This is mostly a copy of
>
>                if (kvm_x86_ops.deliver_posted_interrupt(vcpu, vector)) {
>                         kvm_lapic_set_irr(vector, apic);
>                         kvm_make_request(KVM_REQ_EVENT, vcpu);
>                         kvm_vcpu_kick(vcpu);
>                 }
>                 break;
>
> (is it required to do vmx_sync_pir_to_irr?).  So you should not special

I observe that sending the notification vector as in
kvm_x86_ops.deliver_posted_interrupt() is ~900 cycles worse than
vmx_sync_pir_to_irr in my case: it has to wait for guest vmentry, then for
the physical cpu to ack the notification vector, read the posted-interrupt
descriptor, etc. For the non-APICv part, the original copy has to wait for
inject_pending_event to do this work.

> case LVTT and move this code to lapic.c instead.  But even before that...
>
> >
> > +
> > +     if (kvm_start_hv_timer(apic)) {
> > +             if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) {
> > +                     if (kvm_x86_ops.interrupt_allowed(vcpu)) {
> > +                             kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
> > +                             kvm_x86_ops.fast_deliver_interrupt(vcpu);
> > +                             atomic_set(&apic->lapic_timer.pending, 0);
> > +                             apic->lapic_timer.tscdeadline = 0;
> > +                             return 0;
> > +                     }
> > +                     return 1;
>
>
> Is it actually common that the timer is set back in time and therefore
> this code is executed?

It is used to handle the already-expired timer.

    Wanpeng
Paolo Bonzini April 22, 2020, 8:38 a.m. UTC | #3
On 22/04/20 02:48, Wanpeng Li wrote:
> On Tue, 21 Apr 2020 at 19:37, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> On 21/04/20 13:20, Wanpeng Li wrote:
>>> +     case MSR_IA32_TSCDEADLINE:
>>> +             if (!kvm_x86_ops.event_needs_reinjection(vcpu)) {
>>> +                     data = kvm_read_edx_eax(vcpu);
>>> +                     if (!handle_fastpath_set_tscdeadline(vcpu, data))
>>> +                             ret = EXIT_FASTPATH_CONT_RUN;
>>> +             }
>>>               break;
>> Can you explain the event_needs_reinjection case?  Also, does this break
> This is used to catch the case vmexit occurred while another event was
> being delivered to guest software, I move the
> vmx_exit_handlers_fastpath() call after vmx_complete_interrupts()
> which will decode such event and make kvm_event_needs_reinjection
> return true.

This doesn't need a callback; kvm_event_needs_reinjection should be enough.

You should also check other conditions such as

	vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
            || need_resched() || signal_pending(current)

before doing CONT_RUN.
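
For illustration, such a guard could look roughly like this (the helper name
vmx_can_cont_run is invented here; it just combines kvm_event_needs_reinjection
with the checks above):

	static bool vmx_can_cont_run(struct kvm_vcpu *vcpu)
	{
		return !kvm_event_needs_reinjection(vcpu) &&
		       vcpu->mode != EXITING_GUEST_MODE &&
		       !kvm_request_pending(vcpu) &&
		       !need_resched() &&
		       !signal_pending(current);
	}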

>> AMD which does not implement the callback?
> Now I add the tscdeadline msr emulation and vmx-preemption timer
> fastpath pair for Intel platform.

But this would cause a crash if you run on AMD.

Paolo
Wanpeng Li April 22, 2020, 8:46 a.m. UTC | #4
On Wed, 22 Apr 2020 at 16:38, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 22/04/20 02:48, Wanpeng Li wrote:
> > On Tue, 21 Apr 2020 at 19:37, Paolo Bonzini <pbonzini@redhat.com> wrote:
> >> On 21/04/20 13:20, Wanpeng Li wrote:
> >>> +     case MSR_IA32_TSCDEADLINE:
> >>> +             if (!kvm_x86_ops.event_needs_reinjection(vcpu)) {
> >>> +                     data = kvm_read_edx_eax(vcpu);
> >>> +                     if (!handle_fastpath_set_tscdeadline(vcpu, data))
> >>> +                             ret = EXIT_FASTPATH_CONT_RUN;
> >>> +             }
> >>>               break;
> >> Can you explain the event_needs_reinjection case?  Also, does this break
> > This is used to catch the case vmexit occurred while another event was
> > being delivered to guest software, I move the
> > vmx_exit_handlers_fastpath() call after vmx_complete_interrupts()
> > which will decode such event and make kvm_event_needs_reinjection
> > return true.
>
> This doesn't need a callback, kvm_event_needs_reinjection should be enough.
>
> You should also check other conditions such as
>
>         vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
>             || need_resched() || signal_pending(current)
>
> before doing CONT_RUN.

Agreed.

>
> >> AMD which does not implement the callback?
> > Now I add the tscdeadline msr emulation and vmx-preemption timer
> > fastpath pair for Intel platform.
>
> But this would cause a crash if you run on AMD.

Yes. :)

    Wanpeng

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f26df2c..1626ce3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -188,6 +188,7 @@  enum {
 enum exit_fastpath_completion {
 	EXIT_FASTPATH_NONE,
 	EXIT_FASTPATH_SKIP_EMUL_INS,
+	EXIT_FASTPATH_CONT_RUN,
 };
 
 struct x86_emulate_ctxt;
@@ -1265,6 +1266,8 @@  struct kvm_x86_ops {
 
 	bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
 	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
+	void (*fast_deliver_interrupt)(struct kvm_vcpu *vcpu);
+	bool (*event_needs_reinjection)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_x86_init_ops {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 38f7dc9..9e54301 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -91,6 +91,7 @@  static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
 }
 
 struct static_key_deferred apic_hw_disabled __read_mostly;
+EXPORT_SYMBOL_GPL(apic_hw_disabled);
 struct static_key_deferred apic_sw_disabled __read_mostly;
 
 static inline int apic_enabled(struct kvm_lapic *apic)
@@ -311,21 +312,6 @@  static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
 	return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
 }
 
-static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
-{
-	return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
-}
-
-static inline int apic_lvtt_period(struct kvm_lapic *apic)
-{
-	return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC;
-}
-
-static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
-{
-	return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE;
-}
-
 static inline int apic_lvt_nmi_mode(u32 lvt_val)
 {
 	return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
@@ -1781,7 +1767,7 @@  static void cancel_hv_timer(struct kvm_lapic *apic)
 	apic->lapic_timer.hv_timer_in_use = false;
 }
 
-static bool start_hv_timer(struct kvm_lapic *apic)
+bool kvm_start_hv_timer(struct kvm_lapic *apic)
 {
 	struct kvm_timer *ktimer = &apic->lapic_timer;
 	struct kvm_vcpu *vcpu = apic->vcpu;
@@ -1847,7 +1833,7 @@  static void restart_apic_timer(struct kvm_lapic *apic)
 	if (!apic_lvtt_period(apic) && atomic_read(&apic->lapic_timer.pending))
 		goto out;
 
-	if (!start_hv_timer(apic))
+	if (!kvm_start_hv_timer(apic))
 		start_sw_timer(apic);
 out:
 	preempt_enable();
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 7f15f9e..4c917fd 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -251,6 +251,7 @@  void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
 bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu);
+bool kvm_start_hv_timer(struct kvm_lapic *apic);
 
 static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
 {
@@ -262,4 +263,19 @@  static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
 	return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
 }
 
+static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
+{
+	return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
+}
+
+static inline int apic_lvtt_period(struct kvm_lapic *apic)
+{
+	return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC;
+}
+
+static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
+{
+	return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE;
+}
+
 #endif
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 766303b..7688e40 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6559,6 +6559,54 @@  void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
 	}
 }
 
+static bool vmx_event_needs_reinjection(struct kvm_vcpu *vcpu)
+{
+	return (to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+		kvm_event_needs_reinjection(vcpu);
+}
+
+static void vmx_fast_deliver_interrupt(struct kvm_vcpu *vcpu)
+{
+	u32 reg;
+	int vector;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	reg = kvm_lapic_get_reg(apic, APIC_LVTT);
+	if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+		vector = reg & APIC_VECTOR_MASK;
+		kvm_lapic_clear_vector(vector, apic->regs + APIC_TMR);
+
+		if (vcpu->arch.apicv_active) {
+			if (pi_test_and_set_pir(vector, &vmx->pi_desc))
+				return;
+
+			if (pi_test_and_set_on(&vmx->pi_desc))
+				return;
+
+			vmx_sync_pir_to_irr(vcpu);
+		} else {
+			kvm_lapic_set_irr(vector, apic);
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
+			vmx_inject_irq(vcpu);
+		}
+	}
+}
+
+static enum exit_fastpath_completion vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+	if (!is_guest_mode(vcpu)) {
+		switch (to_vmx(vcpu)->exit_reason) {
+		case EXIT_REASON_MSR_WRITE:
+			return handle_fastpath_set_msr_irqoff(vcpu);
+		default:
+			return EXIT_FASTPATH_NONE;
+		}
+	}
+
+	return EXIT_FASTPATH_NONE;
+}
+
 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
 
 static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu)
@@ -6566,6 +6614,7 @@  static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	enum exit_fastpath_completion exit_fastpath;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long cr3, cr4;
+continue_vmx_vcpu_run:
 
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!enable_vnmi &&
@@ -6733,17 +6782,16 @@  static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
 		return EXIT_FASTPATH_NONE;
 
-	if (!is_guest_mode(vcpu) && vmx->exit_reason == EXIT_REASON_MSR_WRITE)
-		exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
-	else
-		exit_fastpath = EXIT_FASTPATH_NONE;
-
 	vmx->loaded_vmcs->launched = 1;
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
 	vmx_recover_nmi_blocking(vmx);
 	vmx_complete_interrupts(vmx);
 
+	exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
+	if (exit_fastpath == EXIT_FASTPATH_CONT_RUN)
+		goto continue_vmx_vcpu_run;
+
 	return exit_fastpath;
 }
 
@@ -7885,6 +7933,8 @@  static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.nested_get_evmcs_version = NULL,
 	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
 	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+	.fast_deliver_interrupt = vmx_fast_deliver_interrupt,
+	.event_needs_reinjection = vmx_event_needs_reinjection,
 };
 
 static __init int hardware_setup(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 59958ce..9c6733d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1609,27 +1609,70 @@  static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
 	return 1;
 }
 
+static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
+			apic_lvtt_period(apic))
+		return 0;
+
+	if (!kvm_x86_ops.set_hv_timer ||
+		kvm_mwait_in_guest(vcpu->kvm) ||
+		kvm_can_post_timer_interrupt(vcpu))
+		return 1;
+
+	hrtimer_cancel(&apic->lapic_timer.timer);
+	apic->lapic_timer.tscdeadline = data;
+	atomic_set(&apic->lapic_timer.pending, 0);
+
+	if (kvm_start_hv_timer(apic)) {
+		if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) {
+			if (kvm_x86_ops.interrupt_allowed(vcpu)) {
+				kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
+				kvm_x86_ops.fast_deliver_interrupt(vcpu);
+				atomic_set(&apic->lapic_timer.pending, 0);
+				apic->lapic_timer.tscdeadline = 0;
+				return 0;
+			}
+			return 1;
+		}
+		return 0;
+	}
+
+	return 1;
+}
+
 enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
 {
 	u32 msr = kvm_rcx_read(vcpu);
 	u64 data;
-	int ret = 0;
+	int ret = EXIT_FASTPATH_NONE;
 
 	switch (msr) {
 	case APIC_BASE_MSR + (APIC_ICR >> 4):
 		data = kvm_read_edx_eax(vcpu);
-		ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
+		if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data))
+			ret = EXIT_FASTPATH_SKIP_EMUL_INS;
+		break;
+	case MSR_IA32_TSCDEADLINE:
+		if (!kvm_x86_ops.event_needs_reinjection(vcpu)) {
+			data = kvm_read_edx_eax(vcpu);
+			if (!handle_fastpath_set_tscdeadline(vcpu, data))
+				ret = EXIT_FASTPATH_CONT_RUN;
+		}
 		break;
 	default:
-		return EXIT_FASTPATH_NONE;
+		ret = EXIT_FASTPATH_NONE;
 	}
 
-	if (!ret) {
+	if (ret != EXIT_FASTPATH_NONE) {
 		trace_kvm_msr_write(msr, data);
-		return EXIT_FASTPATH_SKIP_EMUL_INS;
+		if (ret == EXIT_FASTPATH_CONT_RUN)
+			kvm_skip_emulated_instruction(vcpu);
 	}
 
-	return EXIT_FASTPATH_NONE;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);