
KVM: X86: set EXITING_GUEST_MODE as soon as vCPU exits

Message ID 20221129182226.82087-1-jon@nutanix.com (mailing list archive)
State New, archived
Series KVM: X86: set EXITING_GUEST_MODE as soon as vCPU exits

Commit Message

Jon Kohler Nov. 29, 2022, 6:22 p.m. UTC
Set vcpu->mode to EXITING_GUEST_MODE as soon as the vCPU exits to
reflect that we are indeed exiting guest mode, but not quite out of
guest mode yet. Note: this is done lazily, without an explicit memory
barrier, so that we do not regress the cost of the critical path from
the exit to the exit handler.

Flip back to IN_GUEST_MODE for exits that use
EXIT_FASTPATH_REENTER_GUEST, such that we are IN_GUEST_MODE upon
reentry.

Changing vcpu->mode away from IN_GUEST_MODE as early as possible
gives IPI senders as much runway as possible to avoid ringing the
doorbell (AMD) or sending the posted-interrupt IPI (Intel). Since
this is done without an explicit memory barrier, the worst case is
that the IPI sender still sees IN_GUEST_MODE and sends a spurious
event, which is the behavior prior to this patch.

Signed-off-by: Jon Kohler <jon@nutanix.com>
---
 arch/x86/kvm/svm/svm.c |  7 +++++++
 arch/x86/kvm/vmx/vmx.c | 23 +++++++++++++++++++++++
 arch/x86/kvm/x86.c     |  8 ++++++++
 3 files changed, 38 insertions(+)
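
For context, the sender-side check this change targets is the posted-interrupt
notification path, which only sends the IPI while the target vCPU still reads
as IN_GUEST_MODE. A simplified paraphrase of the existing Intel-side helper
(not part of this patch; exact code varies by kernel version):

static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
						     int pi_vec)
{
#ifdef CONFIG_SMP
	/*
	 * Only send the notification IPI while the target vCPU is still
	 * IN_GUEST_MODE; otherwise the PIR is synced to the vIRR on the
	 * next VM-Enter and the IPI can be skipped.
	 */
	if (vcpu->mode == IN_GUEST_MODE) {
		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
		return true;
	}
#endif
	return false;
}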

Comments

Maxim Levitsky Nov. 29, 2022, 7:47 p.m. UTC | #1
On Tue, 2022-11-29 at 13:22 -0500, Jon Kohler wrote:
> Set vcpu->mode to EXITING_GUEST_MODE as soon vCPU exits to reflect
> that we are indeed exiting guest mode, but not quite out of guest
> mode yet. Note: This is done lazily without an explicit memory
> barrier so that we do not regress the cost in the critical path
> of going from the exit to the exit handler.
> 
> Flip back to IN_GUEST_MODE for exits that use
> EXIT_FASTPATH_REENTER_GUEST, such that we are IN_GUEST_MODE upon
> reentry.
> 
> Changing vcpu->mode away from IN_GUEST_MODE as early as possible
> gives IPI senders as much runway as possible to avoid ringing
> doorbell or sending posted interrupt IPI in AMD and Intel,
> respectively. Since this is done without an explicit memory
> barrier, the worst case is that the IPI sender sees IN_GUEST_MODE
> still and sends a spurious event, which is the behavior prior
> to this patch.

Beware that we had a king-sized bug in regard to AVIC inhibition races
vs. guest entries, so this should be carefully checked for that race.

Also, do you have any perf numbers to see if that actually improves performance?
(I am just curious, I do think that this can improve performance).

Best regards,
	Maxim Levitsky


> 
> Signed-off-by: Jon Kohler <jon@nutanix.com>
> ---
>  arch/x86/kvm/svm/svm.c |  7 +++++++
>  arch/x86/kvm/vmx/vmx.c | 23 +++++++++++++++++++++++
>  arch/x86/kvm/x86.c     |  8 ++++++++
>  3 files changed, 38 insertions(+)
> 
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index ce362e88a567..5f0c118a3ffd 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -3907,6 +3907,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
>  	else
>  		__svm_vcpu_run(svm, spec_ctrl_intercepted);
>  
> +	/* Optimize IPI reduction by setting mode immediately after vmexit
> +	 * without a memmory barrier as this as not paired anywhere. vcpu->mode
> +	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
> +	 * barrier, after the host is done fully restoring various host states.
> +	 */
> +	vcpu->mode = EXITING_GUEST_MODE;
> +
>  	guest_state_exit_irqoff();
>  }
>  
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 63247c57c72c..243dcb87c727 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5878,6 +5878,17 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
>  
>  	if (!vmx->req_immediate_exit &&
>  	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
> +		/* Reset IN_GUEST_MODE since we're going to reenter
> +		 * guest as part of this fast path. This is done as
> +		 * an optimization without a memory barrier since
> +		 * EXITING_GUEST_MODE is also set without a memory
> +		 * barrier. This also needs to be reset prior to
> +		 * calling apic_timer_expired() so that
> +		 * kvm_use_posted_timer_interrupt() returns the proper
> +		 * value.
> +		 */
> +		if (vcpu->mode == EXITING_GUEST_MODE)
> +			vcpu->mode = IN_GUEST_MODE;
>  		kvm_lapic_expired_hv_timer(vcpu);
>  		return EXIT_FASTPATH_REENTER_GUEST;
>  	}
> @@ -7031,6 +7042,18 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
>  void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
>  					unsigned int flags)
>  {
> +	struct kvm_vcpu *vcpu = &vmx->vcpu;
> +
> +	/* Optimize IPI reduction by setting mode immediately after vmexit
> +	 * without a memmory barrier as this as not paired anywhere. vcpu->mode
> +	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
> +	 * barrier, after the host is done fully restoring various host states.
> +	 * Since the rdmsr and wrmsr below are expensive, this must be done
> +	 * first, so that the IPI suppression window covers the time dealing
> +	 * with fixing up SPEC_CTRL.
> +	 */
> +	vcpu->mode = EXITING_GUEST_MODE;
> +
>  	u64 hostval = this_cpu_read(x86_spec_ctrl_current);
>  
>  	if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 2835bd796639..0e0d228f3fa5 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2160,6 +2160,14 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
>  		data = kvm_read_edx_eax(vcpu);
>  		if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
>  			kvm_skip_emulated_instruction(vcpu);
> +			/* Reset IN_GUEST_MODE since we're going to reenter
> +			 * guest as part of this fast path. This is done as
> +			 * an optimization without a memory barrier since
> +			 * EXITING_GUEST_MODE is also set without a memory
> +			 * barrier.
> +			 */
> +			if (vcpu->mode == EXITING_GUEST_MODE)
> +				vcpu->mode = IN_GUEST_MODE;
>  			ret = EXIT_FASTPATH_REENTER_GUEST;
>  		}
>  		break;
Jon Kohler Nov. 29, 2022, 7:56 p.m. UTC | #2
> On Nov 29, 2022, at 2:47 PM, Maxim Levitsky <mlevitsk@redhat.com> wrote:
> 
> On Tue, 2022-11-29 at 13:22 -0500, Jon Kohler wrote:
>> Set vcpu->mode to EXITING_GUEST_MODE as soon vCPU exits to reflect
>> that we are indeed exiting guest mode, but not quite out of guest
>> mode yet. Note: This is done lazily without an explicit memory
>> barrier so that we do not regress the cost in the critical path
>> of going from the exit to the exit handler.
>> 
>> Flip back to IN_GUEST_MODE for exits that use
>> EXIT_FASTPATH_REENTER_GUEST, such that we are IN_GUEST_MODE upon
>> reentry.
>> 
>> Changing vcpu->mode away from IN_GUEST_MODE as early as possible
>> gives IPI senders as much runway as possible to avoid ringing
>> doorbell or sending posted interrupt IPI in AMD and Intel,
>> respectively. Since this is done without an explicit memory
>> barrier, the worst case is that the IPI sender sees IN_GUEST_MODE
>> still and sends a spurious event, which is the behavior prior
>> to this patch.
> 
> Beware that we had a king sized bug in regard to AVIC inhibition races
> vs guest entries, this this should be carefully checked for this.

Thanks, Maxim - any pointers on what we should be looking for here?

> 
> Also, do you have any perf numbers to see if that actually improves performance?
> (I am just curious, I do think that this can improve performance).
> 

Yes indeed! Sorry, I should have put that right in the commit msg as a note,
but using the kvm-unit-tests vmexit_ipi test with -smp 20 on an Intel 8168,
it's roughly ~3% better (~3325-ish vs ~3400-ish), though the test is a bit
noisy even with taskset to a single socket.

To help validate that we were even getting *any* benefit, in a local build
I added a log statement (rough code below) to the IPI delivery path, and did
see many, many IPIs getting suppressed that would have otherwise fired.

kvm_vcpu_trigger_posted_interrupt() {
...
    if (vcpu->mode == EXITING_GUEST_MODE) {
        pr_warn_ratelimited("exiting suppression worked");
    }
...
}

> Best regards,
> 	Maxim Levitsky
> 
> 
>> 
>> Signed-off-by: Jon Kohler <jon@nutanix.com>
>> ---
>> arch/x86/kvm/svm/svm.c |  7 +++++++
>> arch/x86/kvm/vmx/vmx.c | 23 +++++++++++++++++++++++
>> arch/x86/kvm/x86.c     |  8 ++++++++
>> 3 files changed, 38 insertions(+)
>> 
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index ce362e88a567..5f0c118a3ffd 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -3907,6 +3907,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
>> 	else
>> 		__svm_vcpu_run(svm, spec_ctrl_intercepted);
>> 
>> +	/* Optimize IPI reduction by setting mode immediately after vmexit
>> +	 * without a memmory barrier as this as not paired anywhere. vcpu->mode
>> +	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
>> +	 * barrier, after the host is done fully restoring various host states.
>> +	 */
>> +	vcpu->mode = EXITING_GUEST_MODE;
>> +
>> 	guest_state_exit_irqoff();
>> }
>> 
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 63247c57c72c..243dcb87c727 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -5878,6 +5878,17 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
>> 
>> 	if (!vmx->req_immediate_exit &&
>> 	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
>> +		/* Reset IN_GUEST_MODE since we're going to reenter
>> +		 * guest as part of this fast path. This is done as
>> +		 * an optimization without a memory barrier since
>> +		 * EXITING_GUEST_MODE is also set without a memory
>> +		 * barrier. This also needs to be reset prior to
>> +		 * calling apic_timer_expired() so that
>> +		 * kvm_use_posted_timer_interrupt() returns the proper
>> +		 * value.
>> +		 */
>> +		if (vcpu->mode == EXITING_GUEST_MODE)
>> +			vcpu->mode = IN_GUEST_MODE;
>> 		kvm_lapic_expired_hv_timer(vcpu);
>> 		return EXIT_FASTPATH_REENTER_GUEST;
>> 	}
>> @@ -7031,6 +7042,18 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
>> void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
>> 					unsigned int flags)
>> {
>> +	struct kvm_vcpu *vcpu = &vmx->vcpu;
>> +
>> +	/* Optimize IPI reduction by setting mode immediately after vmexit
>> +	 * without a memmory barrier as this as not paired anywhere. vcpu->mode
>> +	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
>> +	 * barrier, after the host is done fully restoring various host states.
>> +	 * Since the rdmsr and wrmsr below are expensive, this must be done
>> +	 * first, so that the IPI suppression window covers the time dealing
>> +	 * with fixing up SPEC_CTRL.
>> +	 */
>> +	vcpu->mode = EXITING_GUEST_MODE;
>> +
>> 	u64 hostval = this_cpu_read(x86_spec_ctrl_current);
>> 
>> 	if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 2835bd796639..0e0d228f3fa5 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -2160,6 +2160,14 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
>> 		data = kvm_read_edx_eax(vcpu);
>> 		if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
>> 			kvm_skip_emulated_instruction(vcpu);
>> +			/* Reset IN_GUEST_MODE since we're going to reenter
>> +			 * guest as part of this fast path. This is done as
>> +			 * an optimization without a memory barrier since
>> +			 * EXITING_GUEST_MODE is also set without a memory
>> +			 * barrier.
>> +			 */
>> +			if (vcpu->mode == EXITING_GUEST_MODE)
>> +				vcpu->mode = IN_GUEST_MODE;
>> 			ret = EXIT_FASTPATH_REENTER_GUEST;
>> 		}
>> 		break;
Chao Gao Nov. 30, 2022, 6:29 a.m. UTC | #3
On Tue, Nov 29, 2022 at 01:22:25PM -0500, Jon Kohler wrote:
>@@ -7031,6 +7042,18 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
> void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
> 					unsigned int flags)
> {
>+	struct kvm_vcpu *vcpu = &vmx->vcpu;
>+
>+	/* Optimize IPI reduction by setting mode immediately after vmexit
>+	 * without a memmory barrier as this as not paired anywhere. vcpu->mode
>+	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
>+	 * barrier, after the host is done fully restoring various host states.
>+	 * Since the rdmsr and wrmsr below are expensive, this must be done
>+	 * first, so that the IPI suppression window covers the time dealing
>+	 * with fixing up SPEC_CTRL.
>+	 */
>+	vcpu->mode = EXITING_GUEST_MODE;

Does this break kvm_vcpu_kick()? IIUC, kvm_vcpu_kick() does nothing if
vcpu->mode is already EXITING_GUEST_MODE, expecting the vCPU will exit
guest mode. But ...

>+
> 	u64 hostval = this_cpu_read(x86_spec_ctrl_current);
> 
> 	if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
>diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>index 2835bd796639..0e0d228f3fa5 100644
>--- a/arch/x86/kvm/x86.c
>+++ b/arch/x86/kvm/x86.c
>@@ -2160,6 +2160,14 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
> 		data = kvm_read_edx_eax(vcpu);
> 		if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
> 			kvm_skip_emulated_instruction(vcpu);
>+			/* Reset IN_GUEST_MODE since we're going to reenter
>+			 * guest as part of this fast path. This is done as
>+			 * an optimization without a memory barrier since
>+			 * EXITING_GUEST_MODE is also set without a memory
>+			 * barrier.
>+			 */
>+			if (vcpu->mode == EXITING_GUEST_MODE)
>+				vcpu->mode = IN_GUEST_MODE;

... the vCPU enters guest mode again.

I believe a direct mode transition from EXITING_GUEST_MODE to IN_GUEST_MODE
isn't valid in current KVM.
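
For reference, the kick path in question is roughly the following (simplified;
exact code varies by kernel version):

/* include/linux/kvm_host.h */
static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
{
	return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE);
}

/* arch/x86/kvm/x86.c */
bool kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
	/* The kick IPI is only sent if the vCPU was still IN_GUEST_MODE. */
	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
}

i.e. a vCPU already marked EXITING_GUEST_MODE is assumed to be on its way out
of the run loop and is not kicked again.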
Jon Kohler Nov. 30, 2022, 2:07 p.m. UTC | #4
> On Nov 30, 2022, at 1:29 AM, Chao Gao <chao.gao@intel.com> wrote:
> 

Chao, while I’ve got you here: I was inspired to tune up the software side here
based on the VT-d suppress-notifications change we had been talking about. Any
chance we could get the v4 of that? Seemed like it was almost done, yeah? Would
love to get our hands on that to help accelerate the VT-d path.


> On Tue, Nov 29, 2022 at 01:22:25PM -0500, Jon Kohler wrote:
>> @@ -7031,6 +7042,18 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
>> void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
>> 					unsigned int flags)
>> {
>> +	struct kvm_vcpu *vcpu = &vmx->vcpu;
>> +
>> +	/* Optimize IPI reduction by setting mode immediately after vmexit
>> +	 * without a memmory barrier as this as not paired anywhere. vcpu->mode
>> +	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
>> +	 * barrier, after the host is done fully restoring various host states.
>> +	 * Since the rdmsr and wrmsr below are expensive, this must be done
>> +	 * first, so that the IPI suppression window covers the time dealing
>> +	 * with fixing up SPEC_CTRL.
>> +	 */
>> +	vcpu->mode = EXITING_GUEST_MODE;
> 
> Does this break kvm_vcpu_kick()? IIUC, kvm_vcpu_kick() does nothing if
> vcpu->mode is already EXITING_GUEST_MODE, expecting the vCPU will exit
> guest mode. But ...

IIRC that’d only be a problem for fast path exits that reenter the guest (like
TSC deadline); everything else *will* eventually exit out to kernel mode to
pick up whatever other requests may be pending. In this sense, this patch is
actually even better for kick because we will send incrementally fewer
spurious kicks.

Even then, for fast path reentry exits, a guest is likely to exit all the way out eventually
for something else soon enough, so worst case something gets a wee bit more delayed
than it should. Small price to pay for clawing back cycles on the IPI send side I think.

> 
>> +
>> 	u64 hostval = this_cpu_read(x86_spec_ctrl_current);
>> 
>> 	if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 2835bd796639..0e0d228f3fa5 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -2160,6 +2160,14 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
>> 		data = kvm_read_edx_eax(vcpu);
>> 		if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
>> 			kvm_skip_emulated_instruction(vcpu);
>> +			/* Reset IN_GUEST_MODE since we're going to reenter
>> +			 * guest as part of this fast path. This is done as
>> +			 * an optimization without a memory barrier since
>> +			 * EXITING_GUEST_MODE is also set without a memory
>> +			 * barrier.
>> +			 */
>> +			if (vcpu->mode == EXITING_GUEST_MODE)
>> +				vcpu->mode = IN_GUEST_MODE;
> 
> ... the vCPU enters guest mode again.
> 
> I believe mode transition from EXITING_GUEST_MODE to IN_GUEST_MODE
> directly isn't valid for current KVM.

You’re correct that we do not have this pattern in KVM today; however, I
couldn’t find anything in the vcpu-requests documentation that specifically
says this isn’t allowed. Furthermore, this really is the spirit of what we’re
doing, e.g.:

We are indeed exiting guest mode, but because there is a fast path, we aren’t
exiting anymore, so we need to flip the state back to in guest mode.

This code just makes that sentiment clear in code, and just so happens to also
further suppress IPI sends during that small window.
Chao Gao Dec. 1, 2022, 4:55 a.m. UTC | #5
On Wed, Nov 30, 2022 at 02:07:57PM +0000, Jon Kohler wrote:
>
>
>> On Nov 30, 2022, at 1:29 AM, Chao Gao <chao.gao@intel.com> wrote:
>> 
>
>Chao while I’ve got you here, I was inspired to tune up the software side here based
>on the VTD suppress notifications change we had been talking about. Any chance
>we could get the v4 of that? Seemed like it was almost done, yea? Would love to 

I didn't post a new version because there was no feedback on v3. But
considering there is a mistake in v3, I will fix it and post v4.

>get our hands on that to help accelerate the VTD path.
>
>
>> On Tue, Nov 29, 2022 at 01:22:25PM -0500, Jon Kohler wrote:
>>> @@ -7031,6 +7042,18 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
>>> void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
>>> 					unsigned int flags)
>>> {
>>> +	struct kvm_vcpu *vcpu = &vmx->vcpu;
>>> +
>>> +	/* Optimize IPI reduction by setting mode immediately after vmexit
>>> +	 * without a memmory barrier as this as not paired anywhere. vcpu->mode
>>> +	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
>>> +	 * barrier, after the host is done fully restoring various host states.
>>> +	 * Since the rdmsr and wrmsr below are expensive, this must be done
>>> +	 * first, so that the IPI suppression window covers the time dealing
>>> +	 * with fixing up SPEC_CTRL.
>>> +	 */
>>> +	vcpu->mode = EXITING_GUEST_MODE;
>> 
>> Does this break kvm_vcpu_kick()? IIUC, kvm_vcpu_kick() does nothing if
>> vcpu->mode is already EXITING_GUEST_MODE, expecting the vCPU will exit
>> guest mode. But ...
>
>IIRC that’d only be a problem for fast path exits that reenter guest (like TSC Deadline)
>everything else *will* eventually exit out to kernel mode to pickup whatever other
>requests may be pending. In this sense, this patch is actually even better for kick
>because we will send incrementally less spurious kicks.

Yes. I agree.

>
>Even then, for fast path reentry exits, a guest is likely to exit all the way out eventually
>for something else soon enough, so worst case something gets a wee bit more delayed
>than it should. Small price to pay for clawing back cycles on the IPI send side I think.

Thanks for the above clarification. On second thought, for the fastpath there
is a call to kvm_vcpu_exit_request() before re-entry. This call guarantees
that vCPUs will exit guest mode if any request is pending. So this change
actually won't lead to a delay in handling pending events.
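
For reference, that check is roughly (simplified; may differ by kernel
version):

/* arch/x86/kvm/x86.c */
bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
	/*
	 * A pending request, or vcpu->mode already being EXITING_GUEST_MODE,
	 * makes the fastpath bail out to the full run loop instead of
	 * re-entering the guest, so a suppressed kick is not lost.
	 */
	return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
		xfer_to_guest_mode_work_pending();
}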
Jon Kohler Dec. 1, 2022, 2:57 p.m. UTC | #6
> On Nov 30, 2022, at 11:55 PM, Chao Gao <chao.gao@intel.com> wrote:
> 
> On Wed, Nov 30, 2022 at 02:07:57PM +0000, Jon Kohler wrote:
>> 
>> 
>>> On Nov 30, 2022, at 1:29 AM, Chao Gao <chao.gao@intel.com> wrote:
>>> 
>> 
>> Chao while I’ve got you here, I was inspired to tune up the software side here based
>> on the VTD suppress notifications change we had been talking about. Any chance
>> we could get the v4 of that? Seemed like it was almost done, yea? Would love to 
> 
> I didn't post a new version because there is no feedback on v3. But
> considering there is a mistake in v3, I will fix it and post v4.

Ok, thanks! Looking forward to that. Between that patch and this one, there
should be a great combined impact. Any chance you can apply my patch and yours
together and see how it works? I’d imagine it isn’t as applicable with IPIv,
but it’d still be interesting to see how the numbers work out with your
benchmark with/without IPIv, to see if your test sees a speedup here too.

> 
>> get our hands on that to help accelerate the VTD path.
>> 
>> 
>>> On Tue, Nov 29, 2022 at 01:22:25PM -0500, Jon Kohler wrote:
>>>> @@ -7031,6 +7042,18 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
>>>> void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
>>>> 					unsigned int flags)
>>>> {
>>>> +	struct kvm_vcpu *vcpu = &vmx->vcpu;
>>>> +
>>>> +	/* Optimize IPI reduction by setting mode immediately after vmexit
>>>> +	 * without a memmory barrier as this as not paired anywhere. vcpu->mode
>>>> +	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
>>>> +	 * barrier, after the host is done fully restoring various host states.
>>>> +	 * Since the rdmsr and wrmsr below are expensive, this must be done
>>>> +	 * first, so that the IPI suppression window covers the time dealing
>>>> +	 * with fixing up SPEC_CTRL.
>>>> +	 */
>>>> +	vcpu->mode = EXITING_GUEST_MODE;
>>> 
>>> Does this break kvm_vcpu_kick()? IIUC, kvm_vcpu_kick() does nothing if
>>> vcpu->mode is already EXITING_GUEST_MODE, expecting the vCPU will exit
>>> guest mode. But ...
>> 
>> IIRC that’d only be a problem for fast path exits that reenter guest (like TSC Deadline)
>> everything else *will* eventually exit out to kernel mode to pickup whatever other
>> requests may be pending. In this sense, this patch is actually even better for kick
>> because we will send incrementally less spurious kicks.
> 
> Yes. I agree.
> 
>> 
>> Even then, for fast path reentry exits, a guest is likely to exit all the way out eventually
>> for something else soon enough, so worst case something gets a wee bit more delayed
>> than it should. Small price to pay for clawing back cycles on the IPI send side I think.
> 
> Thanks for above clarification. On second thoughts, for fastpath, there is a
> call of kvm_vcpu_exit_request() before re-entry. This call guarantees that
> vCPUs will exit guest mode if any request pending. So, this change actually
> won't lead to a delay in handling pending events.

Ok, thanks. I know this week tends to be a slow(er) week in the US coming back
from the holidays, so I will wait for additional review / comments here.
Maxim Levitsky Dec. 1, 2022, 3:45 p.m. UTC | #7
On Tue, 2022-11-29 at 19:56 +0000, Jon Kohler wrote:
> > On Nov 29, 2022, at 2:47 PM, Maxim Levitsky <mlevitsk@redhat.com> wrote:
> > 
> > On Tue, 2022-11-29 at 13:22 -0500, Jon Kohler wrote:
> > > Set vcpu->mode to EXITING_GUEST_MODE as soon vCPU exits to reflect
> > > that we are indeed exiting guest mode, but not quite out of guest
> > > mode yet. Note: This is done lazily without an explicit memory
> > > barrier so that we do not regress the cost in the critical path
> > > of going from the exit to the exit handler.
> > > 
> > > Flip back to IN_GUEST_MODE for exits that use
> > > EXIT_FASTPATH_REENTER_GUEST, such that we are IN_GUEST_MODE upon
> > > reentry.
> > > 
> > > Changing vcpu->mode away from IN_GUEST_MODE as early as possible
> > > gives IPI senders as much runway as possible to avoid ringing
> > > doorbell or sending posted interrupt IPI in AMD and Intel,
> > > respectively. Since this is done without an explicit memory
> > > barrier, the worst case is that the IPI sender sees IN_GUEST_MODE
> > > still and sends a spurious event, which is the behavior prior
> > > to this patch.
> > 
> > Beware that we had a king sized bug in regard to AVIC inhibition races
> > vs guest entries, this this should be carefully checked for this.
> 
> Thanks, Maxim - any pointers on what we should be looking for here?

I need to swap the whole thing in to be able to comment on this one.
I'll do this next week.

Overall the problem is that the target vCPU can inhibit its AVIC at any moment,
which forces the senders to use normal KVM_REQ_EVENT + vcpu kick to deliver
it the interrupt. It is very racy.

The only other pointer I can recall now is the code at 'svm_complete_interrupt_delivery'
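
For reference, the ordering that function relies on is roughly the following
(simplified excerpt; exact code varies by kernel version):

/* arch/x86/kvm/svm/svm.c */
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
				     int trig_mode, int vector)
{
	/*
	 * apicv_active must be read after vcpu->mode; pairs with the
	 * release store of vcpu->mode on the entry path.  This is the
	 * AVIC-inhibition vs. guest-entry race mentioned above.
	 */
	bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);

	if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
		/* AVIC inhibited: fall back to a request + kick. */
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		kvm_vcpu_kick(vcpu);
		return;
	}

	if (in_guest_mode)
		avic_ring_doorbell(vcpu);
	else
		kvm_vcpu_wake_up(vcpu);
}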

Best regards,
	Maxim Levitsky

> 
> > Also, do you have any perf numbers to see if that actually improves performance?
> > (I am just curious, I do think that this can improve performance).
> > 
> 
> Yes indeed! Sorry I should have put that right in the commit msg as a note,
> but using the kvm-unit-tests vmexit_ipi with -smp 20 on an Intel 8168 its
> roughly ~3% better (~3325-ish vs ~3400-ish), though the test is a bit noisy
> even with taskset to a single socket.
> 
> To help validate that we were even getting *any* benefit, in a local build
> I added a log statement (rough code below) to IPI delivery path, and did see 
> many, many IPIs getting suppressed that would have otherwise fired.
> 
> kvm_vcpu_trigger_posted_interrupt() {
> ...
>     if (vcpu->mode == EXITING_GUEST_MODE) {
>         pr_warn_ratelimited("exiting suppression worked");
>     }
> ...
> }
> 
> > Best regards,
> > 	Maxim Levitsky
> > 
> > 
> > > Signed-off-by: Jon Kohler <jon@nutanix.com>
> > > ---
> > > arch/x86/kvm/svm/svm.c |  7 +++++++
> > > arch/x86/kvm/vmx/vmx.c | 23 +++++++++++++++++++++++
> > > arch/x86/kvm/x86.c     |  8 ++++++++
> > > 3 files changed, 38 insertions(+)
> > > 
> > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > > index ce362e88a567..5f0c118a3ffd 100644
> > > --- a/arch/x86/kvm/svm/svm.c
> > > +++ b/arch/x86/kvm/svm/svm.c
> > > @@ -3907,6 +3907,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
> > > 	else
> > > 		__svm_vcpu_run(svm, spec_ctrl_intercepted);
> > > 
> > > +	/* Optimize IPI reduction by setting mode immediately after vmexit
> > > +	 * without a memmory barrier as this as not paired anywhere. vcpu->mode
> > > +	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
> > > +	 * barrier, after the host is done fully restoring various host states.
> > > +	 */
> > > +	vcpu->mode = EXITING_GUEST_MODE;
> > > +
> > > 	guest_state_exit_irqoff();
> > > }
> > > 
> > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > > index 63247c57c72c..243dcb87c727 100644
> > > --- a/arch/x86/kvm/vmx/vmx.c
> > > +++ b/arch/x86/kvm/vmx/vmx.c
> > > @@ -5878,6 +5878,17 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
> > > 
> > > 	if (!vmx->req_immediate_exit &&
> > > 	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
> > > +		/* Reset IN_GUEST_MODE since we're going to reenter
> > > +		 * guest as part of this fast path. This is done as
> > > +		 * an optimization without a memory barrier since
> > > +		 * EXITING_GUEST_MODE is also set without a memory
> > > +		 * barrier. This also needs to be reset prior to
> > > +		 * calling apic_timer_expired() so that
> > > +		 * kvm_use_posted_timer_interrupt() returns the proper
> > > +		 * value.
> > > +		 */
> > > +		if (vcpu->mode == EXITING_GUEST_MODE)
> > > +			vcpu->mode = IN_GUEST_MODE;
> > > 		kvm_lapic_expired_hv_timer(vcpu);
> > > 		return EXIT_FASTPATH_REENTER_GUEST;
> > > 	}
> > > @@ -7031,6 +7042,18 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
> > > void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
> > > 					unsigned int flags)
> > > {
> > > +	struct kvm_vcpu *vcpu = &vmx->vcpu;
> > > +
> > > +	/* Optimize IPI reduction by setting mode immediately after vmexit
> > > +	 * without a memmory barrier as this as not paired anywhere. vcpu->mode
> > > +	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
> > > +	 * barrier, after the host is done fully restoring various host states.
> > > +	 * Since the rdmsr and wrmsr below are expensive, this must be done
> > > +	 * first, so that the IPI suppression window covers the time dealing
> > > +	 * with fixing up SPEC_CTRL.
> > > +	 */
> > > +	vcpu->mode = EXITING_GUEST_MODE;
> > > +
> > > 	u64 hostval = this_cpu_read(x86_spec_ctrl_current);
> > > 
> > > 	if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index 2835bd796639..0e0d228f3fa5 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -2160,6 +2160,14 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
> > > 		data = kvm_read_edx_eax(vcpu);
> > > 		if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
> > > 			kvm_skip_emulated_instruction(vcpu);
> > > +			/* Reset IN_GUEST_MODE since we're going to reenter
> > > +			 * guest as part of this fast path. This is done as
> > > +			 * an optimization without a memory barrier since
> > > +			 * EXITING_GUEST_MODE is also set without a memory
> > > +			 * barrier.
> > > +			 */
> > > +			if (vcpu->mode == EXITING_GUEST_MODE)
> > > +				vcpu->mode = IN_GUEST_MODE;
> > > 			ret = EXIT_FASTPATH_REENTER_GUEST;
> > > 		}
> > > 		break;
Sean Christopherson Dec. 1, 2022, 7:17 p.m. UTC | #8
On Tue, Nov 29, 2022, Jon Kohler wrote:
> Set vcpu->mode to EXITING_GUEST_MODE as soon vCPU exits to reflect
> that we are indeed exiting guest mode, but not quite out of guest
> mode yet. Note: This is done lazily without an explicit memory
> barrier so that we do not regress the cost in the critical path
> of going from the exit to the exit handler.

This is not remotely sufficient justification.  Memory "barriers" are just compiler
barriers on x86, so the odds of regressing the VM-Enter/VM-Exit cost without
introducing a bug are miniscule.

> Flip back to IN_GUEST_MODE for exits that use
> EXIT_FASTPATH_REENTER_GUEST, such that we are IN_GUEST_MODE upon
> reentry.
> 
> Changing vcpu->mode away from IN_GUEST_MODE as early as possible

Except this isn't as early as possible.  If we're going to bother doing something
like this, my vote is to move it into assembly.

> gives IPI senders as much runway as possible to avoid ringing
> doorbell or sending posted interrupt IPI in AMD and Intel,
> respectively. Since this is done without an explicit memory
> barrier, the worst case is that the IPI sender sees IN_GUEST_MODE
> still and sends a spurious event, which is the behavior prior
> to this patch.

No, worst case scenario is that kvm_vcpu_exiting_guest_mode() sees EXITING_GUEST_MODE
and doesn't kick the vCPU.  For "kicks" that set a request, kvm_vcpu_exit_request()
will punt the vCPU out of the tight run loop, though there might be ordering issues.

But whether or not there are ordering issues is a moot point since there are uses
of kvm_vcpu_kick() that aren't accompanied by a request, e.g. to purge the PML
buffers.  In other words, kvm_vcpu_kick() absolutely cannot have false negatives.
We could modify KVM to require a request when using kvm_vcpu_kick(), but that's
a bit of a hack, and all of the possible ordering problems is still a pile of
complexity I'd rather avoid.

No small part of me thinks we'd be better off adding a dedicated flag to very
precisely track whether or not the vCPU is truly "in the guest" for the purposes
of sending IPIs.  Things like kicks have different requirements around IN_GUEST_MODE
than sending interrupts, e.g. KVM manually processes the IRR on every VM-Enter and
so lack of an IPI is a non-issue, whereas missing an IPI for a kick is problematic.
In other words, EXITING_GUEST_MODE really needs to mean "exiting the run loop".

E.g. toggle the dedicated flag within a few instructions of VM-Enter and
VM-Exit for maximum efficiency for interrupts, and avoid having to make vcpu->mode
more complex than it already is.

To add clarity, we could even rename IN_GUEST_MODE and EXITING_GUEST_MODE to
something like IN_RUN_LOOP and EXITING_RUN_LOOP.

> Signed-off-by: Jon Kohler <jon@nutanix.com>
> ---
>  arch/x86/kvm/svm/svm.c |  7 +++++++
>  arch/x86/kvm/vmx/vmx.c | 23 +++++++++++++++++++++++
>  arch/x86/kvm/x86.c     |  8 ++++++++
>  3 files changed, 38 insertions(+)
> 
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index ce362e88a567..5f0c118a3ffd 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -3907,6 +3907,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
>  	else
>  		__svm_vcpu_run(svm, spec_ctrl_intercepted);
>  
> +	/* Optimize IPI reduction by setting mode immediately after vmexit

	/*
 	 * Because KVM isn't the crazy land of net/, block comments should look like
	 * this. 
	 */

> +	 * without a memmory barrier as this as not paired anywhere.
> +	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
> +	 * barrier, after the host is done fully restoring various host states.
> +	 */
> +	vcpu->mode = EXITING_GUEST_MODE;
> +
>  	guest_state_exit_irqoff();
>  }
>  
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 63247c57c72c..243dcb87c727 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5878,6 +5878,17 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
>  
>  	if (!vmx->req_immediate_exit &&
>  	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
> +		/* Reset IN_GUEST_MODE since we're going to reenter
> +		 * guest as part of this fast path. This is done as
> +		 * an optimization without a memory barrier since
> +		 * EXITING_GUEST_MODE is also set without a memory

Heh, justifying the lack of a memory barrier by pointing out that the other
code you added doesn't use a memory barrier is interesting, to put it politely.

> +		 * barrier. This also needs to be reset prior to
> +		 * calling apic_timer_expired() so that
> +		 * kvm_use_posted_timer_interrupt() returns the proper
> +		 * value.
> +		 */
> +		if (vcpu->mode == EXITING_GUEST_MODE)
> +			vcpu->mode = IN_GUEST_MODE;

It's far easier, likely more performant, documents why this has a chance of working,
and significantly less error prone to do this unconditionally in either assembly
or after the EXIT_FASTPATH_REENTER_GUEST check in vcpu_enter_guest().
Jon Kohler Dec. 5, 2022, 3:09 p.m. UTC | #9
> On Dec 1, 2022, at 2:17 PM, Sean Christopherson <seanjc@google.com> wrote:
> 
> On Tue, Nov 29, 2022, Jon Kohler wrote:
>> Set vcpu->mode to EXITING_GUEST_MODE as soon vCPU exits to reflect
>> that we are indeed exiting guest mode, but not quite out of guest
>> mode yet. Note: This is done lazily without an explicit memory
>> barrier so that we do not regress the cost in the critical path
>> of going from the exit to the exit handler.
> 
> This is not remotely sufficient justification.  Memory "barriers" are just compiler
> barriers on x86, so the odds of regressing the VM-Enter/VM-Exit cost without
> introducing a bug are miniscule.

I was talking about hardware memory barriers, not compiler memory barriers. I’ll
tune that up and take a different approach on v2.

> 
>> Flip back to IN_GUEST_MODE for exits that use
>> EXIT_FASTPATH_REENTER_GUEST, such that we are IN_GUEST_MODE upon
>> reentry.
>> 
>> Changing vcpu->mode away from IN_GUEST_MODE as early as possible
> 
> Except this isn't as early as possible.  If we're going to bother doing something
> like this, my vote is to move it into assembly.

In vmenter.S, tacking on to the call to vmx_spec_ctrl_restore_host seemed like
the most logical place after handling all of the state saves and RSB work. Are you
saying put it even closer to the exit, meaning before the FILL_RETURN_BUFFER?

> 
>> gives IPI senders as much runway as possible to avoid ringing
>> doorbell or sending posted interrupt IPI in AMD and Intel,
>> respectively. Since this is done without an explicit memory
>> barrier, the worst case is that the IPI sender sees IN_GUEST_MODE
>> still and sends a spurious event, which is the behavior prior
>> to this patch.
> 
> No, worst case scenario is that kvm_vcpu_exiting_guest_mode() sees EXITING_GUEST_MODE
> and doesn't kick the vCPU.  For "kicks" that set a request, kvm_vcpu_exit_request()
> will punt the vCPU out of the tight run loop, though there might be ordering issues.
> 
> But whether or not there are ordering issues is a moot point since there are uses
> of kvm_vcpu_kick() that aren't accompanied by a request, e.g. to purge the PML
> buffers.  In other words, kvm_vcpu_kick() absolutely cannot have false negatives.
> We could modify KVM to require a request when using kvm_vcpu_kick(), but that's
> a bit of a hack, and all of the possible ordering problems is still a pile of
> complexity I'd rather avoid.
> 
> No small part of me thinks we'd be better off adding a dedicated flag to very
> precisely track whether or not the vCPU is truly "in the guest" for the purposes
> of sending IPIs.  Things like kicks have different requirements around IN_GUEST_MODE
> than sending interrupts, e.g. KVM manually processes the IRR on every VM-Enter and
> so lack of an IPI is a non-issue, whereas missing an IPI for a kick is problematic.
> In other words, EXITING_GUEST_MODE really needs to mean "existing the run loop".

Do you mean:
“one small part” (as in give this a shot, maybe), 
or 
“no small part” (as in good-god-don’t-do-this!)

I’m assuming you meant one small part :) sure, how about something like:

To my earlier comment about when to do this within a few instructions, I don’t want
to clobber other stuff happening as part of the enter/exit; what if we repurposed/renamed
vmx_update_host_rsp and vmx_spec_ctrl_restore_host to make them “do stuff before
entry” and “do stuff right after entry returns” functions? That way we wouldn’t have to
add any other function calls or change the existing control flow all that much.

In “do before” we could set something like vcpu->non_root_mode = true
In “do after” we could set vcpu->non_root_mode = false

Or perhaps something like (respectively)
vcpu->operational_state = NON_ROOT_MODE
vcpu->operational_state = ROOT_MODE

Using the root/non-root moniker would precisely track whether the
vCPU is in the guest, and aligns with the language used to describe such a state
in the SDM.
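
As a rough illustration of that idea (the names and fields below are
placeholders, not existing KVM code):

/* Illustrative sketch only. */
enum kvm_vcpu_hw_state {
	ROOT_MODE = 0,
	NON_ROOT_MODE,
};

/* "do stuff right before entry" */
static __always_inline void kvm_vcpu_set_nonroot(struct kvm_vcpu *vcpu)
{
	WRITE_ONCE(vcpu->operational_state, NON_ROOT_MODE);
}

/* "do stuff right after exit" */
static __always_inline void kvm_vcpu_set_root(struct kvm_vcpu *vcpu)
{
	WRITE_ONCE(vcpu->operational_state, ROOT_MODE);
}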

Thoughts?


> 
> E.g. toggle the dedicated flag within a few instructions of VM-Enter and
> VM-Exit for maximum efficiency for interrupts, and avoid having to make vcpu->mode
> more complex than it already is.
> 
> To add clarity, we could even rename IN_GUEST_MODE and EXITING_GUEST_MODE to
> something like IN_RUN_LOOP and EXITING_RUN_LOOP.

Yea, this feels like a good idea to reduce confusion. Let’s nail this thread down, then I 
can propose a wider cleanup?

> 
>> Signed-off-by: Jon Kohler <jon@nutanix.com>
>> ---
>> arch/x86/kvm/svm/svm.c |  7 +++++++
>> arch/x86/kvm/vmx/vmx.c | 23 +++++++++++++++++++++++
>> arch/x86/kvm/x86.c     |  8 ++++++++
>> 3 files changed, 38 insertions(+)
>> 
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index ce362e88a567..5f0c118a3ffd 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -3907,6 +3907,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
>> 	else
>> 		__svm_vcpu_run(svm, spec_ctrl_intercepted);
>> 
>> +	/* Optimize IPI reduction by setting mode immediately after vmexit
> 
> 	/*
> 	 * Because KVM isn't the crazy land of net/ block comments should like
> 	 * this. 
> 	 */

Ack, will change in v2

> 
>> +	 * without a memmory barrier as this as not paired anywhere.
>> +	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
>> +	 * barrier, after the host is done fully restoring various host states.
>> +	 */
>> +	vcpu->mode = EXITING_GUEST_MODE;
>> +
>> 	guest_state_exit_irqoff();
>> }
>> 
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 63247c57c72c..243dcb87c727 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -5878,6 +5878,17 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
>> 
>> 	if (!vmx->req_immediate_exit &&
>> 	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
>> +		/* Reset IN_GUEST_MODE since we're going to reenter
>> +		 * guest as part of this fast path. This is done as
>> +		 * an optimization without a memory barrier since
>> +		 * EXITING_GUEST_MODE is also set without a memory
> 
> Heh, justifying the lack of a memory barrier by saying pointing out the other
> code you added doesn't use a memory barrier is interesting, to put it politely.

Message received! Will mop it up.

> 
>> +		 * barrier. This also needs to be reset prior to
>> +		 * calling apic_timer_expired() so that
>> +		 * kvm_use_posted_timer_interrupt() returns the proper
>> +		 * value.
>> +		 */
>> +		if (vcpu->mode == EXITING_GUEST_MODE)
>> +			vcpu->mode = IN_GUEST_MODE;
> 
> It's far easier, likely more performant, documents why this has a chance of working,
> and significantly less error prone to do this unconditionally in either assembly
> or after the EXIT_FASTPATH_REENTER_GUEST check in vcpu_enter_guest().

Ack, will do that there.
Sean Christopherson Dec. 7, 2022, 8:18 p.m. UTC | #10
On Mon, Dec 05, 2022, Jon Kohler wrote:
> 
> > On Dec 1, 2022, at 2:17 PM, Sean Christopherson <seanjc@google.com> wrote:
> >> Changing vcpu->mode away from IN_GUEST_MODE as early as possible
> > 
> > Except this isn't as early as possible.  If we're going to bother doing something
> > like this, my vote is to move it into assembly.
> 
> In vmenter.S, tacking on to call vmx_spec_ctrl_restore_host seemed like the
> most logical place after handling all of the state saves and RSB work. Are you
> saying put it even closer to the exit, meaning before the FILL_RETURN_BUFFER?

Yes, assuming that's safe, in which case it's truly the "as early as possible"
location.

> >> gives IPI senders as much runway as possible to avoid ringing
> >> doorbell or sending posted interrupt IPI in AMD and Intel,
> >> respectively. Since this is done without an explicit memory
> >> barrier, the worst case is that the IPI sender sees IN_GUEST_MODE
> >> still and sends a spurious event, which is the behavior prior
> >> to this patch.
> > 
> > No, worst case scenario is that kvm_vcpu_exiting_guest_mode() sees EXITING_GUEST_MODE
> > and doesn't kick the vCPU.  For "kicks" that set a request, kvm_vcpu_exit_request()
> > will punt the vCPU out of the tight run loop, though there might be ordering issues.
> > 
> > But whether or not there are ordering issues is a moot point since there are uses
> > of kvm_vcpu_kick() that aren't accompanied by a request, e.g. to purge the PML
> > buffers.  In other words, kvm_vcpu_kick() absolutely cannot have false negatives.
> > We could modify KVM to require a request when using kvm_vcpu_kick(), but that's
> > a bit of a hack, and all of the possible ordering problems is still a pile of
> > complexity I'd rather avoid.
> > 
> > No small part of me thinks we'd be better off adding a dedicated flag to very
> > precisely track whether or not the vCPU is truly "in the guest" for the purposes
> > of sending IPIs.  Things like kicks have different requirements around IN_GUEST_MODE
> > than sending interrupts, e.g. KVM manually processes the IRR on every VM-Enter and
> > so lack of an IPI is a non-issue, whereas missing an IPI for a kick is problematic.
> > In other words, EXITING_GUEST_MODE really needs to mean "existing the run loop".
> 
> Do you mean:
> “one small part” (as in give this a shot, maybe), 
> or 
> “no small part” (as in good-god-don’t-do-this!)

Neither.  "No small part" as in "Most of my brain", i.e. "I haven't completely
thought things through, but I think we'd be better off adding a dedicated flag".

> I’m assuming you meant one small part :) sure, how about something like:
> 
> To my earlier comment about when to do this within a few instructions, I don’t want
> to clobber other stuff happening as part of the enter/exit, what if we repurposed/renamed
> vmx_update_host_rsp and vmx_spec_ctrl_restore_host to make them “do stuff before
> entry” and “do stuff right after entry returns” functions. That way we wouldn’t have to
> add another other function calls or change the existing control flow all that much.

I'd prefer not to wrap vmx_update_host_rsp(), that thing is a very special
snowflake.

I don't see why we'd have to add function calls or change the existing control
flow anyways.  The asm flows for VMX and SVM both take the vCPU in the form of
@vmx and @svm, so accessing the proposed excution mode field is just a matter of
adding an entry in arch/x86/kvm/kvm-asm-offsets.c.
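
E.g. something like (placeholder names, and assuming a new dedicated field
rather than reusing vcpu->mode):

	/* arch/x86/kvm/kvm-asm-offsets.c */
	OFFSET(VCPU_EXECUTION_MODE, vcpu_vmx, vcpu.execution_mode);

with a similar entry for vcpu_svm on the SVM side.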

And now that kvm-asm-offsets.c exists, I think it makes sense to drop the @regs
parameter for __vmx_vcpu_run(), e.g. to roughly match __svm_vcpu_run().

With that done as prep, accessing the vCPU immediately before/after VM-Enter and
VM-Exit is easy.

As a rough, incomplete sketch for VMX:

diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 766c6b3ef5ed..f80553e34f26 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -102,7 +102,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
         * an LFENCE to stop speculation from skipping the wrmsr.
         */
 
-       /* Load @regs to RAX. */
+       /* Load @vmx to RAX. */
        mov (%_ASM_SP), %_ASM_AX
 
        /* Check if vmlaunch or vmresume is needed */
@@ -125,7 +125,11 @@ SYM_FUNC_START(__vmx_vcpu_run)
        mov VCPU_R14(%_ASM_AX), %r14
        mov VCPU_R15(%_ASM_AX), %r15
 #endif
-       /* Load guest RAX.  This kills the @regs pointer! */
+
+       /* Mark the vCPU as executing in the guest! */
+       movb $IN_GUEST_MODE, VCPU_EXECUTION_MODE(%_ASM_AX)
+
+       /* Load guest RAX.  This kills the @vmx pointer! */
        mov VCPU_RAX(%_ASM_AX), %_ASM_AX
 
        /* Check EFLAGS.ZF from 'testb' above */
@@ -163,9 +167,11 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
        /* Temporarily save guest's RAX. */
        push %_ASM_AX
 
-       /* Reload @regs to RAX. */
+       /* Reload @vmx to RAX. */
        mov WORD_SIZE(%_ASM_SP), %_ASM_AX
 
+       movb $IN_HOST_MODE, VCPU_EXECUTION_MODE(%_ASM_AX)
+
        /* Save all guest registers, including RAX from the stack */
        pop           VCPU_RAX(%_ASM_AX)
        mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
@@ -189,9 +195,12 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
        xor %ebx, %ebx
 
 .Lclear_regs:
-       /* Discard @regs.  The register is irrelevant, it just can't be RBX. */
+       /* Pop @vmx.  The register can be anything except RBX. */
        pop %_ASM_AX
 
+       /* Set the execution mode (again) in case of VM-Fail. */
+       movb $IN_HOST_MODE, VCPU_EXECUTION_MODE(%_ASM_AX)
+
        /*
         * Clear all general purpose registers except RSP and RBX to prevent
         * speculative use of the guest's values, even those that are reloaded


> In “do before” we could set something like vcpu->non_root_mode = true
> In “do after” we could set vcpu->non_root_mode = false
> 
> Or perhaps something like (respectively)
> vcpu->operational_state = NON_ROOT_MODE
> vcpu->operational_state = ROOT_MODE
>
> Using the root/non-root moniker would precisely track when the whether the 
> vCPU is in guest, and aligns with the language used to describe such a state
> from the SDM.
> 
> Thoughts?

No, we should use KVM-defined names, not VMX-defined names, because we'll want
the same logic for SVM.  If we first rename GUEST_MODE => RUN_LOOP, then we can
use IN_GUEST_MODE and IN_HOST_MODE or whatever.
Paolo Bonzini Dec. 9, 2022, 8:06 a.m. UTC | #11
On 12/7/22 21:18, Sean Christopherson wrote:
> 
> And now that kvm-asm-offsets.c exists, I think it makes sense to drop the @regs
> parameter for __vmx_vcpu_run(), e.g. to roughly match __svm_vcpu_run().

Yeah, that I had already here 
(https://lore.kernel.org/kvm/20221028230723.3254250-1-pbonzini@redhat.com/T/#me77cdd848fa86cdce03f59447ddb4f2cba45f79c) 
but then I dropped it when it became clear the SVM series was targeting 
6.1.  I planned to resubmit for 6.3 after the holidays.

Paolo

Patch

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ce362e88a567..5f0c118a3ffd 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3907,6 +3907,13 @@  static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
 	else
 		__svm_vcpu_run(svm, spec_ctrl_intercepted);
 
+	/* Optimize IPI reduction by setting mode immediately after vmexit
+	 * without a memory barrier as this is not paired anywhere. vcpu->mode
+	 * will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
+	 * barrier, after the host is done fully restoring various host states.
+	 */
+	vcpu->mode = EXITING_GUEST_MODE;
+
 	guest_state_exit_irqoff();
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 63247c57c72c..243dcb87c727 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5878,6 +5878,17 @@  static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
 
 	if (!vmx->req_immediate_exit &&
 	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
+		/* Reset IN_GUEST_MODE since we're going to reenter
+		 * guest as part of this fast path. This is done as
+		 * an optimization without a memory barrier since
+		 * EXITING_GUEST_MODE is also set without a memory
+		 * barrier. This also needs to be reset prior to
+		 * calling apic_timer_expired() so that
+		 * kvm_use_posted_timer_interrupt() returns the proper
+		 * value.
+		 */
+		if (vcpu->mode == EXITING_GUEST_MODE)
+			vcpu->mode = IN_GUEST_MODE;
 		kvm_lapic_expired_hv_timer(vcpu);
 		return EXIT_FASTPATH_REENTER_GUEST;
 	}
@@ -7031,6 +7042,18 @@  void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
 					unsigned int flags)
 {
+	struct kvm_vcpu *vcpu = &vmx->vcpu;
+
+	/* Optimize IPI reduction by setting mode immediately after vmexit
+	 * without a memmory barrier as this as not paired anywhere. vcpu->mode
+	 * is will be set to OUTSIDE_GUEST_MODE in x86 common code with a memory
+	 * barrier, after the host is done fully restoring various host states.
+	 * Since the rdmsr and wrmsr below are expensive, this must be done
+	 * first, so that the IPI suppression window covers the time dealing
+	 * with fixing up SPEC_CTRL.
+	 */
+	vcpu->mode = EXITING_GUEST_MODE;
+
 	u64 hostval = this_cpu_read(x86_spec_ctrl_current);
 
 	if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2835bd796639..0e0d228f3fa5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2160,6 +2160,14 @@  fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
 		data = kvm_read_edx_eax(vcpu);
 		if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
 			kvm_skip_emulated_instruction(vcpu);
+			/* Reset IN_GUEST_MODE since we're going to reenter
+			 * guest as part of this fast path. This is done as
+			 * an optimization without a memory barrier since
+			 * EXITING_GUEST_MODE is also set without a memory
+			 * barrier.
+			 */
+			if (vcpu->mode == EXITING_GUEST_MODE)
+				vcpu->mode = IN_GUEST_MODE;
 			ret = EXIT_FASTPATH_REENTER_GUEST;
 		}
 		break;