
[v2] KVM: VMX: Enable Notify VM exit

Message ID 20210525051204.1480610-1-tao3.xu@intel.com (mailing list archive)
State New, archived
Series [v2] KVM: VMX: Enable Notify VM exit

Commit Message

Tao Xu May 25, 2021, 5:12 a.m. UTC
There are cases where a malicious virtual machine can cause the CPU to get
stuck (event windows never open up), e.g., an infinite loop in microcode
on a nested #AC (CVE-2015-5307). No open event window means no events:
NMIs, SMIs, and IRQs are all blocked, which can leave the affected
hardware CPU unusable by the host or by other VMs.

To handle such cases, the CPU can be configured to deliver a notify VM
exit if no event window occurs in VMX non-root mode for a specified
amount of time (the notify window). Once the CPU first observes the risk
of no forward progress, a notify VM exit is generated after the notify
window, measured in crystal clock cycles, elapses. A notify VM exit can
happen incident to delivery of a vectored event.

Expose a module parameter for configuring the notify window, in units of
crystal clock cycles.
- A negative value (e.g. -1) disables the feature.
- The default is 0. This is safe because the hardware adds an internal
threshold to the notify window to ensure all normal instructions are
covered.
- Users can set it to a larger value when they want to allow more cycles,
e.g., if the silicon wrongly kills a normal instruction because the
internal threshold is too small.

Notify VM exit is defined in the latest Intel Architecture Instruction Set
Extensions Programming Reference, chapter 9.2.

Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Tao Xu <tao3.xu@intel.com>
---

Changelog:
v2:
     Set the default notify window to 0; a value less than 0 disables it.
     Add more description to the commit message.
---
 arch/x86/include/asm/vmx.h         |  7 +++++
 arch/x86/include/asm/vmxfeatures.h |  1 +
 arch/x86/include/uapi/asm/vmx.h    |  4 ++-
 arch/x86/kvm/vmx/capabilities.h    |  6 +++++
 arch/x86/kvm/vmx/vmx.c             | 42 ++++++++++++++++++++++++++++--
 include/uapi/linux/kvm.h           |  2 ++
 6 files changed, 59 insertions(+), 3 deletions(-)
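
As a rough sketch of how userspace might consume the new exit: the
KVM_INTERNAL_ERROR_NO_EVENT_WINDOW suberror and the exit-qualification
payload in data[0] come from this patch, while the reporting policy below
is only an illustrative assumption.

/*
 * Minimal sketch of userspace handling for the new suberror.  The value of
 * KVM_INTERNAL_ERROR_NO_EVENT_WINDOW is the one added by this patch; the
 * policy (log and stop the vCPU) is a hypothetical example.
 */
#include <linux/kvm.h>
#include <stdio.h>

#ifndef KVM_INTERNAL_ERROR_NO_EVENT_WINDOW
#define KVM_INTERNAL_ERROR_NO_EVENT_WINDOW 5	/* added by this patch */
#endif

/* Returns 0 to keep running the vCPU, -1 to stop it. */
static int handle_internal_error(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_INTERNAL_ERROR)
		return 0;	/* other exit reasons handled elsewhere */

	if (run->internal.suberror == KVM_INTERNAL_ERROR_NO_EVENT_WINDOW) {
		/* data[0] carries the notify VM-exit qualification. */
		fprintf(stderr, "no event window, VM context invalid (qual=0x%llx)\n",
			(unsigned long long)run->internal.data[0]);
		return -1;
	}

	return -1;	/* treat other internal errors as fatal as well */
}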

Comments

Vitaly Kuznetsov June 2, 2021, 10:31 a.m. UTC | #1
Tao Xu <tao3.xu@intel.com> writes:

> There are some cases that malicious virtual machines can cause CPU stuck
> (event windows don't open up), e.g., infinite loop in microcode when
> nested #AC (CVE-2015-5307). No event window obviously means no events,
> e.g. NMIs, SMIs, and IRQs will all be blocked, may cause the related
> hardware CPU can't be used by host or other VM.
>
> To resolve those cases, it can enable a notify VM exit if no event
> window occur in VMX non-root mode for a specified amount of time
> (notify window). Since CPU is first observed the risk of not causing
> forward progress, after notify window time in a units of crystal clock,
> Notify VM exit will happen. Notify VM exit can happen incident to delivery
> of a vectored event.
>
> Expose a module param for configuring notify window, which is in unit of
> crystal clock cycle.
> - A negative value (e.g. -1) is to disable this feature.
> - Make the default as 0. It is safe because an internal threshold is added
> to notify window to ensure all the normal instructions being coverd.
> - User can set it to a large value when they want to give more cycles to
> wait for some reasons, e.g., silicon wrongly kill some normal instruction
> due to internal threshold is too small.
>
> Notify VM exit is defined in latest Intel Architecture Instruction Set
> Extensions Programming Reference, chapter 9.2.
>
> Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Tao Xu <tao3.xu@intel.com>
> ---
>
> Changelog:
> v2:
>      Default set notify window to 0, less than 0 to disable.
>      Add more description in commit message.

Sorry if this was already discussed, but in case of nested
virtualization and when L1 also enables
SECONDARY_EXEC_NOTIFY_VM_EXITING, shouldn't we just reflect NOTIFY exits
during L2 execution to L1 instead of crashing the whole L1?

> ---
>  arch/x86/include/asm/vmx.h         |  7 +++++
>  arch/x86/include/asm/vmxfeatures.h |  1 +
>  arch/x86/include/uapi/asm/vmx.h    |  4 ++-
>  arch/x86/kvm/vmx/capabilities.h    |  6 +++++
>  arch/x86/kvm/vmx/vmx.c             | 42 ++++++++++++++++++++++++++++--
>  include/uapi/linux/kvm.h           |  2 ++
>  6 files changed, 59 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 0ffaa3156a4e..9104c85a973f 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -74,6 +74,7 @@
>  #define SECONDARY_EXEC_TSC_SCALING              VMCS_CONTROL_BIT(TSC_SCALING)
>  #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE	VMCS_CONTROL_BIT(USR_WAIT_PAUSE)
>  #define SECONDARY_EXEC_BUS_LOCK_DETECTION	VMCS_CONTROL_BIT(BUS_LOCK_DETECTION)
> +#define SECONDARY_EXEC_NOTIFY_VM_EXITING	VMCS_CONTROL_BIT(NOTIFY_VM_EXITING)
>  
>  #define PIN_BASED_EXT_INTR_MASK                 VMCS_CONTROL_BIT(INTR_EXITING)
>  #define PIN_BASED_NMI_EXITING                   VMCS_CONTROL_BIT(NMI_EXITING)
> @@ -269,6 +270,7 @@ enum vmcs_field {
>  	SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
>  	PLE_GAP                         = 0x00004020,
>  	PLE_WINDOW                      = 0x00004022,
> +	NOTIFY_WINDOW                   = 0x00004024,
>  	VM_INSTRUCTION_ERROR            = 0x00004400,
>  	VM_EXIT_REASON                  = 0x00004402,
>  	VM_EXIT_INTR_INFO               = 0x00004404,
> @@ -555,6 +557,11 @@ enum vm_entry_failure_code {
>  #define EPT_VIOLATION_EXECUTABLE	(1 << EPT_VIOLATION_EXECUTABLE_BIT)
>  #define EPT_VIOLATION_GVA_TRANSLATED	(1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
>  
> +/*
> + * Exit Qualifications for NOTIFY VM EXIT
> + */
> +#define NOTIFY_VM_CONTEXT_INVALID     BIT(0)
> +
>  /*
>   * VM-instruction error numbers
>   */
> diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
> index d9a74681a77d..15f0f2ab4f95 100644
> --- a/arch/x86/include/asm/vmxfeatures.h
> +++ b/arch/x86/include/asm/vmxfeatures.h
> @@ -84,5 +84,6 @@
>  #define VMX_FEATURE_USR_WAIT_PAUSE	( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */
>  #define VMX_FEATURE_ENCLV_EXITING	( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */
>  #define VMX_FEATURE_BUS_LOCK_DETECTION	( 2*32+ 30) /* "" VM-Exit when bus lock caused */
> +#define VMX_FEATURE_NOTIFY_VM_EXITING	( 2*32+ 31) /* VM-Exit when no event windows after notify window */
>  
>  #endif /* _ASM_X86_VMXFEATURES_H */
> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
> index 946d761adbd3..ef4c80f6553e 100644
> --- a/arch/x86/include/uapi/asm/vmx.h
> +++ b/arch/x86/include/uapi/asm/vmx.h
> @@ -91,6 +91,7 @@
>  #define EXIT_REASON_UMWAIT              67
>  #define EXIT_REASON_TPAUSE              68
>  #define EXIT_REASON_BUS_LOCK            74
> +#define EXIT_REASON_NOTIFY              75
>  
>  #define VMX_EXIT_REASONS \
>  	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
> @@ -153,7 +154,8 @@
>  	{ EXIT_REASON_XRSTORS,               "XRSTORS" }, \
>  	{ EXIT_REASON_UMWAIT,                "UMWAIT" }, \
>  	{ EXIT_REASON_TPAUSE,                "TPAUSE" }, \
> -	{ EXIT_REASON_BUS_LOCK,              "BUS_LOCK" }
> +	{ EXIT_REASON_BUS_LOCK,              "BUS_LOCK" }, \
> +	{ EXIT_REASON_NOTIFY,                "NOTIFY"}
>  
>  #define VMX_EXIT_REASON_FLAGS \
>  	{ VMX_EXIT_REASONS_FAILED_VMENTRY,	"FAILED_VMENTRY" }
> diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
> index 8dee8a5fbc17..8527f34a84ac 100644
> --- a/arch/x86/kvm/vmx/capabilities.h
> +++ b/arch/x86/kvm/vmx/capabilities.h
> @@ -407,4 +407,10 @@ static inline u64 vmx_supported_debugctl(void)
>  	return debugctl;
>  }
>  
> +static inline bool cpu_has_notify_vm_exiting(void)
> +{
> +	return vmcs_config.cpu_based_2nd_exec_ctrl &
> +		SECONDARY_EXEC_NOTIFY_VM_EXITING;
> +}
> +
>  #endif /* __KVM_X86_VMX_CAPS_H */
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 4bceb5ca3a89..c0ad01c88dac 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -205,6 +205,10 @@ module_param(ple_window_max, uint, 0444);
>  int __read_mostly pt_mode = PT_MODE_SYSTEM;
>  module_param(pt_mode, int, S_IRUGO);
>  
> +/* Default is 0, less than 0 (for example, -1) disables notify window. */
> +static int __read_mostly notify_window;
> +module_param(notify_window, int, 0644);
> +
>  static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
>  static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
>  static DEFINE_MUTEX(vmx_l1d_flush_mutex);
> @@ -2539,7 +2543,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  			SECONDARY_EXEC_PT_USE_GPA |
>  			SECONDARY_EXEC_PT_CONCEAL_VMX |
>  			SECONDARY_EXEC_ENABLE_VMFUNC |
> -			SECONDARY_EXEC_BUS_LOCK_DETECTION;
> +			SECONDARY_EXEC_BUS_LOCK_DETECTION |
> +			SECONDARY_EXEC_NOTIFY_VM_EXITING;
>  		if (cpu_has_sgx())
>  			opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
>  		if (adjust_vmx_controls(min2, opt2,
> @@ -4376,6 +4381,9 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
>  	if (!vcpu->kvm->arch.bus_lock_detection_enabled)
>  		exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
>  
> +	if (cpu_has_notify_vm_exiting() && notify_window < 0)
> +		exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
> +
>  	vmx->secondary_exec_control = exec_control;
>  }
>  
> @@ -4423,6 +4431,9 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>  		vmx->ple_window_dirty = true;
>  	}
>  
> +	if (cpu_has_notify_vm_exiting() && notify_window >= 0)
> +		vmcs_write32(NOTIFY_WINDOW, notify_window);
> +
>  	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
>  	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
>  	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
> @@ -5642,6 +5653,31 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
>  	return 0;
>  }
>  
> +static int handle_notify(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
> +
> +	if (!(exit_qual & NOTIFY_VM_CONTEXT_INVALID)) {
> +		/*
> +		 * Notify VM exit happened while executing iret from NMI,
> +		 * "blocked by NMI" bit has to be set before next VM entry.
> +		 */
> +		if (enable_vnmi &&
> +		    (exit_qual & INTR_INFO_UNBLOCK_NMI))
> +			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
> +				      GUEST_INTR_STATE_NMI);
> +
> +		return 1;
> +	}
> +
> +	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> +	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_NO_EVENT_WINDOW;
> +	vcpu->run->internal.ndata = 1;
> +	vcpu->run->internal.data[0] = exit_qual;
> +
> +	return 0;
> +}
> +
>  /*
>   * The exit handlers return 1 if the exit was handled fully and guest execution
>   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
> @@ -5699,6 +5735,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
>  	[EXIT_REASON_ENCLS]		      = handle_encls,
>  	[EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
> +	[EXIT_REASON_NOTIFY]		      = handle_notify,
>  };
>  
>  static const int kvm_vmx_max_exit_handlers =
> @@ -6042,7 +6079,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
>  	     exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
>  	     exit_reason.basic != EXIT_REASON_PML_FULL &&
>  	     exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
> -	     exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
> +	     exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
> +	     exit_reason.basic != EXIT_REASON_NOTIFY)) {
>  		int ndata = 3;
>  
>  		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 3fd9a7e9d90c..bb3b49b1fb0d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -278,6 +278,8 @@ struct kvm_xen_exit {
>  #define KVM_INTERNAL_ERROR_DELIVERY_EV	3
>  /* Encounter unexpected vm-exit reason */
>  #define KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON	4
> +/* Encounter notify vm-exit */
> +#define KVM_INTERNAL_ERROR_NO_EVENT_WINDOW   5
>  
>  /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
>  struct kvm_run {
Tao Xu June 3, 2021, 1:23 a.m. UTC | #2
On 6/2/21 6:31 PM, Vitaly Kuznetsov wrote:
> Tao Xu <tao3.xu@intel.com> writes:
> 
>> There are some cases that malicious virtual machines can cause CPU stuck
>> (event windows don't open up), e.g., infinite loop in microcode when
>> nested #AC (CVE-2015-5307). No event window obviously means no events,
>> e.g. NMIs, SMIs, and IRQs will all be blocked, may cause the related
>> hardware CPU can't be used by host or other VM.
>>
>> To resolve those cases, it can enable a notify VM exit if no event
>> window occur in VMX non-root mode for a specified amount of time
>> (notify window). Since CPU is first observed the risk of not causing
>> forward progress, after notify window time in a units of crystal clock,
>> Notify VM exit will happen. Notify VM exit can happen incident to delivery
>> of a vectored event.
>>
>> Expose a module param for configuring notify window, which is in unit of
>> crystal clock cycle.
>> - A negative value (e.g. -1) is to disable this feature.
>> - Make the default as 0. It is safe because an internal threshold is added
>> to notify window to ensure all the normal instructions being coverd.
>> - User can set it to a large value when they want to give more cycles to
>> wait for some reasons, e.g., silicon wrongly kill some normal instruction
>> due to internal threshold is too small.
>>
>> Notify VM exit is defined in latest Intel Architecture Instruction Set
>> Extensions Programming Reference, chapter 9.2.
>>
>> Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>> Signed-off-by: Tao Xu <tao3.xu@intel.com>
>> ---
>>
>> Changelog:
>> v2:
>>       Default set notify window to 0, less than 0 to disable.
>>       Add more description in commit message.
> 
> Sorry if this was already discussed, but in case of nested
> virtualization and when L1 also enables
> SECONDARY_EXEC_NOTIFY_VM_EXITING, shouldn't we just reflect NOTIFY exits
> during L2 execution to L1 instead of crashing the whole L1?
> 
A Notify VM exit will not crash the L1 guest if the VM context is valid in
the exit qualification. After the VM exit, the VMM can resume the guest
normally.
Xiaoyao Li June 3, 2021, 1:25 a.m. UTC | #3
On 6/2/2021 6:31 PM, Vitaly Kuznetsov wrote:
> Tao Xu <tao3.xu@intel.com> writes:
> 
>> There are some cases that malicious virtual machines can cause CPU stuck
>> (event windows don't open up), e.g., infinite loop in microcode when
>> nested #AC (CVE-2015-5307). No event window obviously means no events,
>> e.g. NMIs, SMIs, and IRQs will all be blocked, may cause the related
>> hardware CPU can't be used by host or other VM.
>>
>> To resolve those cases, it can enable a notify VM exit if no event
>> window occur in VMX non-root mode for a specified amount of time
>> (notify window). Since CPU is first observed the risk of not causing
>> forward progress, after notify window time in a units of crystal clock,
>> Notify VM exit will happen. Notify VM exit can happen incident to delivery
>> of a vectored event.
>>
>> Expose a module param for configuring notify window, which is in unit of
>> crystal clock cycle.
>> - A negative value (e.g. -1) is to disable this feature.
>> - Make the default as 0. It is safe because an internal threshold is added
>> to notify window to ensure all the normal instructions being coverd.
>> - User can set it to a large value when they want to give more cycles to
>> wait for some reasons, e.g., silicon wrongly kill some normal instruction
>> due to internal threshold is too small.
>>
>> Notify VM exit is defined in latest Intel Architecture Instruction Set
>> Extensions Programming Reference, chapter 9.2.
>>
>> Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>> Signed-off-by: Tao Xu <tao3.xu@intel.com>
>> ---
>>
>> Changelog:
>> v2:
>>       Default set notify window to 0, less than 0 to disable.
>>       Add more description in commit message.
> 
> Sorry if this was already discussed, but in case of nested
> virtualization and when L1 also enables
> SECONDARY_EXEC_NOTIFY_VM_EXITING, shouldn't we just reflect NOTIFY exits
> during L2 execution to L1 instead of crashing the whole L1?
> 

Yes. If we expose it to nested, we should reflect the Notify VM exit to
L1 when L1 enables it.

But regarding nested, there are more things that need to be discussed, e.g.:
1) There is a dependency between L0 and L1 for security reasons. When
L0 enables it, it shouldn't be turned off while an L2 VM is running.
    a. Don't expose it to L1, but enable it for L1 when an L2 VM is running.
    b. Expose it to L1 and force it enabled.

2) When exposing it to L1, vmcs02.notify_window needs to be
min(L0.notify_window, L1.notify_window).

We don't deal with nested here, to keep this patch simple.
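
As a rough sketch of what 1b) and 2) above could look like when combining
L0's and L1's settings for vmcs02 (the helper and types are hypothetical,
and the clock-frequency question raised in the next reply is ignored):

#include <stdbool.h>
#include <stdint.h>

struct notify_cfg {
	bool enabled;
	uint32_t window;	/* in crystal clock cycles */
};

/* Hypothetical combination of L0's and L1's notify-window settings. */
static struct notify_cfg vmcs02_notify(struct notify_cfg l0, struct notify_cfg l1)
{
	struct notify_cfg out;

	/*
	 * 1b): keep the control enabled whenever L0 enables it, so L1 cannot
	 * escape by disabling it for its own L2.
	 */
	out.enabled = l0.enabled || l1.enabled;

	/* 2): take the smaller window when both levels enable the feature. */
	if (l0.enabled && l1.enabled)
		out.window = l0.window < l1.window ? l0.window : l1.window;
	else
		out.window = l0.enabled ? l0.window : l1.window;

	return out;
}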
Jim Mattson June 3, 2021, 1:35 p.m. UTC | #4
On Wed, Jun 2, 2021 at 6:25 PM Xiaoyao Li <xiaoyao.li@intel.com> wrote:
>
> On 6/2/2021 6:31 PM, Vitaly Kuznetsov wrote:
> > Tao Xu <tao3.xu@intel.com> writes:
> >
> >> There are some cases that malicious virtual machines can cause CPU stuck
> >> (event windows don't open up), e.g., infinite loop in microcode when
> >> nested #AC (CVE-2015-5307). No event window obviously means no events,
> >> e.g. NMIs, SMIs, and IRQs will all be blocked, may cause the related
> >> hardware CPU can't be used by host or other VM.
> >>
> >> To resolve those cases, it can enable a notify VM exit if no event
> >> window occur in VMX non-root mode for a specified amount of time
> >> (notify window). Since CPU is first observed the risk of not causing
> >> forward progress, after notify window time in a units of crystal clock,
> >> Notify VM exit will happen. Notify VM exit can happen incident to delivery
> >> of a vectored event.
> >>
> >> Expose a module param for configuring notify window, which is in unit of
> >> crystal clock cycle.
> >> - A negative value (e.g. -1) is to disable this feature.
> >> - Make the default as 0. It is safe because an internal threshold is added
> >> to notify window to ensure all the normal instructions being coverd.
> >> - User can set it to a large value when they want to give more cycles to
> >> wait for some reasons, e.g., silicon wrongly kill some normal instruction
> >> due to internal threshold is too small.
> >>
> >> Notify VM exit is defined in latest Intel Architecture Instruction Set
> >> Extensions Programming Reference, chapter 9.2.
> >>
> >> Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
> >> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> >> Signed-off-by: Tao Xu <tao3.xu@intel.com>
> >> ---
> >>
> >> Changelog:
> >> v2:
> >>       Default set notify window to 0, less than 0 to disable.
> >>       Add more description in commit message.
> >
> > Sorry if this was already discussed, but in case of nested
> > virtualization and when L1 also enables
> > SECONDARY_EXEC_NOTIFY_VM_EXITING, shouldn't we just reflect NOTIFY exits
> > during L2 execution to L1 instead of crashing the whole L1?
> >
>
> yes. If we expose it to nested, it should reflect the Notify VM exit to
> L1 when L1 enables it.
>
> But regarding nested, there are more things need to be discussed. e.g.,
> 1) It has dependence between L0 and L1, for security consideration. When
> L0 enables it, it shouldn't be turned off during L2 VM is running.
>     a. Don't expose to L1 but enable for L1 when L2 VM is running.
>     b. expose it to L1 and force it enabled.
>
> 2) When expose it to L1, vmcs02.notify_window needs to be
> min(L0.notify_window, L1.nofity_window)

I don't think this can be a simple 'min', since L1's clock may run at
a different frequency from L0's clock.
Vitaly Kuznetsov June 3, 2021, 1:43 p.m. UTC | #5
Tao Xu <tao3.xu@intel.com> writes:

> On 6/2/21 6:31 PM, Vitaly Kuznetsov wrote:
>> Tao Xu <tao3.xu@intel.com> writes:
>> 
>>> There are some cases that malicious virtual machines can cause CPU stuck
>>> (event windows don't open up), e.g., infinite loop in microcode when
>>> nested #AC (CVE-2015-5307). No event window obviously means no events,
>>> e.g. NMIs, SMIs, and IRQs will all be blocked, may cause the related
>>> hardware CPU can't be used by host or other VM.
>>>
>>> To resolve those cases, it can enable a notify VM exit if no event
>>> window occur in VMX non-root mode for a specified amount of time
>>> (notify window). Since CPU is first observed the risk of not causing
>>> forward progress, after notify window time in a units of crystal clock,
>>> Notify VM exit will happen. Notify VM exit can happen incident to delivery
>>> of a vectored event.
>>>
>>> Expose a module param for configuring notify window, which is in unit of
>>> crystal clock cycle.
>>> - A negative value (e.g. -1) is to disable this feature.
>>> - Make the default as 0. It is safe because an internal threshold is added
>>> to notify window to ensure all the normal instructions being coverd.
>>> - User can set it to a large value when they want to give more cycles to
>>> wait for some reasons, e.g., silicon wrongly kill some normal instruction
>>> due to internal threshold is too small.
>>>
>>> Notify VM exit is defined in latest Intel Architecture Instruction Set
>>> Extensions Programming Reference, chapter 9.2.
>>>
>>> Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
>>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>>> Signed-off-by: Tao Xu <tao3.xu@intel.com>
>>> ---
>>>
>>> Changelog:
>>> v2:
>>>       Default set notify window to 0, less than 0 to disable.
>>>       Add more description in commit message.
>> 
>> Sorry if this was already discussed, but in case of nested
>> virtualization and when L1 also enables
>> SECONDARY_EXEC_NOTIFY_VM_EXITING, shouldn't we just reflect NOTIFY exits
>> during L2 execution to L1 instead of crashing the whole L1?
>> 
> Notify VM Exit will not crash L1 guest if VM context valid in exit 
> qualification. After VM exit, VMM can resume the guest normally.

Wrong choice of words, sorry. Indeed, the VMM is free to decide what to do
upon such a vmexit.
Vitaly Kuznetsov June 3, 2021, 1:52 p.m. UTC | #6
Xiaoyao Li <xiaoyao.li@intel.com> writes:

> On 6/2/2021 6:31 PM, Vitaly Kuznetsov wrote:
>> Tao Xu <tao3.xu@intel.com> writes:
>> 
>>> There are some cases that malicious virtual machines can cause CPU stuck
>>> (event windows don't open up), e.g., infinite loop in microcode when
>>> nested #AC (CVE-2015-5307). No event window obviously means no events,
>>> e.g. NMIs, SMIs, and IRQs will all be blocked, may cause the related
>>> hardware CPU can't be used by host or other VM.
>>>
>>> To resolve those cases, it can enable a notify VM exit if no event
>>> window occur in VMX non-root mode for a specified amount of time
>>> (notify window). Since CPU is first observed the risk of not causing
>>> forward progress, after notify window time in a units of crystal clock,
>>> Notify VM exit will happen. Notify VM exit can happen incident to delivery
>>> of a vectored event.
>>>
>>> Expose a module param for configuring notify window, which is in unit of
>>> crystal clock cycle.
>>> - A negative value (e.g. -1) is to disable this feature.
>>> - Make the default as 0. It is safe because an internal threshold is added
>>> to notify window to ensure all the normal instructions being coverd.
>>> - User can set it to a large value when they want to give more cycles to
>>> wait for some reasons, e.g., silicon wrongly kill some normal instruction
>>> due to internal threshold is too small.
>>>
>>> Notify VM exit is defined in latest Intel Architecture Instruction Set
>>> Extensions Programming Reference, chapter 9.2.
>>>
>>> Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
>>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>>> Signed-off-by: Tao Xu <tao3.xu@intel.com>
>>> ---
>>>
>>> Changelog:
>>> v2:
>>>       Default set notify window to 0, less than 0 to disable.
>>>       Add more description in commit message.
>> 
>> Sorry if this was already discussed, but in case of nested
>> virtualization and when L1 also enables
>> SECONDARY_EXEC_NOTIFY_VM_EXITING, shouldn't we just reflect NOTIFY exits
>> during L2 execution to L1 instead of crashing the whole L1?
>> 
>
> yes. If we expose it to nested, it should reflect the Notify VM exit to 
> L1 when L1 enables it.
>
> But regarding nested, there are more things need to be discussed. e.g.,
> 1) It has dependence between L0 and L1, for security consideration. When 
> L0 enables it, it shouldn't be turned off during L2 VM is running.
>     a. Don't expose to L1 but enable for L1 when L2 VM is running.
>     b. expose it to L1 and force it enabled.

Could you please elaborate on the 'security' concern? My understanding is
that during L2 execution:
If L0 enables the feature and L1 doesn't, the vmexit goes to L0.
If L1 enables the feature and L0 doesn't, the vmexit goes to L1.
If both L0 and L1 enable the feature, the vmexit can probably (I didn't put
enough thought into it, I'm afraid) go to the one with the smaller window.
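
Roughly, that routing could be sketched as follows (hypothetical helper,
just to make the three cases explicit):

#include <stdbool.h>
#include <stdint.h>

enum notify_dest { NOTIFY_TO_L0, NOTIFY_TO_L1 };

/* Which hypervisor would handle a notify VM exit taken while L2 runs. */
static enum notify_dest route_notify_exit(bool l0_on, uint32_t l0_window,
					  bool l1_on, uint32_t l1_window)
{
	if (!l1_on)
		return NOTIFY_TO_L0;
	if (!l0_on)
		return NOTIFY_TO_L1;

	/* Both enabled: reflect to whichever level asked for the smaller window. */
	return l1_window < l0_window ? NOTIFY_TO_L1 : NOTIFY_TO_L0;
}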

>
> 2) When expose it to L1, vmcs02.notify_window needs to be 
> min(L0.notify_window, L1.nofity_window)
>
> We don't deal with nested to make this Patch simple.

Sure, I just wanted to check with you what the future plan is and whether
the behavior you introduce is desirable in the nested case.
Xiaoyao Li June 7, 2021, 9:23 a.m. UTC | #7
On 6/3/2021 9:52 PM, Vitaly Kuznetsov wrote:
> Xiaoyao Li <xiaoyao.li@intel.com> writes:
> 
>> On 6/2/2021 6:31 PM, Vitaly Kuznetsov wrote:
>>> Tao Xu <tao3.xu@intel.com> writes:
>>>
>>>> There are some cases that malicious virtual machines can cause CPU stuck
>>>> (event windows don't open up), e.g., infinite loop in microcode when
>>>> nested #AC (CVE-2015-5307). No event window obviously means no events,
>>>> e.g. NMIs, SMIs, and IRQs will all be blocked, may cause the related
>>>> hardware CPU can't be used by host or other VM.
>>>>
>>>> To resolve those cases, it can enable a notify VM exit if no event
>>>> window occur in VMX non-root mode for a specified amount of time
>>>> (notify window). Since CPU is first observed the risk of not causing
>>>> forward progress, after notify window time in a units of crystal clock,
>>>> Notify VM exit will happen. Notify VM exit can happen incident to delivery
>>>> of a vectored event.
>>>>
>>>> Expose a module param for configuring notify window, which is in unit of
>>>> crystal clock cycle.
>>>> - A negative value (e.g. -1) is to disable this feature.
>>>> - Make the default as 0. It is safe because an internal threshold is added
>>>> to notify window to ensure all the normal instructions being coverd.
>>>> - User can set it to a large value when they want to give more cycles to
>>>> wait for some reasons, e.g., silicon wrongly kill some normal instruction
>>>> due to internal threshold is too small.
>>>>
>>>> Notify VM exit is defined in latest Intel Architecture Instruction Set
>>>> Extensions Programming Reference, chapter 9.2.
>>>>
>>>> Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
>>>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>>>> Signed-off-by: Tao Xu <tao3.xu@intel.com>
>>>> ---
>>>>
>>>> Changelog:
>>>> v2:
>>>>        Default set notify window to 0, less than 0 to disable.
>>>>        Add more description in commit message.
>>>
>>> Sorry if this was already discussed, but in case of nested
>>> virtualization and when L1 also enables
>>> SECONDARY_EXEC_NOTIFY_VM_EXITING, shouldn't we just reflect NOTIFY exits
>>> during L2 execution to L1 instead of crashing the whole L1?
>>>
>>
>> yes. If we expose it to nested, it should reflect the Notify VM exit to
>> L1 when L1 enables it.
>>
>> But regarding nested, there are more things need to be discussed. e.g.,
>> 1) It has dependence between L0 and L1, for security consideration. When
>> L0 enables it, it shouldn't be turned off during L2 VM is running.
>>      a. Don't expose to L1 but enable for L1 when L2 VM is running.
>>      b. expose it to L1 and force it enabled.
> 
> Could you please elaborate on the 'security' concern? 

I mean the case where, if we expose this feature to the L1 VMM, the L1 VMM
cannot enable/disable this feature on its own when L0 turns it on.

I.e., vmcs02's setting has to be (L0's | L1's);

otherwise the L1 guest could escape by creating a nested guest and disabling it.

> My understanding
> that during L2 execution:
> If L0 enables the feature and L1 doesn't, vmexit goes to L0.
> If L1 enables the feature and L0 doesn't, vmexit goes to L1.

> If both L0 and L1 enable the feature, vmexit can probably (I didn't put
> enough though in it I'm afraid) go to the one which has smaller window.

It sounds reasonable.

>>
>> 2) When expose it to L1, vmcs02.notify_window needs to be
>> min(L0.notify_window, L1.nofity_window)
>>
>> We don't deal with nested to make this Patch simple.
> 
> Sure, I just wanted to check with you what's the future plan and if the
> behavior you introduce is desireable in nested case.
>
Xiaoyao Li June 7, 2021, 9:24 a.m. UTC | #8
On 6/3/2021 9:35 PM, Jim Mattson wrote:
> On Wed, Jun 2, 2021 at 6:25 PM Xiaoyao Li <xiaoyao.li@intel.com> wrote:
>>
>> On 6/2/2021 6:31 PM, Vitaly Kuznetsov wrote:
>>> Tao Xu <tao3.xu@intel.com> writes:
>>>
>>>> There are some cases that malicious virtual machines can cause CPU stuck
>>>> (event windows don't open up), e.g., infinite loop in microcode when
>>>> nested #AC (CVE-2015-5307). No event window obviously means no events,
>>>> e.g. NMIs, SMIs, and IRQs will all be blocked, may cause the related
>>>> hardware CPU can't be used by host or other VM.
>>>>
>>>> To resolve those cases, it can enable a notify VM exit if no event
>>>> window occur in VMX non-root mode for a specified amount of time
>>>> (notify window). Since CPU is first observed the risk of not causing
>>>> forward progress, after notify window time in a units of crystal clock,
>>>> Notify VM exit will happen. Notify VM exit can happen incident to delivery
>>>> of a vectored event.
>>>>
>>>> Expose a module param for configuring notify window, which is in unit of
>>>> crystal clock cycle.
>>>> - A negative value (e.g. -1) is to disable this feature.
>>>> - Make the default as 0. It is safe because an internal threshold is added
>>>> to notify window to ensure all the normal instructions being coverd.
>>>> - User can set it to a large value when they want to give more cycles to
>>>> wait for some reasons, e.g., silicon wrongly kill some normal instruction
>>>> due to internal threshold is too small.
>>>>
>>>> Notify VM exit is defined in latest Intel Architecture Instruction Set
>>>> Extensions Programming Reference, chapter 9.2.
>>>>
>>>> Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
>>>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>>>> Signed-off-by: Tao Xu <tao3.xu@intel.com>
>>>> ---
>>>>
>>>> Changelog:
>>>> v2:
>>>>        Default set notify window to 0, less than 0 to disable.
>>>>        Add more description in commit message.
>>>
>>> Sorry if this was already discussed, but in case of nested
>>> virtualization and when L1 also enables
>>> SECONDARY_EXEC_NOTIFY_VM_EXITING, shouldn't we just reflect NOTIFY exits
>>> during L2 execution to L1 instead of crashing the whole L1?
>>>
>>
>> yes. If we expose it to nested, it should reflect the Notify VM exit to
>> L1 when L1 enables it.
>>
>> But regarding nested, there are more things need to be discussed. e.g.,
>> 1) It has dependence between L0 and L1, for security consideration. When
>> L0 enables it, it shouldn't be turned off during L2 VM is running.
>>      a. Don't expose to L1 but enable for L1 when L2 VM is running.
>>      b. expose it to L1 and force it enabled.
>>
>> 2) When expose it to L1, vmcs02.notify_window needs to be
>> min(L0.notify_window, L1.nofity_window)
> 
> I don't think this can be a simple 'min', since L1's clock may run at
> a different frequency from L0's clock.
> 

Good catch. We will take it into account.

Thanks!
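
For illustration, converting L1's window into L0 crystal clock cycles
before taking the minimum could look like the following (the frequencies
and the round-up policy are assumptions, not something this patch defines):

#include <stdint.h>

/* Convert a window given in L1 crystal clock cycles into L0 cycles. */
static uint32_t l1_window_in_l0_cycles(uint32_t l1_window,
				       uint64_t l0_crystal_hz,
				       uint64_t l1_crystal_hz)
{
	/* Round up so the converted window is never shorter than L1 requested. */
	uint64_t cycles = ((uint64_t)l1_window * l0_crystal_hz + l1_crystal_hz - 1) /
			  l1_crystal_hz;

	return cycles > UINT32_MAX ? UINT32_MAX : (uint32_t)cycles;
}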
Tao Xu June 24, 2021, 4:52 a.m. UTC | #9
Hi,

Ping for comments.

Thank you!

Xiaoyao Li July 22, 2021, 3:25 a.m. UTC | #10
On 5/25/2021 1:12 PM, Tao Xu wrote:
> There are some cases that malicious virtual machines can cause CPU stuck
> (event windows don't open up), e.g., infinite loop in microcode when
> nested #AC (CVE-2015-5307). No event window obviously means no events,
> e.g. NMIs, SMIs, and IRQs will all be blocked, may cause the related
> hardware CPU can't be used by host or other VM.
> 
> To resolve those cases, it can enable a notify VM exit if no event
> window occur in VMX non-root mode for a specified amount of time
> (notify window). Since CPU is first observed the risk of not causing
> forward progress, after notify window time in a units of crystal clock,
> Notify VM exit will happen. Notify VM exit can happen incident to delivery
> of a vectored event.
> 
> Expose a module param for configuring notify window, which is in unit of
> crystal clock cycle.
> - A negative value (e.g. -1) is to disable this feature.
> - Make the default as 0. It is safe because an internal threshold is added
> to notify window to ensure all the normal instructions being coverd.

Paolo, Andy, and Sean,

Do you have any comments on this patch? Especially about making the default
notify_window 0.

Thanks,
-Xiaoyao

> index 3fd9a7e9d90c..bb3b49b1fb0d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -278,6 +278,8 @@ struct kvm_xen_exit {
>   #define KVM_INTERNAL_ERROR_DELIVERY_EV	3
>   /* Encounter unexpected vm-exit reason */
>   #define KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON	4
> +/* Encounter notify vm-exit */
> +#define KVM_INTERNAL_ERROR_NO_EVENT_WINDOW   5
>   
>   /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
>   struct kvm_run {
>
Sean Christopherson July 30, 2021, 8:41 p.m. UTC | #11
On Tue, May 25, 2021, Tao Xu wrote:
>  #endif /* __KVM_X86_VMX_CAPS_H */
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 4bceb5ca3a89..c0ad01c88dac 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -205,6 +205,10 @@ module_param(ple_window_max, uint, 0444);
>  int __read_mostly pt_mode = PT_MODE_SYSTEM;
>  module_param(pt_mode, int, S_IRUGO);
>  
> +/* Default is 0, less than 0 (for example, -1) disables notify window. */
> +static int __read_mostly notify_window;

I'm not sure I like the idea of trusting ucode to select an appropriate internal
threshold.  Unless the internal threshold is architecturally defined to be at
least N nanoseconds or whatever, I think KVM should provide its own sane default.
E.g. it's not hard to imagine a scenario where a ucode patch gets rolled out that
adjusts the threshold and starts silently degrading guest performance.

Even if the internal threshold isn't architecturally constrained, it would be very,
very helpful if Intel could publish the per-uarch/stepping thresholds, e.g. to give
us a ballpark idea of how aggressive KVM can be before it risks false positives.

> +module_param(notify_window, int, 0644);

I really like the idea of making the module param writable, but doing so will
require far more effort.  At an absolute minimum, the module param would need to
be snapshotted at VM creation time, a la lapic_timer_advance_ns, otherwise the
behavior is non-deterministic.
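
For illustration only, the snapshot would look something like the below, where
kvm->arch.notify_window is a made-up per-VM field (a sketch of the pattern, not
a proposal):

	static int vmx_vm_init(struct kvm *kvm)
	{
		/* Snapshot the module param so later writes don't affect this VM. */
		kvm->arch.notify_window = notify_window;
		...
	}

with init_vmcs() and vmx_compute_secondary_exec_control() consuming the per-VM
snapshot instead of the raw module param.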

But I don't think snapshotting is a worthwhile approach because the main reason
for adjusting the window while guests are running is probably going to be to relax
the window because guests are observing degraded performance.  Hopefully that
never happens, but the "CPU adds a magic internal buffer" behavior makes me more
than a bit nervous.

And on the other hand, adding a ton of logic to forcefully update every VMCS is
likely overkill.

So, that takes us back to providing a sane, somewhat conservative default.  I've
said in the past that ideally the notify_window would be as small as possible,
but pushing it down to single digit cycles swings the pendulum too far in the
other direction.

> +
>  static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
>  static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
>  static DEFINE_MUTEX(vmx_l1d_flush_mutex);
> @@ -2539,7 +2543,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  			SECONDARY_EXEC_PT_USE_GPA |
>  			SECONDARY_EXEC_PT_CONCEAL_VMX |
>  			SECONDARY_EXEC_ENABLE_VMFUNC |
> -			SECONDARY_EXEC_BUS_LOCK_DETECTION;
> +			SECONDARY_EXEC_BUS_LOCK_DETECTION |
> +			SECONDARY_EXEC_NOTIFY_VM_EXITING;
>  		if (cpu_has_sgx())
>  			opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
>  		if (adjust_vmx_controls(min2, opt2,
> @@ -4376,6 +4381,9 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
>  	if (!vcpu->kvm->arch.bus_lock_detection_enabled)
>  		exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
>  
> +	if (cpu_has_notify_vm_exiting() && notify_window < 0)
> +		exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
> +
>  	vmx->secondary_exec_control = exec_control;
>  }
>  
> @@ -4423,6 +4431,9 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>  		vmx->ple_window_dirty = true;
>  	}
>  
> +	if (cpu_has_notify_vm_exiting() && notify_window >= 0)
> +		vmcs_write32(NOTIFY_WINDOW, notify_window);

I'm all for punting full nested support to a future patch, but _this_ patch
absolutely needs to apply KVM's notify_window to vmcs02, otherwise L1 can simply
run in L2 to avoid the restriction.  init_vmcs() is used only for vmcs01, i.e.
prepare_vmcs02_constant_state() needs to set the correct vmcs.NOTIFY_WINDOW,
and prepare_vmcs02_early() needs to set/clear SECONDARY_EXEC_NOTIFY_VM_EXITING
appropriately.
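
Something along these lines (untested, just to show where the hooks are; the
conditions mirror what this patch does for vmcs01):

	/* in prepare_vmcs02_constant_state(): */
	if (cpu_has_notify_vm_exiting() && notify_window >= 0)
		vmcs_write32(NOTIFY_WINDOW, notify_window);

	/* in prepare_vmcs02_early(), when computing the secondary exec controls: */
	if (cpu_has_notify_vm_exiting() && notify_window >= 0)
		exec_control |= SECONDARY_EXEC_NOTIFY_VM_EXITING;
	else
		exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;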

> +
>  	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
>  	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
>  	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
> @@ -5642,6 +5653,31 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
>  	return 0;
>  }
>  
> +static int handle_notify(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
> +
> +	if (!(exit_qual & NOTIFY_VM_CONTEXT_INVALID)) {

What does CONTEXT_INVALID mean?  The ISE doesn't provide any information whatsoever.

> +		/*
> +		 * Notify VM exit happened while executing iret from NMI,
> +		 * "blocked by NMI" bit has to be set before next VM entry.
> +		 */
> +		if (enable_vnmi &&
> +		    (exit_qual & INTR_INFO_UNBLOCK_NMI))
> +			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
> +				      GUEST_INTR_STATE_NMI);

Hmm, logging of some kind is probably a good idea if this exit occurs, e.g. so
that the host can (a) get an indication that a guest is potentially malicious and
(b) rule out (or confirm) notify_window exits as the source of degraded guest
performance.

Maybe add a per-vCPU stat, "u64 notify_window_exits"?
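
E.g. (a sketch only; the new stat would also need to be hooked up to the usual
stats plumbing):

	/* arch/x86/include/asm/kvm_host.h */
	struct kvm_vcpu_stat {
		...
		u64 notify_window_exits;
	};

	/* in handle_notify(): */
	++vcpu->stat.notify_window_exits;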

Another thought would be to also do pr_info/warn_ratelimited if a vCPU gets
multiple notify_window exits and doesn't appear to be making forward progress,
e.g. the same RIP observed on two notify_window exits in a row.  Even if the guest is
making forward progress, displaying the guest RIP and instruction (if possible)
could be useful in triaging why the guest appears to be getting false positives.
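
Completely untested, and last_notify_rip is a made-up vcpu_vmx field, but
something like:

	unsigned long rip = kvm_rip_read(vcpu);

	if (rip == to_vmx(vcpu)->last_notify_rip)
		pr_warn_ratelimited("kvm: vCPU %d made no progress between notify exits, rip 0x%lx\n",
				    vcpu->vcpu_id, rip);
	to_vmx(vcpu)->last_notify_rip = rip;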

> +		return 1;
> +	}
> +
> +	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> +	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_NO_EVENT_WINDOW;
> +	vcpu->run->internal.ndata = 1;
> +	vcpu->run->internal.data[0] = exit_qual;

Unless an invalid context can _never_ happen, or is already fatal to the guest,
I don't think effectively killing the guest is a good idea.  KVM doesn't know
for certain that the guest was being malicious, all it knows is that the CPU
didn't open an event window for some arbitrary amount of time (arbitrary because
the internal threshold is likely to be uarch specific).  KVM is getting exits,
which means it's getting a chance to check for signals, etc..., so resuming the
guest is ok.

> +
> +	return 0;
> +}
> +
>  /*
>   * The exit handlers return 1 if the exit was handled fully and guest execution
>   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
> @@ -5699,6 +5735,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
>  	[EXIT_REASON_ENCLS]		      = handle_encls,
>  	[EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
> +	[EXIT_REASON_NOTIFY]		      = handle_notify,
>  };
>
Xiaoyao Li Aug. 2, 2021, 12:53 p.m. UTC | #12
On 7/31/2021 4:41 AM, Sean Christopherson wrote:
> On Tue, May 25, 2021, Tao Xu wrote:
>>   #endif /* __KVM_X86_VMX_CAPS_H */
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 4bceb5ca3a89..c0ad01c88dac 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -205,6 +205,10 @@ module_param(ple_window_max, uint, 0444);
>>   int __read_mostly pt_mode = PT_MODE_SYSTEM;
>>   module_param(pt_mode, int, S_IRUGO);
>>   
>> +/* Default is 0, less than 0 (for example, -1) disables notify window. */
>> +static int __read_mostly notify_window;
> 
> I'm not sure I like the idea of trusting ucode to select an appropriate internal
> threshold.  Unless the internal threshold is architecturally defined to be at
> least N nanoseconds or whatever, I think KVM should provide its own sane default.
> E.g. it's not hard to imagine a scenario where a ucode patch gets rolled out that
> adjusts the threshold and starts silently degrading guest performance.

You mean when the internal threshold gets smaller somehow and causes 
false positives that lead to unexpected VM exits on normal instructions? 
In that case, we can increase vmcs.notify_window in KVM.

I think there is no better way to avoid this case if ucode changes the 
internal threshold, unless KVM's default notify_window is big enough.

> Even if the internal threshold isn't architecturally constrained, it would be very,
> very helpful if Intel could publish the per-uarch/stepping thresholds, e.g. to give
> us a ballpark idea of how agressive KVM can be before it risks false positives.

Even if Intel publishes the internal threshold, we still need to provide a 
final best_value (internal + vmcs.notify_window). Then what's that value?

If we have an option for the final best_value, then I think it's OK to just 
let vmcs.notify_window = best_value. Then the true final value is 
best_value + internal.
  - if it's a normal instruction, it should finish within best_value, let 
alone best_value + internal, so it makes no difference.
  - if it's a malicious instruction, it won't make it to the next 
instruction whether we wait for best_value or best_value + internal.

>> +module_param(notify_window, int, 0644);
> 
> I really like the idea of making the module param writable, but doing so will
> require far more effort.  At an absolute minimum, the module param would need to
> be snapshotted at VM creation time, a la lapic_timer_advance_ns, otherwise the
> behavior is non-deterministic.
> 
> But I don't think snapshotting is a worthwhile approach because the main reason
> for adjusting the window while guests are running is probably going to be to relax
> the window because guest's are observing degraded performance.  

Let's make it non-writable.

> Hopefully that
> never happens, but the "CPU adds a magic internal buffer" behavior makes me more
> than a bit nervous.

If we don't trust the internal value, we can just treat it as 0.

> And on the other hand, adding a ton of logic to forcefully update every VMCS is
> likely overkill.
> 
> So, that takes us back to providing a sane, somewhat conservative default.  I've
> said in the past that ideally the notify_window would be as small as possible,
> but pushing it down to single digit cycles swings the pendulum too far in the
> other direction.
> 
>> +
>>   static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
>>   static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
>>   static DEFINE_MUTEX(vmx_l1d_flush_mutex);
>> @@ -2539,7 +2543,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>>   			SECONDARY_EXEC_PT_USE_GPA |
>>   			SECONDARY_EXEC_PT_CONCEAL_VMX |
>>   			SECONDARY_EXEC_ENABLE_VMFUNC |
>> -			SECONDARY_EXEC_BUS_LOCK_DETECTION;
>> +			SECONDARY_EXEC_BUS_LOCK_DETECTION |
>> +			SECONDARY_EXEC_NOTIFY_VM_EXITING;
>>   		if (cpu_has_sgx())
>>   			opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
>>   		if (adjust_vmx_controls(min2, opt2,
>> @@ -4376,6 +4381,9 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
>>   	if (!vcpu->kvm->arch.bus_lock_detection_enabled)
>>   		exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
>>   
>> +	if (cpu_has_notify_vm_exiting() && notify_window < 0)
>> +		exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
>> +
>>   	vmx->secondary_exec_control = exec_control;
>>   }
>>   
>> @@ -4423,6 +4431,9 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>>   		vmx->ple_window_dirty = true;
>>   	}
>>   
>> +	if (cpu_has_notify_vm_exiting() && notify_window >= 0)
>> +		vmcs_write32(NOTIFY_WINDOW, notify_window);
> 
> I'm all for punting full nested support to a future patch, but _this_ patch
> absolutely needs to apply KVM's notify_window to vmcs02, otherwise L1 can simply
> run in L2 to avoid the restriction.  init_vmcs() is used only for vmcs01, i.e.
> prepare_vmcs02_constant_state() needs to set the correct vmcs.NOTIFY_WINDOW,
> and prepare_vmcs02_early() needs to set/clear SECONDARY_EXEC_NOTIFY_VM_EXITING
> appropriately.

Thanks for pointing it out. We will fix it in the next version.

>> +
>>   	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
>>   	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
>>   	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
>> @@ -5642,6 +5653,31 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
>>   	return 0;
>>   }
>>   
>> +static int handle_notify(struct kvm_vcpu *vcpu)
>> +{
>> +	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
>> +
>> +	if (!(exit_qual & NOTIFY_VM_CONTEXT_INVALID)) {
> 
> What does CONTEXT_INVALID mean?  The ISE doesn't provide any information whatsoever.

It means whether the VM context is corrupted and not valid in the VMCS.

>> +		/*
>> +		 * Notify VM exit happened while executing iret from NMI,
>> +		 * "blocked by NMI" bit has to be set before next VM entry.
>> +		 */
>> +		if (enable_vnmi &&
>> +		    (exit_qual & INTR_INFO_UNBLOCK_NMI))
>> +			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
>> +				      GUEST_INTR_STATE_NMI);
> 
> Hmm, logging of some kind is probably a good idea if this exit occurs, e.g. so
> that the host can (a) get an indication that a guest is potentially malicious and
> (b) rule out (or confirm) notify_window exits as the source of degraded guest
> performance.
> 
> Maybe add a per-vCPU stat, "u64 notify_window_exits"?

Good idea.

> Another thought would be to also do pr_info/warn_ratelimited if a vCPU gets
> multiple notify_window exits and doesn't appear to be making forward progress,
> e.g. same RIP observed two notify_window exits in a row.  Even if the guest is
> making forward progress, displaying the guest RIP and instruction (if possible)
> could be useful in triaging why the guest appears to be getting false positives.

I suppose the kvm_exit tracepoint can be used if we find there are too many 
notify exits.

>> +		return 1;
>> +	}
>> +
>> +	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
>> +	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_NO_EVENT_WINDOW;
>> +	vcpu->run->internal.ndata = 1;
>> +	vcpu->run->internal.data[0] = exit_qual;
> 
> Unless an invalid context can _never_ happen, or is already fatal to the guest,

As I explained, invalid means the VM context is corrupted and not valid in 
the VMCS. We have no choice.

> I don't think effectively killing the guest is a good idea.  KVM doesn't know
> for certain that the guest was being malicious, all it knows is that the CPU
> didn't open an event window for some arbitrary amount of time (arbitrary because
> the internal threshold is likely to be uarch specific).  KVM is getting exits,
> which means it's getting a chance to check for signals, etc..., so resuming the
> guest is ok.
> 
>> +
>> +	return 0;
>> +}
>> +
>>   /*
>>    * The exit handlers return 1 if the exit was handled fully and guest execution
>>    * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
>> @@ -5699,6 +5735,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>>   	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
>>   	[EXIT_REASON_ENCLS]		      = handle_encls,
>>   	[EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
>> +	[EXIT_REASON_NOTIFY]		      = handle_notify,
>>   };
>>
Sean Christopherson Aug. 2, 2021, 3:46 p.m. UTC | #13
On Mon, Aug 02, 2021, Xiaoyao Li wrote:
> On 7/31/2021 4:41 AM, Sean Christopherson wrote:
> > On Tue, May 25, 2021, Tao Xu wrote:
> > >   #endif /* __KVM_X86_VMX_CAPS_H */
> > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > > index 4bceb5ca3a89..c0ad01c88dac 100644
> > > --- a/arch/x86/kvm/vmx/vmx.c
> > > +++ b/arch/x86/kvm/vmx/vmx.c
> > > @@ -205,6 +205,10 @@ module_param(ple_window_max, uint, 0444);
> > >   int __read_mostly pt_mode = PT_MODE_SYSTEM;
> > >   module_param(pt_mode, int, S_IRUGO);
> > > +/* Default is 0, less than 0 (for example, -1) disables notify window. */
> > > +static int __read_mostly notify_window;
> > 
> > I'm not sure I like the idea of trusting ucode to select an appropriate internal
> > threshold.  Unless the internal threshold is architecturally defined to be at
> > least N nanoseconds or whatever, I think KVM should provide its own sane default.
> > E.g. it's not hard to imagine a scenario where a ucode patch gets rolled out that
> > adjusts the threshold and starts silently degrading guest performance.
> 
> You mean when internal threshold gets smaller somehow, and cases
> false-positive that leads unexpected VM exit on normal instruction? In this
> case, we set increase the vmcs.notify_window in KVM.

Not while VMs are running though.

> I think there is no better to avoid this case if ucode changes internal
> threshold. Unless KVM's default notify_window is bigger enough.
> 
> > Even if the internal threshold isn't architecturally constrained, it would be very,
> > very helpful if Intel could publish the per-uarch/stepping thresholds, e.g. to give
> > us a ballpark idea of how agressive KVM can be before it risks false positives.
> 
> Even Intel publishes the internal threshold, we still need to provide a
> final best_value (internal + vmcs.notify_window). Then what's that value?

The ideal value would be high enough to guarantee there are zero false positives,
yet low enough to prevent a malicious guest from causing instability in the host
by blocking events for an extended duration.  The problem is that there's no
magic answer for the threshold at which a blocked event would lead to system
instability, and without at least a general idea of the internal value there's no
answer at all.

IIRC, SGX instructions have a hard upper bound of 25k cycles before they have to
check for pending interrupts, e.g. it's why EINIT is interruptible.  The 25k cycle
limit is likely a good starting point for the combined minimum.  That's why I want
to know the internal minimum; if the internal minimum is _guaranteed_ to be >25k,
then KVM can be more aggressive with its default value.

> If we have an option for final best_value, then I think it's OK to just let
> vmcs.notify_window = best_value. Then the true final value is best_value +
> internal.
>  - if it's a normal instruction, it should finish within best_value or
> best_value + internal. So it makes no difference.
>  - if it's an instruction in malicious case, it won't go to next instruction
> whether wait for best_value or best_value + internal.

...

> > > +
> > >   	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
> > >   	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
> > >   	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
> > > @@ -5642,6 +5653,31 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
> > >   	return 0;
> > >   }
> > > +static int handle_notify(struct kvm_vcpu *vcpu)
> > > +{
> > > +	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
> > > +
> > > +	if (!(exit_qual & NOTIFY_VM_CONTEXT_INVALID)) {
> > 
> > What does CONTEXT_INVALID mean?  The ISE doesn't provide any information whatsoever.
> 
> It means whether the VM context is corrupted and not valid in the VMCS.

Well that's a bit terrifying.  Under what conditions can the VM context become
corrupted?  E.g. if the context can be corrupted by an inopportune NOTIFY exit,
then KVM needs to be ultra conservative as a false positive could be fatal to a
guest.
Xiaoyao Li Aug. 3, 2021, 12:38 a.m. UTC | #14
On 8/2/2021 11:46 PM, Sean Christopherson wrote:
> On Mon, Aug 02, 2021, Xiaoyao Li wrote:
>> On 7/31/2021 4:41 AM, Sean Christopherson wrote:
>>> On Tue, May 25, 2021, Tao Xu wrote:
>>>>    #endif /* __KVM_X86_VMX_CAPS_H */
>>>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>>>> index 4bceb5ca3a89..c0ad01c88dac 100644
>>>> --- a/arch/x86/kvm/vmx/vmx.c
>>>> +++ b/arch/x86/kvm/vmx/vmx.c
>>>> @@ -205,6 +205,10 @@ module_param(ple_window_max, uint, 0444);
>>>>    int __read_mostly pt_mode = PT_MODE_SYSTEM;
>>>>    module_param(pt_mode, int, S_IRUGO);
>>>> +/* Default is 0, less than 0 (for example, -1) disables notify window. */
>>>> +static int __read_mostly notify_window;
>>>
>>> I'm not sure I like the idea of trusting ucode to select an appropriate internal
>>> threshold.  Unless the internal threshold is architecturally defined to be at
>>> least N nanoseconds or whatever, I think KVM should provide its own sane default.
>>> E.g. it's not hard to imagine a scenario where a ucode patch gets rolled out that
>>> adjusts the threshold and starts silently degrading guest performance.
>>
>> You mean when internal threshold gets smaller somehow, and cases
>> false-positive that leads unexpected VM exit on normal instruction? In this
>> case, we set increase the vmcs.notify_window in KVM.
> 
> Not while VMs are running though.
> 
>> I think there is no better to avoid this case if ucode changes internal
>> threshold. Unless KVM's default notify_window is bigger enough.
>>
>>> Even if the internal threshold isn't architecturally constrained, it would be very,
>>> very helpful if Intel could publish the per-uarch/stepping thresholds, e.g. to give
>>> us a ballpark idea of how agressive KVM can be before it risks false positives.
>>
>> Even Intel publishes the internal threshold, we still need to provide a
>> final best_value (internal + vmcs.notify_window). Then what's that value?
> 
> The ideal value would be high enough to guarantee there are zero false positives,
> yet low enough to prevent a malicious guest from causing instability in the host
> by blocking events for an extended duration.  The problem is that there's no
> magic answer for the threshold at which a blocked event would lead to system
> instability, and without at least a general idea of the internal value there's no
> answer at all.
> 
> IIRC, SGX instructions have a hard upper bound of 25k cycles before they have to
> check for pending interrupts, e.g. it's why EINIT is interruptible.  The 25k cycle
> limit is likely a good starting point for the combined minimum.  That's why I want
> to know the internal minimum; if the internal minimum is _guaranteed_ to be >25k,
> then KVM can be more aggressive with its default value.

OK. I will check internally to see if we can publish the internal threshold.

>> If we have an option for final best_value, then I think it's OK to just let
>> vmcs.notify_window = best_value. Then the true final value is best_value +
>> internal.
>>   - if it's a normal instruction, it should finish within best_value or
>> best_value + internal. So it makes no difference.
>>   - if it's an instruction in malicious case, it won't go to next instruction
>> whether wait for best_value or best_value + internal.
> 
> ...
> 
>>>> +
>>>>    	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
>>>>    	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
>>>>    	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
>>>> @@ -5642,6 +5653,31 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
>>>>    	return 0;
>>>>    }
>>>> +static int handle_notify(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
>>>> +
>>>> +	if (!(exit_qual & NOTIFY_VM_CONTEXT_INVALID)) {
>>>
>>> What does CONTEXT_INVALID mean?  The ISE doesn't provide any information whatsoever.
>>
>> It means whether the VM context is corrupted and not valid in the VMCS.
> 
> Well that's a bit terrifying.  Under what conditions can the VM context become
> corrupted?  E.g. if the context can be corrupted by an inopportune NOTIFY exit,
> then KVM needs to be ultra conservative as a false positive could be fatal to a
> guest.
> 

Short answer is no case will set the VM_CONTEXT_INVALID bit.

VM_CONTEXT_INVALID is so fatal that IMHO it won't be set for any 
inopportune NOTIFY exit.
Chenyi Qiang Sept. 2, 2021, 9:28 a.m. UTC | #15
On 8/3/2021 8:38 AM, Xiaoyao Li wrote:
> On 8/2/2021 11:46 PM, Sean Christopherson wrote:
>> On Mon, Aug 02, 2021, Xiaoyao Li wrote:
>>> On 7/31/2021 4:41 AM, Sean Christopherson wrote:
>>>> On Tue, May 25, 2021, Tao Xu wrote:
>>>>>    #endif /* __KVM_X86_VMX_CAPS_H */
>>>>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>>>>> index 4bceb5ca3a89..c0ad01c88dac 100644
>>>>> --- a/arch/x86/kvm/vmx/vmx.c
>>>>> +++ b/arch/x86/kvm/vmx/vmx.c
>>>>> @@ -205,6 +205,10 @@ module_param(ple_window_max, uint, 0444);
>>>>>    int __read_mostly pt_mode = PT_MODE_SYSTEM;
>>>>>    module_param(pt_mode, int, S_IRUGO);
>>>>> +/* Default is 0, less than 0 (for example, -1) disables notify 
>>>>> window. */
>>>>> +static int __read_mostly notify_window;
>>>>
>>>> I'm not sure I like the idea of trusting ucode to select an 
>>>> appropriate internal
>>>> threshold.  Unless the internal threshold is architecturally defined 
>>>> to be at
>>>> least N nanoseconds or whatever, I think KVM should provide its own 
>>>> sane default.
>>>> E.g. it's not hard to imagine a scenario where a ucode patch gets 
>>>> rolled out that
>>>> adjusts the threshold and starts silently degrading guest performance.
>>>
>>> You mean when internal threshold gets smaller somehow, and cases
>>> false-positive that leads unexpected VM exit on normal instruction? 
>>> In this
>>> case, we set increase the vmcs.notify_window in KVM.
>>
>> Not while VMs are running though.
>>
>>> I think there is no better to avoid this case if ucode changes internal
>>> threshold. Unless KVM's default notify_window is bigger enough.
>>>
>>>> Even if the internal threshold isn't architecturally constrained, it 
>>>> would be very,
>>>> very helpful if Intel could publish the per-uarch/stepping 
>>>> thresholds, e.g. to give
>>>> us a ballpark idea of how agressive KVM can be before it risks false 
>>>> positives.
>>>
>>> Even Intel publishes the internal threshold, we still need to provide a
>>> final best_value (internal + vmcs.notify_window). Then what's that 
>>> value?
>>
>> The ideal value would be high enough to guarantee there are zero false 
>> positives,
>> yet low enough to prevent a malicious guest from causing instability 
>> in the host
>> by blocking events for an extended duration.  The problem is that 
>> there's no
>> magic answer for the threshold at which a blocked event would lead to 
>> system
>> instability, and without at least a general idea of the internal value 
>> there's no
>> answer at all.
>>
>> IIRC, SGX instructions have a hard upper bound of 25k cycles before 
>> they have to
>> check for pending interrupts, e.g. it's why EINIT is interruptible.  
>> The 25k cycle
>> limit is likely a good starting point for the combined minimum.  
>> That's why I want
>> to know the internal minimum; if the internal minimum is _guaranteed_ 
>> to be >25k,
>> then KVM can be more aggressive with its default value.
> 
> OK. I will go internally to see if we can publish the internal threshold.
> 

Hi Sean,

After syncing internally, we know that the internal threshold is not 
architectural but a model-specific value. It will be published somewhere 
in the future.

On the Sapphire Rapids platform, the threshold is 128k. With this in mind, 
is it appropriate to set 0 as the default value of notify_window?

>>> If we have an option for final best_value, then I think it's OK to 
>>> just let
>>> vmcs.notify_window = best_value. Then the true final value is 
>>> best_value +
>>> internal.
>>>   - if it's a normal instruction, it should finish within best_value or
>>> best_value + internal. So it makes no difference.
>>>   - if it's an instruction in malicious case, it won't go to next 
>>> instruction
>>> whether wait for best_value or best_value + internal.
>>
>> ...
>>
>>>>> +
>>>>>        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
>>>>>        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
>>>>>        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
>>>>> @@ -5642,6 +5653,31 @@ static int handle_bus_lock_vmexit(struct 
>>>>> kvm_vcpu *vcpu)
>>>>>        return 0;
>>>>>    }
>>>>> +static int handle_notify(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +    unsigned long exit_qual = vmx_get_exit_qual(vcpu);
>>>>> +
>>>>> +    if (!(exit_qual & NOTIFY_VM_CONTEXT_INVALID)) {
>>>>
>>>> What does CONTEXT_INVALID mean?  The ISE doesn't provide any 
>>>> information whatsoever.
>>>
>>> It means whether the VM context is corrupted and not valid in the VMCS.
>>
>> Well that's a bit terrifying.  Under what conditions can the VM 
>> context become
>> corrupted?  E.g. if the context can be corrupted by an inopportune 
>> NOTIFY exit,
>> then KVM needs to be ultra conservative as a false positive could be 
>> fatal to a
>> guest.
>>
> 
> Short answer is no case will set the VM_CONTEXT_INVALID bit.
> 
> VM_CONTEXT_INVALID is so fatal and IMHO it won't be set for any 
> inopportune NOTIFY exit.
Sean Christopherson Sept. 2, 2021, 4:15 p.m. UTC | #16
On Tue, Aug 03, 2021, Xiaoyao Li wrote:
> On 8/2/2021 11:46 PM, Sean Christopherson wrote:
> > > > > @@ -5642,6 +5653,31 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
> > > > >    	return 0;
> > > > >    }
> > > > > +static int handle_notify(struct kvm_vcpu *vcpu)
> > > > > +{
> > > > > +	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
> > > > > +
> > > > > +	if (!(exit_qual & NOTIFY_VM_CONTEXT_INVALID)) {
> > > > 
> > > > What does CONTEXT_INVALID mean?  The ISE doesn't provide any information whatsoever.
> > > 
> > > It means whether the VM context is corrupted and not valid in the VMCS.
> > 
> > Well that's a bit terrifying.  Under what conditions can the VM context become
> > corrupted?  E.g. if the context can be corrupted by an inopportune NOTIFY exit,
> > then KVM needs to be ultra conservative as a false positive could be fatal to a
> > guest.
> > 
> 
> Short answer is no case will set the VM_CONTEXT_INVALID bit.

But something must set it, otherwise it wouldn't exist.  The condition(s) under
which it can be set matters because it affects how KVM should respond.  E.g. if
the guest can trigger VM_CONTEXT_INVALID at will, then we should probably treat
it as a shutdown and reset the VMCS.  But if VM_CONTEXT_INVALID can occur if and
only if there's a hardware/ucode issue, then we can do:

	if (KVM_BUG_ON(exit_qual & NOTIFY_VM_CONTEXT_INVALID, vcpu->kvm))
		return -EIO;

Either way, to enable this by default we need some form of documentation that
describes what conditions lead to VM_CONTEXT_INVALID.

> VM_CONTEXT_INVALID is so fatal and IMHO it won't be set for any inopportune
> NOTIFY exit.
Sean Christopherson Sept. 2, 2021, 4:29 p.m. UTC | #17
On Thu, Sep 02, 2021, Chenyi Qiang wrote:
> On 8/3/2021 8:38 AM, Xiaoyao Li wrote:
> > On 8/2/2021 11:46 PM, Sean Christopherson wrote:
> > > IIRC, SGX instructions have a hard upper bound of 25k cycles before they
> > > have to check for pending interrupts, e.g. it's why EINIT is
> > > interruptible.  The 25k cycle limit is likely a good starting point for
> > > the combined minimum.  That's why I want to know the internal minimum; if
> > > the internal minimum is _guaranteed_ to be >25k, then KVM can be more
> > > aggressive with its default value.
> > 
> > OK. I will go internally to see if we can publish the internal threshold.
> > 
> 
> Hi Sean,
> 
> After syncing internally, we know that the internal threshold is not
> architectural but a model-specific value. It will be published in some place
> in future.

Any chance it will also be discoverable, e.g. via an MSR?  That would be ideal
as we could give the module param an "auto" mode where the combined threshold is
set to a minimum KVM-defined value, e.g.

	static int __read_mostly notify_window = -1;
	module_param(notify_window, int, 0444);

	...

	rdmsrl_safe(MSR_NOTIFY_WINDOW_BUFFER, &buffer);
	if (notify_window == -1) {
		if (buffer >= KVM_DEFAULT_NOTIFY_WINDOW)
			notify_window = 0;
		else
			notify_window = KVM_DEFAULT_NOTIFY_WINDOW - buffer;
	}
		
> On Sapphire Rapids platform, the threshold is 128k. With this in mind, is it
> appropriate to set 0 as the default value of notify_window?

Maybe?  That's still not a guarantee that _future_ CPUs will have an internal
threshold >25k.

On a related topic, this needs tests.  One thought would be to stop unconditionally
intercepting #AC if NOTIFY_WINDOW is enabled, and then have the test set up the
infinite #AC vectoring scenario.
Sean Christopherson Sept. 2, 2021, 4:36 p.m. UTC | #18
On Thu, Sep 02, 2021, Sean Christopherson wrote:
> On Tue, Aug 03, 2021, Xiaoyao Li wrote:
> > On 8/2/2021 11:46 PM, Sean Christopherson wrote:
> > > > > > @@ -5642,6 +5653,31 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
> > > > > >    	return 0;
> > > > > >    }
> > > > > > +static int handle_notify(struct kvm_vcpu *vcpu)
> > > > > > +{
> > > > > > +	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
> > > > > > +
> > > > > > +	if (!(exit_qual & NOTIFY_VM_CONTEXT_INVALID)) {
> > > > > 
> > > > > What does CONTEXT_INVALID mean?  The ISE doesn't provide any information whatsoever.
> > > > 
> > > > It means whether the VM context is corrupted and not valid in the VMCS.
> > > 
> > > Well that's a bit terrifying.  Under what conditions can the VM context become
> > > corrupted?  E.g. if the context can be corrupted by an inopportune NOTIFY exit,
> > > then KVM needs to be ultra conservative as a false positive could be fatal to a
> > > guest.
> > > 
> > 
> > Short answer is no case will set the VM_CONTEXT_INVALID bit.
> 
> But something must set it, otherwise it wouldn't exist.  The condition(s) under
> which it can be set matters because it affects how KVM should respond.  E.g. if
> the guest can trigger VM_CONTEXT_INVALID at will, then we should probably treat
> it as a shutdown and reset the VMCS.

Oh, and "shutdown" would be relative to the VMCS, i.e. if L2 triggers a NOTIFY
exit with VM_CONTEXT_INVALID then KVM shouldn't kill the entire VM.  The least
awful option would probably be to synthesize a shutdown VM-Exit to L1.  That
won't communicate to L1 that vmcs12 state is stale/bogus, but I don't see any way
to handle that via an existing VM-Exit reason :-/

> But if VM_CONTEXT_INVALID can occur if and only if there's a hardware/ucode
> issue, then we can do:
> 
> 	if (KVM_BUG_ON(exit_qual & NOTIFY_VM_CONTEXT_INVALID, vcpu->kvm))
> 		return -EIO;
> 
> Either way, to enable this by default we need some form of documentation that
> describes what conditions lead to VM_CONTEXT_INVALID.
Xiaoyao Li Sept. 7, 2021, 1:33 p.m. UTC | #19
On 9/3/2021 12:29 AM, Sean Christopherson wrote:
> On Thu, Sep 02, 2021, Chenyi Qiang wrote:
>> On 8/3/2021 8:38 AM, Xiaoyao Li wrote:
>>> On 8/2/2021 11:46 PM, Sean Christopherson wrote:
>>>> IIRC, SGX instructions have a hard upper bound of 25k cycles before they
>>>> have to check for pending interrupts, e.g. it's why EINIT is
>>>> interruptible.  The 25k cycle limit is likely a good starting point for
>>>> the combined minimum.  That's why I want to know the internal minimum; if
>>>> the internal minimum is _guaranteed_ to be >25k, then KVM can be more
>>>> aggressive with its default value.
>>>
>>> OK. I will go internally to see if we can publish the internal threshold.
>>>
>>
>> Hi Sean,
>>
>> After syncing internally, we know that the internal threshold is not
>> architectural but a model-specific value. It will be published in some place
>> in future.
> 
> Any chance it will also be discoverable, e.g. via an MSR?  

I also hope we can expose it via MSR. If not, we can maintain a table 
per FMS in KVM to get the internal threshold. However, per-FMS info is 
not friendly to virtualization (when we are going to enable nested 
support). I'll try to persuade internal to expose it via MSR, but I 
guarantee nothing.

> That would be ideal
> as we could give the module param an "auto" mode where the combined threshold is
> set to a minimum KVM-defined value, e.g.
> 
> 	static int __read_mostly notify_window = -1;
> 	module_param(notify_window, int, 444);
> 
> 	...
> 
> 	rdmsrl_safe(MSR_NOTIFY_WINDOW_BUFFER, &buffer);
> 	if (notify_window == -1) {
> 		if (buffer < KVM_DEFAULT_NOTIFY_WINDOW)
> 			notify_window = 0;
> 		else
> 			notifiy_window = KVM_DEFAULT_NOTIFY_WINDOW - buffer;
> 	}
> 		
>> On Sapphire Rapids platform, the threshold is 128k. With this in mind, is it
>> appropriate to set 0 as the default value of notify_window?
> 
> Maybe?  That's still not a guarantee that _future_ CPUs will have an internal
> threshold >25k.

but we can always use the formula above to guarantee it.

vmcs.notify_window = KVM_DEFAULT_NOTIFY_WINDOW > internal ?
                      KVM_DEFAULT_NOTIFY_WINDOW - internal : 0;

> On a related topic, this needs tests.  One thought would be to stop unconditionally
> intercepting #AC if NOTIFY_WINDOW is enabled, and then have the test set up the
> infinite #AC vectoring scenario.
> 

Yes, we have already tested this case with notify_window set to 0. 
No false positives.
Xiaoyao Li Sept. 7, 2021, 1:45 p.m. UTC | #20
On 9/3/2021 12:36 AM, Sean Christopherson wrote:
> On Thu, Sep 02, 2021, Sean Christopherson wrote:
>> On Tue, Aug 03, 2021, Xiaoyao Li wrote:
>>> On 8/2/2021 11:46 PM, Sean Christopherson wrote:
>>>>>>> @@ -5642,6 +5653,31 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
>>>>>>>     	return 0;
>>>>>>>     }
>>>>>>> +static int handle_notify(struct kvm_vcpu *vcpu)
>>>>>>> +{
>>>>>>> +	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
>>>>>>> +
>>>>>>> +	if (!(exit_qual & NOTIFY_VM_CONTEXT_INVALID)) {
>>>>>>
>>>>>> What does CONTEXT_INVALID mean?  The ISE doesn't provide any information whatsoever.
>>>>>
>>>>> It means whether the VM context is corrupted and not valid in the VMCS.
>>>>
>>>> Well that's a bit terrifying.  Under what conditions can the VM context become
>>>> corrupted?  E.g. if the context can be corrupted by an inopportune NOTIFY exit,
>>>> then KVM needs to be ultra conservative as a false positive could be fatal to a
>>>> guest.
>>>>
>>>
>>> Short answer is no case will set the VM_CONTEXT_INVALID bit.
>>
>> But something must set it, otherwise it wouldn't exist.  

For existing Intel silicon, no case will set it. Maybe a new case will 
set it in the future.

> The condition(s) under
>> which it can be set matters because it affects how KVM should respond.  E.g. if
>> the guest can trigger VM_CONTEXT_INVALID at will, then we should probably treat
>> it as a shutdown and reset the VMCS.
> 
> Oh, and "shutdown" would be relative to the VMCS, i.e. if L2 triggers a NOTIFY
> exit with VM_CONTEXT_INVALID then KVM shouldn't kill the entire VM.  The least
> awful option would probably be to synthesize a shutdown VM-Exit to L1.  That
> won't communicate to L1 that vmcs12 state is stale/bogus, but I don't see any way
> to handle that via an existing VM-Exit reason :-/
> 
>> But if VM_CONTEXT_INVALID can occur if and only if there's a hardware/ucode
>> issue, then we can do:
>>
>> 	if (KVM_BUG_ON(exit_qual & NOTIFY_VM_CONTEXT_INVALID, vcpu->kvm))
>> 		return -EIO;
>>
>> Either way, to enable this by default we need some form of documentation that
>> describes what conditions lead to VM_CONTEXT_INVALID.

I still don't know why the conditions that lead to it matter. I think the 
consensus is that once VM_CONTEXT_INVALID happens, the vcpu can no 
longer run. Either KVM_BUG_ON() or a specific exit to userspace should 
be OK?
Sean Christopherson Sept. 9, 2021, 6:47 p.m. UTC | #21
On Tue, Sep 07, 2021, Xiaoyao Li wrote:
> On 9/3/2021 12:29 AM, Sean Christopherson wrote:
> > > After syncing internally, we know that the internal threshold is not
> > > architectural but a model-specific value. It will be published in some place
> > > in future.
> > 
> > Any chance it will also be discoverable, e.g. via an MSR?
> 
> I also hope we can expose it via MSR. If not, we can maintain a table per
> FMS in KVM to get the internal threshold. However, per FMS info is not
> friendly to be virtualized (when we are going to enable the nested support).

Yeah, FMS is awful.  If the built-in buffer isn't discoverable, my vote is to
assume the worst, i.e. a built-in buffer of '0', and have the notify_window
param default to a safe value, e.g. 25k or maybe even 150k (to go above what the
hardware folks apparently deemed safe for SPR).  It's obviously not ideal, but
it's better than playing FMS guessing games.
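
E.g., reusing KVM_DEFAULT_NOTIFY_WINDOW from the earlier sketch (the exact
number being a strawman):

	#define KVM_DEFAULT_NOTIFY_WINDOW	150000

	static int __read_mostly notify_window = KVM_DEFAULT_NOTIFY_WINDOW;
	module_param(notify_window, int, 0444);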

> I'll try to persuade internal to expose it via MSR, but I guarantee nothing.

...

> > On a related topic, this needs tests.  One thought would be to stop unconditionally
> > intercepting #AC if NOTIFY_WINDOW is enabled, and then have the test set up the
> > infinite #AC vectoring scenario.
> > 
> 
> yes, we have already tested with this case with notify_window set to 0. No
> false positive.

Can you send a selftest or kvm-unit-test?
Sean Christopherson Sept. 9, 2021, 6:59 p.m. UTC | #22
On Tue, Sep 07, 2021, Xiaoyao Li wrote:
> On 9/3/2021 12:36 AM, Sean Christopherson wrote:
> > On Thu, Sep 02, 2021, Sean Christopherson wrote:
> > > On Tue, Aug 03, 2021, Xiaoyao Li wrote:
> > > > On 8/2/2021 11:46 PM, Sean Christopherson wrote:
> > > > > > > > @@ -5642,6 +5653,31 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
> > > > > > > >     	return 0;
> > > > > > > >     }
> > > > > > > > +static int handle_notify(struct kvm_vcpu *vcpu)
> > > > > > > > +{
> > > > > > > > +	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
> > > > > > > > +
> > > > > > > > +	if (!(exit_qual & NOTIFY_VM_CONTEXT_INVALID)) {
> > > > > > > 
> > > > > > > What does CONTEXT_INVALID mean?  The ISE doesn't provide any information whatsoever.
> > > > > > 
> > > > > > It means whether the VM context is corrupted and not valid in the VMCS.
> > > > > 
> > > > > Well that's a bit terrifying.  Under what conditions can the VM context become
> > > > > corrupted?  E.g. if the context can be corrupted by an inopportune NOTIFY exit,
> > > > > then KVM needs to be ultra conservative as a false positive could be fatal to a
> > > > > guest.
> > > > > 
> > > > 
> > > > Short answer is no case will set the VM_CONTEXT_INVALID bit.
> > > 
> > > But something must set it, otherwise it wouldn't exist.
> 
> For existing Intel silicon, no case will set it. Maybe in the future new
> case will set it.
> 
> > The condition(s) under
> > > which it can be set matters because it affects how KVM should respond.  E.g. if
> > > the guest can trigger VM_CONTEXT_INVALID at will, then we should probably treat
> > > it as a shutdown and reset the VMCS.
> > 
> > Oh, and "shutdown" would be relative to the VMCS, i.e. if L2 triggers a NOTIFY
> > exit with VM_CONTEXT_INVALID then KVM shouldn't kill the entire VM.  The least
> > awful option would probably be to synthesize a shutdown VM-Exit to L1.  That
> > won't communicate to L1 that vmcs12 state is stale/bogus, but I don't see any way
> > to handle that via an existing VM-Exit reason :-/
> > 
> > > But if VM_CONTEXT_INVALID can occur if and only if there's a hardware/ucode
> > > issue, then we can do:
> > > 
> > > 	if (KVM_BUG_ON(exit_qual & NOTIFY_VM_CONTEXT_INVALID, vcpu->kvm))
> > > 		return -EIO;
> > > 
> > > Either way, to enable this by default we need some form of documentation that
> > > describes what conditions lead to VM_CONTEXT_INVALID.
> 
> I still don't know why the conditions lead to it matters. I think the
> consensus is that once VM_CONTEXT_INVALID happens, the vcpu can no longer
> run.

Yes, and no longer being able to run the vCPU is precisely the problem.  The
condition(s) matters because if there's a possibility, however small, that enabling
NOTIFY_WINDOW can kill a well-behaved guest then it absolutely cannot be enabled by
default.

> Either KVM_BUG_ON() or a specific EXIT to userspace should be OK?

Not if the VM_CONTEXT_INVALID happens while L2 is running.  If software can trigger
VM_CONTEXT_INVALID at will, then killing the VM would open up the door to a
malicious L2 killing L1 (which would be rather ironic since this is an anti-DoS
feature).  IIUC, VM_CONTEXT_INVALID only means the current VMCS is garbage, thus
an occurrence while L2 is active means that vmcs02 is junk, but L1's state in vmcs01,
vmcs12, etc... is still valid.
Xiaoyao Li Sept. 10, 2021, 7:39 a.m. UTC | #23
On 9/10/2021 2:47 AM, Sean Christopherson wrote:
> On Tue, Sep 07, 2021, Xiaoyao Li wrote:
>> On 9/3/2021 12:29 AM, Sean Christopherson wrote:
>>>> After syncing internally, we know that the internal threshold is not
>>>> architectural but a model-specific value. It will be published in some place
>>>> in future.
>>>
>>> Any chance it will also be discoverable, e.g. via an MSR?
>>
>> I also hope we can expose it via MSR. If not, we can maintain a table per
>> FMS in KVM to get the internal threshold. However, per FMS info is not
>> friendly to be virtualized (when we are going to enable the nested support).
> 
> Yeah, FMS is awful.  If the built-in buffer isn't discoverable, my vote is to
> assume the worst, i.e. a built-in buffer of '0', and have the notify_window
> param default to a safe value, e.g. 25k or maybe even 150k (to go above what the
> hardware folks apparently deemed safe for SPR).  It's obviously not idea, but
> it's better than playing FMS guessing games.
> 
>> I'll try to persuade internal to expose it via MSR, but I guarantee nothing.
> 
> ...
> 
>>> On a related topic, this needs tests.  One thought would be to stop unconditionally
>>> intercepting #AC if NOTIFY_WINDOW is enabled, and then have the test set up the
>>> infinite #AC vectoring scenario.
>>>
>>
>> yes, we have already tested with this case with notify_window set to 0. No
>> false positive.
> 
> Can you send a selftest or kvm-unit-test?
> 

Actually, we implemented the attacking case of CVE-2015-5307 as a 
kvm-unit-test, while manually disabling the interception of #AC.

First, it requires a modification of KVM, so only posting the 
kvm-unit-test doesn't help.

Second, releasing the attacking case is not the correct action.
Sean Christopherson Sept. 10, 2021, 5:55 p.m. UTC | #24
On Fri, Sep 10, 2021, Xiaoyao Li wrote:
> On 9/10/2021 2:47 AM, Sean Christopherson wrote:
> > On Tue, Sep 07, 2021, Xiaoyao Li wrote:
> > > On 9/3/2021 12:29 AM, Sean Christopherson wrote:
> > > > > After syncing internally, we know that the internal threshold is not
> > > > > architectural but a model-specific value. It will be published in some place
> > > > > in future.
> > > > 
> > > > Any chance it will also be discoverable, e.g. via an MSR?
> > > 
> > > I also hope we can expose it via MSR. If not, we can maintain a table per
> > > FMS in KVM to get the internal threshold. However, per FMS info is not
> > > friendly to be virtualized (when we are going to enable the nested support).
> > 
> > Yeah, FMS is awful.  If the built-in buffer isn't discoverable, my vote is to
> > assume the worst, i.e. a built-in buffer of '0', and have the notify_window
> > param default to a safe value, e.g. 25k or maybe even 150k (to go above what the
> > hardware folks apparently deemed safe for SPR).  It's obviously not idea, but
> > it's better than playing FMS guessing games.
> > 
> > > I'll try to persuade internal to expose it via MSR, but I guarantee nothing.
> > 
> > ...
> > 
> > > > On a related topic, this needs tests.  One thought would be to stop unconditionally
> > > > intercepting #AC if NOTIFY_WINDOW is enabled, and then have the test set up the
> > > > infinite #AC vectoring scenario.
> > > > 
> > > 
> > > yes, we have already tested with this case with notify_window set to 0. No
> > > false positive.
> > 
> > Can you send a selftest or kvm-unit-test?
> > 
> 
> Actually we implement the attacking case of CVE-2015-5307 with
> kvm-unit-test, while manually disabling the intercept of #AC.
> 
> First, it requires modification of KVM that only posting the kvm-unit-test
> doesn't help.

It helps in that hacking KVM to disable #AC interception is a lot easier than
re-writing a test from scratch.

> Second, release the attacking case is not the correct action.

As in it's irresponsible to provide code that can be used to DoS a hypervisor?
The CVE is six years old, IMO security-through-obscurity is unnecessary at this
point.
Xiaoyao Li Sept. 13, 2021, 2:58 a.m. UTC | #25
On 9/10/2021 2:59 AM, Sean Christopherson wrote:
> On Tue, Sep 07, 2021, Xiaoyao Li wrote:
>> On 9/3/2021 12:36 AM, Sean Christopherson wrote:
>>> On Thu, Sep 02, 2021, Sean Christopherson wrote:
>>>> On Tue, Aug 03, 2021, Xiaoyao Li wrote:
>>>>> On 8/2/2021 11:46 PM, Sean Christopherson wrote:
>>>>>>>>> @@ -5642,6 +5653,31 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
>>>>>>>>>      	return 0;
>>>>>>>>>      }
>>>>>>>>> +static int handle_notify(struct kvm_vcpu *vcpu)
>>>>>>>>> +{
>>>>>>>>> +	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
>>>>>>>>> +
>>>>>>>>> +	if (!(exit_qual & NOTIFY_VM_CONTEXT_INVALID)) {
>>>>>>>>
>>>>>>>> What does CONTEXT_INVALID mean?  The ISE doesn't provide any information whatsoever.
>>>>>>>
>>>>>>> It means whether the VM context is corrupted and not valid in the VMCS.
>>>>>>
>>>>>> Well that's a bit terrifying.  Under what conditions can the VM context become
>>>>>> corrupted?  E.g. if the context can be corrupted by an inopportune NOTIFY exit,
>>>>>> then KVM needs to be ultra conservative as a false positive could be fatal to a
>>>>>> guest.
>>>>>>
>>>>>
>>>>> Short answer is no case will set the VM_CONTEXT_INVALID bit.
>>>>
>>>> But something must set it, otherwise it wouldn't exist.
>>
>> For existing Intel silicon, no case will set it. Maybe in the future new
>> case will set it.
>>
>>> The condition(s) under
>>>> which it can be set matters because it affects how KVM should respond.  E.g. if
>>>> the guest can trigger VM_CONTEXT_INVALID at will, then we should probably treat
>>>> it as a shutdown and reset the VMCS.
>>>
>>> Oh, and "shutdown" would be relative to the VMCS, i.e. if L2 triggers a NOTIFY
>>> exit with VM_CONTEXT_INVALID then KVM shouldn't kill the entire VM.  The least
>>> awful option would probably be to synthesize a shutdown VM-Exit to L1.  That
>>> won't communicate to L1 that vmcs12 state is stale/bogus, but I don't see any way
>>> to handle that via an existing VM-Exit reason :-/
>>>
>>>> But if VM_CONTEXT_INVALID can occur if and only if there's a hardware/ucode
>>>> issue, then we can do:
>>>>
>>>> 	if (KVM_BUG_ON(exit_qual & NOTIFY_VM_CONTEXT_INVALID, vcpu->kvm))
>>>> 		return -EIO;
>>>>
>>>> Either way, to enable this by default we need some form of documentation that
>>>> describes what conditions lead to VM_CONTEXT_INVALID.
>>
>> I still don't know why the conditions lead to it matters. I think the
>> consensus is that once VM_CONTEXT_INVALID happens, the vcpu can no longer
>> run.
> 
> Yes, and no longer being able to run the vCPU is precisely the problem.  The
> condition(s) matters because if there's a possibility, however small, that enabling
> NOTIFY_WINDOW can kill a well-behaved guest then it absolutely cannot be enabled by
> default.

For now, no condition will set it. In the future, I believe it will be set 
only for some fatal case. However, we cannot guarantee that no silicon bug 
will break a well-behaved guest. Maybe let's make it opt-in?

>> Either KVM_BUG_ON() or a specific EXIT to userspace should be OK?
> 
> Not if the VM_CONTEXT_INVALID happens while L2 is running.  If software can trigger
> VM_CONTEXT_INVALID at will, then killing the VM would open up the door to a
> malicious L2 killing L1 (which would be rather ironic since this is an anti-DoS
> feature).  IIUC, VM_CONTEXT_INVALID only means the current VMCS is garbage, thus
> an occurence while L2 is active means that vmcs02 is junk, but L1's state in vmcs01,
> vmcs12, etc... is still valid.
> 

Maybe we can kill the L2 when VM_CONTEXT_INVALID happens in L2.
Sean Christopherson Oct. 15, 2021, 6:29 p.m. UTC | #26
On Mon, Sep 13, 2021, Xiaoyao Li wrote:
> On 9/10/2021 2:59 AM, Sean Christopherson wrote:
> > Yes, and no longer being able to run the vCPU is precisely the problem.  The
> > condition(s) matters because if there's a possibility, however small, that enabling
> > NOTIFY_WINDOW can kill a well-behaved guest then it absolutely cannot be enabled by
> > default.
> 
> For now, no condition will set it. For future, I believe it will be set only
> for some fatal case. However, we cannot guarantee no silicon bug to break a
> well-behaved the guest. Maybe let's make it opt-in?

Ya, I think an off-by-default module param makes sense.

> > > Either KVM_BUG_ON() or a specific EXIT to userspace should be OK?
> > 
> > Not if the VM_CONTEXT_INVALID happens while L2 is running.  If software can trigger
> > VM_CONTEXT_INVALID at will, then killing the VM would open up the door to a
> > malicious L2 killing L1 (which would be rather ironic since this is an anti-DoS
> > feature).  IIUC, VM_CONTEXT_INVALID only means the current VMCS is garbage, thus
> > an occurence while L2 is active means that vmcs02 is junk, but L1's state in vmcs01,
> > vmcs12, etc... is still valid.
> > 
> 
> Maybe we can kill the L2 when VM_CONTEXT_INVALID happens in L2.

Ya, synthesizing a nested EXIT_REASON_TRIPLE_FAULT and sanitizing vmcs02/vmcs12 is
the least awful solution I can think of.  I could have sworn I suggested as much,
but apparently that thought never made it from my brain to the keyboard.
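
I.e. roughly (untested, and hand-waving away the vmcs02/vmcs12 sanitizing
details):

	/* in handle_notify(), for the context-invalid case: */
	if (exit_qual & NOTIFY_VM_CONTEXT_INVALID) {
		if (is_guest_mode(vcpu)) {
			nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
			return 1;
		}
		/* vmcs01 is hosed, nothing to do but punt to userspace. */
		...
	}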

Patch

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 0ffaa3156a4e..9104c85a973f 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -74,6 +74,7 @@ 
 #define SECONDARY_EXEC_TSC_SCALING              VMCS_CONTROL_BIT(TSC_SCALING)
 #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE	VMCS_CONTROL_BIT(USR_WAIT_PAUSE)
 #define SECONDARY_EXEC_BUS_LOCK_DETECTION	VMCS_CONTROL_BIT(BUS_LOCK_DETECTION)
+#define SECONDARY_EXEC_NOTIFY_VM_EXITING	VMCS_CONTROL_BIT(NOTIFY_VM_EXITING)
 
 #define PIN_BASED_EXT_INTR_MASK                 VMCS_CONTROL_BIT(INTR_EXITING)
 #define PIN_BASED_NMI_EXITING                   VMCS_CONTROL_BIT(NMI_EXITING)
@@ -269,6 +270,7 @@  enum vmcs_field {
 	SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
 	PLE_GAP                         = 0x00004020,
 	PLE_WINDOW                      = 0x00004022,
+	NOTIFY_WINDOW                   = 0x00004024,
 	VM_INSTRUCTION_ERROR            = 0x00004400,
 	VM_EXIT_REASON                  = 0x00004402,
 	VM_EXIT_INTR_INFO               = 0x00004404,
@@ -555,6 +557,11 @@  enum vm_entry_failure_code {
 #define EPT_VIOLATION_EXECUTABLE	(1 << EPT_VIOLATION_EXECUTABLE_BIT)
 #define EPT_VIOLATION_GVA_TRANSLATED	(1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
 
+/*
+ * Exit Qualifications for NOTIFY VM EXIT
+ */
+#define NOTIFY_VM_CONTEXT_INVALID     BIT(0)
+
 /*
  * VM-instruction error numbers
  */
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index d9a74681a77d..15f0f2ab4f95 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -84,5 +84,6 @@ 
 #define VMX_FEATURE_USR_WAIT_PAUSE	( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */
 #define VMX_FEATURE_ENCLV_EXITING	( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */
 #define VMX_FEATURE_BUS_LOCK_DETECTION	( 2*32+ 30) /* "" VM-Exit when bus lock caused */
+#define VMX_FEATURE_NOTIFY_VM_EXITING	( 2*32+ 31) /* VM-Exit when no event windows after notify window */
 
 #endif /* _ASM_X86_VMXFEATURES_H */
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 946d761adbd3..ef4c80f6553e 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -91,6 +91,7 @@ 
 #define EXIT_REASON_UMWAIT              67
 #define EXIT_REASON_TPAUSE              68
 #define EXIT_REASON_BUS_LOCK            74
+#define EXIT_REASON_NOTIFY              75
 
 #define VMX_EXIT_REASONS \
 	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
@@ -153,7 +154,8 @@ 
 	{ EXIT_REASON_XRSTORS,               "XRSTORS" }, \
 	{ EXIT_REASON_UMWAIT,                "UMWAIT" }, \
 	{ EXIT_REASON_TPAUSE,                "TPAUSE" }, \
-	{ EXIT_REASON_BUS_LOCK,              "BUS_LOCK" }
+	{ EXIT_REASON_BUS_LOCK,              "BUS_LOCK" }, \
+	{ EXIT_REASON_NOTIFY,                "NOTIFY"}
 
 #define VMX_EXIT_REASON_FLAGS \
 	{ VMX_EXIT_REASONS_FAILED_VMENTRY,	"FAILED_VMENTRY" }
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 8dee8a5fbc17..8527f34a84ac 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -407,4 +407,10 @@  static inline u64 vmx_supported_debugctl(void)
 	return debugctl;
 }
 
+static inline bool cpu_has_notify_vm_exiting(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_NOTIFY_VM_EXITING;
+}
+
 #endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4bceb5ca3a89..c0ad01c88dac 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -205,6 +205,10 @@  module_param(ple_window_max, uint, 0444);
 int __read_mostly pt_mode = PT_MODE_SYSTEM;
 module_param(pt_mode, int, S_IRUGO);
 
+/* Default is 0, less than 0 (for example, -1) disables notify window. */
+static int __read_mostly notify_window;
+module_param(notify_window, int, 0644);
+
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
@@ -2539,7 +2543,8 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 			SECONDARY_EXEC_PT_USE_GPA |
 			SECONDARY_EXEC_PT_CONCEAL_VMX |
 			SECONDARY_EXEC_ENABLE_VMFUNC |
-			SECONDARY_EXEC_BUS_LOCK_DETECTION;
+			SECONDARY_EXEC_BUS_LOCK_DETECTION |
+			SECONDARY_EXEC_NOTIFY_VM_EXITING;
 		if (cpu_has_sgx())
 			opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
 		if (adjust_vmx_controls(min2, opt2,
@@ -4376,6 +4381,9 @@  static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 	if (!vcpu->kvm->arch.bus_lock_detection_enabled)
 		exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
 
+	if (cpu_has_notify_vm_exiting() && notify_window < 0)
+		exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
+
 	vmx->secondary_exec_control = exec_control;
 }
 
@@ -4423,6 +4431,9 @@  static void init_vmcs(struct vcpu_vmx *vmx)
 		vmx->ple_window_dirty = true;
 	}
 
+	if (cpu_has_notify_vm_exiting() && notify_window >= 0)
+		vmcs_write32(NOTIFY_WINDOW, notify_window);
+
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
 	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
@@ -5642,6 +5653,31 @@  static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int handle_notify(struct kvm_vcpu *vcpu)
+{
+	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
+
+	if (!(exit_qual & NOTIFY_VM_CONTEXT_INVALID)) {
+		/*
+		 * Notify VM exit happened while executing iret from NMI,
+		 * "blocked by NMI" bit has to be set before next VM entry.
+		 */
+		if (enable_vnmi &&
+		    (exit_qual & INTR_INFO_UNBLOCK_NMI))
+			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+				      GUEST_INTR_STATE_NMI);
+
+		return 1;
+	}
+
+	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_NO_EVENT_WINDOW;
+	vcpu->run->internal.ndata = 1;
+	vcpu->run->internal.data[0] = exit_qual;
+
+	return 0;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -5699,6 +5735,7 @@  static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 	[EXIT_REASON_ENCLS]		      = handle_encls,
 	[EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
+	[EXIT_REASON_NOTIFY]		      = handle_notify,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -6042,7 +6079,8 @@  static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 	     exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
 	     exit_reason.basic != EXIT_REASON_PML_FULL &&
 	     exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
-	     exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
+	     exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
+	     exit_reason.basic != EXIT_REASON_NOTIFY)) {
 		int ndata = 3;
 
 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3fd9a7e9d90c..bb3b49b1fb0d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -278,6 +278,8 @@  struct kvm_xen_exit {
 #define KVM_INTERNAL_ERROR_DELIVERY_EV	3
 /* Encounter unexpected vm-exit reason */
 #define KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON	4
+/* Encounter notify vm-exit */
+#define KVM_INTERNAL_ERROR_NO_EVENT_WINDOW   5
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
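
(For illustration only, not part of the patch: a minimal sketch of how a
userspace VMM might consume the new suberror after KVM_RUN returns, assuming
run points at the vcpu's mmap'd struct kvm_run; error handling elided.)

	if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR &&
	    run->internal.suberror == KVM_INTERNAL_ERROR_NO_EVENT_WINDOW) {
		/* data[0] carries the notify exit qualification. */
		fprintf(stderr, "notify VM exit: vcpu context invalid, exit_qual=0x%llx\n",
			(unsigned long long)run->internal.data[0]);
		/* The vcpu cannot be resumed; tear the VM down. */
	}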