diff mbox

[v4,8/8] x86,kvm: Teach KVM's VMX code that CR3 isn't a constant

Message ID 7d369dab491071edc02d39e2fa2b218a3be401f2.1495990440.git.luto@kernel.org (mailing list archive)
State New, archived
Headers show

Commit Message

Andy Lutomirski May 28, 2017, 5 p.m. UTC
When PCID is enabled, CR3's PCID bits can change during context
switches, so KVM won't be able to treat CR3 as a per-mm constant any
more.

I structured this like the existing CR4 handling.  Under ordinary
circumstances (PCID disabled, or the current PCID matching the value
that's already in the VMCS), we won't do an extra VMCS write, and
we'll never do an extra direct CR3 read.  The overhead should be
minimal.

I disallowed using the new helper in non-atomic context because
PCID support will cause CR3 to stop being constant in non-atomic
process context.

(Frankly, it also scares me a bit that KVM ever treated CR3 as
constant, but it looks like it was okay before.)

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: kvm@vger.kernel.org
Cc: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 arch/x86/include/asm/mmu_context.h | 19 +++++++++++++++++++
 arch/x86/kvm/vmx.c                 | 21 ++++++++++++++++++---
 2 files changed, 37 insertions(+), 3 deletions(-)

Comments

Roman Kagan July 14, 2017, 8:06 p.m. UTC | #1
On Sun, May 28, 2017 at 10:00:17AM -0700, Andy Lutomirski wrote:
> When PCID is enabled, CR3's PCID bits can change during context
> switches, so KVM won't be able to treat CR3 as a per-mm constant any
> more.
> 
> I structured this like the existing CR4 handling.  Under ordinary
> circumstances (PCID disabled or if the current PCID and the value
> that's already in the VMCS match), then we won't do an extra VMCS
> write, and we'll never do an extra direct CR3 read.  The overhead
> should be minimal.
> 
> I disallowed using the new helper in non-atomic context because
> PCID support will cause CR3 to stop being constant in non-atomic
> process context.
> 
> (Frankly, it also scares me a bit that KVM ever treated CR3 as
> constant, but it looks like it was okay before.)
> 
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Cc: kvm@vger.kernel.org
> Cc: Rik van Riel <riel@redhat.com>
> Cc: Dave Hansen <dave.hansen@intel.com>
> Cc: Nadav Amit <namit@vmware.com>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Arjan van de Ven <arjan@linux.intel.com>
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
>  arch/x86/include/asm/mmu_context.h | 19 +++++++++++++++++++
>  arch/x86/kvm/vmx.c                 | 21 ++++++++++++++++++---
>  2 files changed, 37 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index 187c39470a0b..f20d7ea47095 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -266,4 +266,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
>  	return __pkru_allows_pkey(vma_pkey(vma), write);
>  }
>  
> +
> +/*
> + * This can be used from process context to figure out what the value of
> + * CR3 is without needing to do a (slow) read_cr3().
> + *
> + * It's intended to be used for code like KVM that sneakily changes CR3
> + * and needs to restore it.  It needs to be used very carefully.
> + */
> +static inline unsigned long __get_current_cr3_fast(void)
> +{
> +	unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
> +
> +	/* For now, be very restrictive about when this can be called. */
> +	VM_WARN_ON(in_nmi() || !in_atomic());

With the following config (from Fedora26 + olddefconfig)

  $ grep PREEMPT .config
  CONFIG_PREEMPT_NOTIFIERS=y
  # CONFIG_PREEMPT_NONE is not set
  CONFIG_PREEMPT_VOLUNTARY=y
  # CONFIG_PREEMPT is not set

I hit this warning on !in_atomic() on every vm entry.  Shouldn't this be
preemptible() instead?

Roman.
Andy Lutomirski July 15, 2017, 4:42 p.m. UTC | #2
On Fri, Jul 14, 2017 at 1:06 PM, Roman Kagan <rkagan@virtuozzo.com> wrote:
> On Sun, May 28, 2017 at 10:00:17AM -0700, Andy Lutomirski wrote:
>> When PCID is enabled, CR3's PCID bits can change during context
>> switches, so KVM won't be able to treat CR3 as a per-mm constant any
>> more.
>>
>> I structured this like the existing CR4 handling.  Under ordinary
>> circumstances (PCID disabled or if the current PCID and the value
>> that's already in the VMCS match), then we won't do an extra VMCS
>> write, and we'll never do an extra direct CR3 read.  The overhead
>> should be minimal.
>>
>> I disallowed using the new helper in non-atomic context because
>> PCID support will cause CR3 to stop being constant in non-atomic
>> process context.
>>
>> (Frankly, it also scares me a bit that KVM ever treated CR3 as
>> constant, but it looks like it was okay before.)
>>
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> Cc: Radim Krčmář <rkrcmar@redhat.com>
>> Cc: kvm@vger.kernel.org
>> Cc: Rik van Riel <riel@redhat.com>
>> Cc: Dave Hansen <dave.hansen@intel.com>
>> Cc: Nadav Amit <namit@vmware.com>
>> Cc: Michal Hocko <mhocko@suse.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Arjan van de Ven <arjan@linux.intel.com>
>> Signed-off-by: Andy Lutomirski <luto@kernel.org>
>> ---
>>  arch/x86/include/asm/mmu_context.h | 19 +++++++++++++++++++
>>  arch/x86/kvm/vmx.c                 | 21 ++++++++++++++++++---
>>  2 files changed, 37 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
>> index 187c39470a0b..f20d7ea47095 100644
>> --- a/arch/x86/include/asm/mmu_context.h
>> +++ b/arch/x86/include/asm/mmu_context.h
>> @@ -266,4 +266,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
>>       return __pkru_allows_pkey(vma_pkey(vma), write);
>>  }
>>
>> +
>> +/*
>> + * This can be used from process context to figure out what the value of
>> + * CR3 is without needing to do a (slow) read_cr3().
>> + *
>> + * It's intended to be used for code like KVM that sneakily changes CR3
>> + * and needs to restore it.  It needs to be used very carefully.
>> + */
>> +static inline unsigned long __get_current_cr3_fast(void)
>> +{
>> +     unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
>> +
>> +     /* For now, be very restrictive about when this can be called. */
>> +     VM_WARN_ON(in_nmi() || !in_atomic());
>
> With the following config (from Fedora26 + olddefconfig)
>
>   $ grep PREEMPT .config
>   CONFIG_PREEMPT_NOTIFIERS=y
>   # CONFIG_PREEMPT_NONE is not set
>   CONFIG_PREEMPT_VOLUNTARY=y
>   # CONFIG_PREEMPT is not set
>
> I hit this warning on !in_atomic() on every vm entry.  Shouldn't this be
> preemptible() instead?

Ugh, I hate in_atomic() and its willingness to return the sort-of-wrong answer.

Want to send a patch?

--Andy
diff mbox

Patch

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 187c39470a0b..f20d7ea47095 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -266,4 +266,23 @@  static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	return __pkru_allows_pkey(vma_pkey(vma), write);
 }
 
+
+/*
+ * Compute the expected CR3 value from cpu_tlbstate.loaded_mm without doing a
+ * (slow) read_cr3().  Intended for code like KVM that sneakily changes CR3
+ * and needs to restore it; callers must be non-preemptible and not in NMI.
+ * preemptible() is checked rather than !in_atomic(), which was seen to warn
+ * on every VM entry with CONFIG_PREEMPT_VOLUNTARY=y (no CONFIG_PREEMPT).
+ */
+static inline unsigned long __get_current_cr3_fast(void)
+{
+	unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
+
+	/* For now, be very restrictive about when this can be called. */
+	VM_WARN_ON(in_nmi() || preemptible());
+
+	VM_BUG_ON(cr3 != read_cr3());
+	return cr3;
+}
+
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 72f78396bc09..b7b36c9ffa3d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -48,6 +48,7 @@ 
 #include <asm/kexec.h>
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>
+#include <asm/mmu_context.h>
 
 #include "trace.h"
 #include "pmu.h"
@@ -596,6 +597,7 @@  struct vcpu_vmx {
 		int           gs_ldt_reload_needed;
 		int           fs_reload_needed;
 		u64           msr_host_bndcfgs;
+		unsigned long vmcs_host_cr3;	/* May not match real cr3 */
 		unsigned long vmcs_host_cr4;	/* May not match real cr4 */
 	} host_state;
 	struct {
@@ -5012,12 +5014,19 @@  static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 	u32 low32, high32;
 	unsigned long tmpl;
 	struct desc_ptr dt;
-	unsigned long cr0, cr4;
+	unsigned long cr0, cr3, cr4;
 
 	cr0 = read_cr0();
 	WARN_ON(cr0 & X86_CR0_TS);
 	vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
-	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
+
+	/*
+	 * Save the most likely value for this task's CR3 in the VMCS.
+	 * We can't use __get_current_cr3_fast() because we're not atomic.
+	 */
+	cr3 = read_cr3();
+	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
+	vmx->host_state.vmcs_host_cr3 = cr3;
 
 	/* Save the most likely value for this task's CR4 in the VMCS. */
 	cr4 = cr4_read_shadow();
@@ -8843,7 +8852,7 @@  static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long debugctlmsr, cr4;
+	unsigned long debugctlmsr, cr3, cr4;
 
 	/* Don't enter VMX if guest state is invalid, let the exit handler
 	   start emulation until we arrive back to a valid state */
@@ -8865,6 +8874,12 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
 		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 
+	cr3 = __get_current_cr3_fast();
+	if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
+		vmcs_writel(HOST_CR3, cr3);
+		vmx->host_state.vmcs_host_cr3 = cr3;
+	}
+
 	cr4 = cr4_read_shadow();
 	if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
 		vmcs_writel(HOST_CR4, cr4);