@@ -4532,7 +4532,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
gfn_t base = gfn_round_for_level(fault->gfn,
fault->max_level);
- if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
+ if (kvm_mtrr_check_gfn_range_consistency(vcpu->kvm, base, page_num))
break;
}
}
@@ -105,7 +105,7 @@ static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
}
-static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
+static u8 mtrr_disabled_type(struct kvm *kvm)
{
/*
* Intel SDM 11.11.2.2: all MTRRs are disabled when
@@ -117,10 +117,7 @@ static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
* enable MTRRs and it is obviously undesirable to run the
* guest entirely with UC memory and we use WB.
*/
- if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
- return MTRR_TYPE_UNCACHABLE;
- else
- return MTRR_TYPE_WRBACK;
+ return kvm->arch.has_mtrr ? MTRR_TYPE_UNCACHABLE : MTRR_TYPE_WRBACK;
}
/*
@@ -310,6 +307,12 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
gfn_t start, end;
int index;
+ /* MTRRs are expected to be consistent across all processors in the
+ * system, so only update the TDP according to the MTRR settings of vCPU 0.
+ */
+ if (vcpu->vcpu_id)
+ return;
+
if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
!kvm_arch_has_noncoherent_dma(vcpu->kvm))
return;
@@ -635,10 +638,11 @@ static void mtrr_lookup_next(struct mtrr_iter *iter)
for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
-u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+u8 kvm_mtrr_get_guest_memory_type(struct kvm *kvm, gfn_t gfn)
{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct kvm_mtrr *mtrr_state;
struct mtrr_iter iter;
+ int srcu_idx;
u64 start, end;
int type = -1;
const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
@@ -647,6 +651,16 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
start = gfn_to_gpa(gfn);
end = start + PAGE_SIZE;
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ mtrr_state = srcu_dereference(kvm->arch.mtrr_state, &kvm->srcu);
+ /* kvm->arch.mtrr_state points to the mtrr_state of vCPU 0; it should
+ * not be NULL here unless vCPU 0 has been destroyed.
+ */
+ if (WARN_ON(!mtrr_state)) {
+ type = mtrr_disabled_type(kvm);
+ goto out;
+ }
+
mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
int curr_type = iter.mem_type;
@@ -694,12 +708,21 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
- return MTRR_TYPE_WRBACK;
+ /*
+ * Exit via 'out' so the SRCU read lock is dropped. NOTE(review): any
+ * other early return in this loop (outside this hunk) needs the same.
+ */
+ type = MTRR_TYPE_WRBACK;
+ goto out;
}
- if (iter.mtrr_disabled)
- return mtrr_disabled_type(vcpu);
+ if (iter.mtrr_disabled) {
+ type = mtrr_disabled_type(kvm);
+ goto out;
+ }
/* not contained in any MTRRs. */
- if (type == -1)
- return mtrr_default_type(mtrr_state);
+ if (type == -1) {
+ type = mtrr_default_type(mtrr_state);
+ goto out;
+ }
/*
* We just check one page, partially covered by MTRRs is
@@ -707,38 +725,64 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
*/
WARN_ON(iter.partial_map);
+out:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
return type;
}
EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
-bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm *kvm, gfn_t gfn,
int page_num)
{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct kvm_mtrr *mtrr_state;
struct mtrr_iter iter;
+ int srcu_idx;
u64 start, end;
int type = -1;
+ bool ret;
start = gfn_to_gpa(gfn);
end = gfn_to_gpa(gfn + page_num);
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ mtrr_state = srcu_dereference(kvm->arch.mtrr_state, &kvm->srcu);
+ /* kvm->arch.mtrr_state points to the mtrr_state of vCPU 0; it should
+ * not be NULL here unless vCPU 0 has been destroyed. Report "consistent".
+ */
+ if (WARN_ON(!mtrr_state)) {
+ ret = true;
+ goto out;
+ }
+
mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
if (type == -1) {
type = iter.mem_type;
continue;
}
- if (type != iter.mem_type)
- return false;
+ if (type != iter.mem_type) {
+ ret = false;
+ goto out;
+ }
}
- if (iter.mtrr_disabled)
- return true;
+ if (iter.mtrr_disabled) {
+ ret = true;
+ goto out;
+ }
- if (!iter.partial_map)
- return true;
+ if (!iter.partial_map) {
+ ret = true;
+ goto out;
+ }
- if (type == -1)
- return true;
+ if (type == -1) {
+ ret = true;
+ goto out;
+ }
- return type == mtrr_default_type(mtrr_state);
+ ret = (type == mtrr_default_type(mtrr_state));
+out:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ return ret;
}
@@ -7540,7 +7540,7 @@ static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
}
- return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
+ return kvm_mtrr_get_guest_memory_type(vcpu->kvm, gfn) << VMX_EPT_MT_EPTE_SHIFT;
}
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
@@ -310,11 +310,11 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
void kvm_mtrr_init(struct kvm_vcpu *vcpu);
void kvm_mtrr_destroy(struct kvm_vcpu *vcpu);
-u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
+u8 kvm_mtrr_get_guest_memory_type(struct kvm *kvm, gfn_t gfn);
bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm *kvm, gfn_t gfn,
int page_num);
bool kvm_vector_hashing_enabled(void);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
When the KVM MMU checks guest MTRRs, check the per-VM MTRR state, and only
zap EPT if the per-VM MTRR state (the MTRRs of vCPU 0) changes.

Before this patch, if there is noncoherent DMA, the EPT violation handler
references the guest MTRR state of the vCPU that caused the violation, and
EPT leaf entries are zapped whenever the MTRR settings of any vCPU change.
But as one EPT leaf entry can only have one memory type, this may still
cause problems if vCPUs have different MTRR state.

So, insane guests without consistent MTRR state across vCPUs will only
cause problems for themselves.

Therefore, this patch switches to using the per-VM MTRR state and only zaps
EPT when this per-VM MTRR state changes, which avoids several EPT zaps
during guest boot.

Reference data (average of 10 guest boots) is as below:

Physical CPU frequency: 3100 MHz

       | vCPU cnt | memory | EPT zap cnt | EPT zap cycles | bootup time
before |    8     |   2G   |     84      |    4164.57M    |   19.38s
after  |    8     |   2G   |     14      |      16.07M    |   18.83s
before |    8     |  16G   |     84      |    4163.38M    |   24.51s
after  |    8     |  16G   |     14      |      16.68M    |   23.94s

Legend:
before:         before this patch
after:          after this patch
vCPU cnt:       guest vCPU count of a VM
memory:         guest memory size
EPT zap cnt:    the count of EPT zaps caused by update_mtrr() during guest boot
EPT zap cycles: the CPU cycles of EPT zaps caused by update_mtrr() during
                guest boot
bootup time:    guest bootup time, measured from starting QEMU to guest
                rc.local

Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 arch/x86/kvm/mmu/mmu.c |  2 +-
 arch/x86/kvm/mtrr.c    | 88 +++++++++++++++++++++++++++++++-----------
 arch/x86/kvm/vmx/vmx.c |  2 +-
 arch/x86/kvm/x86.h     |  4 +-
 4 files changed, 70 insertions(+), 26 deletions(-)