Message ID | 1422413668-3509-7-git-send-email-kai.huang@linux.intel.com (mailing list archive) |
---|---
State | New, archived |
2015-01-28 10:54+0800, Kai Huang: > This patch adds PML support in VMX. A new module parameter 'enable_pml' is added (+module_param_named(pml, enable_pml, bool, S_IRUGO);) > to allow the user to enable/disable it manually. > > Signed-off-by: Kai Huang <kai.huang@linux.intel.com> > --- > diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h > index 587149b..a139977 100644 > --- a/arch/x86/kvm/trace.h > +++ b/arch/x86/kvm/trace.h > @@ -846,6 +846,24 @@ TRACE_EVENT(kvm_track_tsc, > +TRACE_EVENT(kvm_pml_full, > + TP_PROTO(unsigned int vcpu_id), > + TP_ARGS(vcpu_id), > + > + TP_STRUCT__entry( > + __field( unsigned int, vcpu_id ) > + ), > + > + TP_fast_assign( > + __entry->vcpu_id = vcpu_id; > + ), > + > + TP_printk("vcpu %d: PML full", __entry->vcpu_id) > +); > + > #endif /* CONFIG_X86_64 */ You have it protected by CONFIG_X86_64, but use it unconditionally. (Also, we can get all this information from the kvm_exit tracepoint; I'd just drop it.) > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index c987374..de5ce82 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -516,6 +519,10 @@ struct vcpu_vmx { > + /* Support for PML */ > +#define PML_ENTITY_NUM 512 (Readers of this struct likely aren't interested in this number, so it would be nicer to define it outside. I thought it was here to hint at the amount of allocated pages, but PML_ENTITY_NUM / 512 isn't obvious.) > + struct page *pml_pg; > @@ -4355,6 +4368,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) > a current VMCS12 > */ > exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; > + /* PML is enabled/disabled in creating/destroying vcpu */ > + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; What is the harm of enabling it here? (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES seems similar and does it.) > + > return exec_control; > } > @@ -6971,6 +7001,31 @@ static bool vmx_test_pir(struct kvm_vcpu *vcpu, int vector) > +static int handle_pml_full(struct kvm_vcpu *vcpu) > +{ > + unsigned long exit_qualification; > + > + trace_kvm_pml_full(vcpu->vcpu_id); > + > + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); > + > + /* > + * PML buffer FULL happened while executing iret from NMI, > + * "blocked by NMI" bit has to be set before next VM entry. > + */ > + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && > + cpu_has_virtual_nmis() && > + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) > + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, > + GUEST_INTR_STATE_NMI); The relevant part of the specification is pasted from 27.2.2 (Information for VM Exits Due to Vectored Events), which makes me think that this bit is mirrored to the IDT-vectoring information field ... Isn't vmx_recover_nmi_blocking() enough? (I see the same code in handle_ept_violation(), but wasn't that needed just because of a hardware error?) > + > + /* > + * PML buffer already flushed at beginning of VMEXIT. Nothing to do > + * here, and there's no userspace involvement needed for PML. > + */ > + return 1; > +} > @@ -7325,6 +7381,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) > +static int vmx_enable_pml(struct vcpu_vmx *vmx) > +{ > + struct page *pml_pg; > + u32 exec_control; > + > + pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); > + if (!pml_pg) > + return -ENOMEM; > + > + vmx->pml_pg = pml_pg; (It's safe to use vmx->pml_pg directly.)
> + > + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); > + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); > + > + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); > + exec_control |= SECONDARY_EXEC_ENABLE_PML; > + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); > + > + return 0; > +} > +static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) > +{ > + struct kvm *kvm = vmx->vcpu.kvm; > + u64 *pml_buf; > + u16 pml_idx; > + > + pml_idx = vmcs_read16(GUEST_PML_INDEX); > + > + /* Do nothing if PML buffer is empty */ > + if (pml_idx == (PML_ENTITY_NUM - 1)) > + return; > + > + /* PML index always points to next available PML buffer entity */ > + if (pml_idx >= PML_ENTITY_NUM) > + pml_idx = 0; > + else > + pml_idx++; If the pml_idx is >= PML_ENTITY_NUM and < 0xffff, the log is empty, so it would be better to use just 'pml_idx++' and unsigned modulo. (Could also 'assert(pml_idx < PML_ENTITY_NUM)' then.) > + > + pml_buf = page_address(vmx->pml_pg); > + for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { > + u64 gpa; > + > + gpa = pml_buf[pml_idx]; > + WARN_ON(gpa & (PAGE_SIZE - 1)); > + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); > + } > + > + /* reset PML index */ > + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); > +} > @@ -9492,6 +9655,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) > +static void vmx_slot_enable_log_dirty(struct kvm *kvm, > + struct kvm_memory_slot *slot) > +{ > + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); (New slot contains dirty pages?) > + kvm_mmu_slot_largepage_remove_write_access(kvm, slot); > +} Thanks.
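For illustration, a minimal sketch of the "define it outside" suggestion above; the derived spelling is an editorial assumption, not code from the series:

	#define PML_ENTITY_NUM	(PAGE_SIZE / sizeof(u64))	/* 512 u64 GPA entries fill the one 4-KByte PML page */

struct vcpu_vmx would then keep only the 'struct page *pml_pg' member, and the relation between the entity count and the single allocated page is obvious from the define itself.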
On 03/02/2015 16:18, Radim Krčmář wrote: > (I see the same code in handle_ept_violation(), but wasn't that needed > just because of a hardware error?) That was how I read it initially, but actually that means: "this statement could be broken if the processor has that erratum". >> +static void vmx_slot_enable_log_dirty(struct kvm *kvm, >> + struct kvm_memory_slot *slot) >> +{ >> + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); > > (New slot contains dirty pages?) New slots contain clean pages as far as the KVM dirty log is concerned. In the case of PML, note that D=1 does not mean the page is dirty. It only means that writes will not be logged by PML. The page may thus also have logging disabled. Paolo
2015-02-03 16:39+0100, Paolo Bonzini: > > > On 03/02/2015 16:18, Radim Krčmář wrote: > > (I see the same code in handle_ept_violation(), but wasn't that needed > > just because of a hardware error?) > > That was how I read it initially, but actually that means: "this > statement could be broken if the processor has that erratum". Thanks, that was a nice ruse for the original bug :) > >> +static void vmx_slot_enable_log_dirty(struct kvm *kvm, > >> + struct kvm_memory_slot *slot) > >> +{ > >> + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); > > > > (New slot contains dirty pages?) > > New slots contain clean pages as far as the KVM dirty log is concerned. > > In the case of PML, note that D=1 does not mean the page is dirty. It > only means that writes will not be logged by PML. The page may thus > also have logging disabled. Yeah, it would be a problem if we had dirty pages at the beginning, but I don't think it is possible; I was too lazy to check. (It's not important and I wanted to do this review today :)
On 02/03/2015 11:18 PM, Radim Krčmář wrote: > 2015-01-28 10:54+0800, Kai Huang: >> This patch adds PML support in VMX. A new module parameter 'enable_pml' is added > (+module_param_named(pml, enable_pml, bool, S_IRUGO);) > >> to allow the user to enable/disable it manually. >> >> Signed-off-by: Kai Huang <kai.huang@linux.intel.com> >> --- >> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h >> index 587149b..a139977 100644 >> --- a/arch/x86/kvm/trace.h >> +++ b/arch/x86/kvm/trace.h >> @@ -846,6 +846,24 @@ TRACE_EVENT(kvm_track_tsc, >> +TRACE_EVENT(kvm_pml_full, >> + TP_PROTO(unsigned int vcpu_id), >> + TP_ARGS(vcpu_id), >> + >> + TP_STRUCT__entry( >> + __field( unsigned int, vcpu_id ) >> + ), >> + >> + TP_fast_assign( >> + __entry->vcpu_id = vcpu_id; >> + ), >> + >> + TP_printk("vcpu %d: PML full", __entry->vcpu_id) >> +); >> + >> #endif /* CONFIG_X86_64 */ > You have it protected by CONFIG_X86_64, but use it unconditionally. Thanks for catching. This has been fixed by another patch, and the fix has also been merged by Paolo. Thanks, -Kai > > (Also, we can get all this information from the kvm_exit tracepoint; > I'd just drop it.) > >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> index c987374..de5ce82 100644 >> --- a/arch/x86/kvm/vmx.c >> +++ b/arch/x86/kvm/vmx.c >> @@ -516,6 +519,10 @@ struct vcpu_vmx { >> + /* Support for PML */ >> +#define PML_ENTITY_NUM 512 > (Readers of this struct likely aren't interested in this number, so it > would be nicer to define it outside. I thought it was here to hint at the > amount of allocated pages, but PML_ENTITY_NUM / 512 isn't obvious.) > >> + struct page *pml_pg; >> @@ -4355,6 +4368,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) >> a current VMCS12 >> */ >> exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; >> + /* PML is enabled/disabled in creating/destroying vcpu */ >> + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; > What is the harm of enabling it here? > > (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES seems similar and does it.) Because the PML feature detection is unconditional (meaning SECONDARY_EXEC_ENABLE_PML is always in vmcs_config.cpu_based_2nd_exec_ctrl), but the PML buffer is only created when vcpu is created, and it is controlled by 'enable_pml' module parameter. If we always enable SECONDARY_EXEC_ENABLE_PML here, no PML buffer will be created if PML is disabled by 'enable_pml' parameter, so it's better to enable it along with creating PML buffer. > >> + >> return exec_control; >> } >> @@ -6971,6 +7001,31 @@ static bool vmx_test_pir(struct kvm_vcpu *vcpu, int vector) >> +static int handle_pml_full(struct kvm_vcpu *vcpu) >> +{ >> + unsigned long exit_qualification; >> + >> + trace_kvm_pml_full(vcpu->vcpu_id); >> + >> + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); >> + >> + /* >> + * PML buffer FULL happened while executing iret from NMI, >> + * "blocked by NMI" bit has to be set before next VM entry. >> + */ >> + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && >> + cpu_has_virtual_nmis() && >> + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) >> + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, >> + GUEST_INTR_STATE_NMI); > The relevant part of the specification is pasted from 27.2.2 (Information > for VM Exits Due to Vectored Events), which makes me think that this bit > is mirrored to the IDT-vectoring information field ... > > Isn't vmx_recover_nmi_blocking() enough? > > (I see the same code in handle_ept_violation(), but wasn't that needed > just because of a hardware error?)
This needs to be handled in both EPT violation and PML. If you look at the PML specification (the link is in the cover letter), you can see the definition of bit 12 of exit_qualification (section 1.5), which explains the above code. The same definition of exit_qualification is in the EPT violation part of Intel's SDM. >> + >> + /* >> + * PML buffer already flushed at beginning of VMEXIT. Nothing to do >> + * here, and there's no userspace involvement needed for PML. >> + */ >> + return 1; >> +} >> @@ -7325,6 +7381,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) >> +static int vmx_enable_pml(struct vcpu_vmx *vmx) >> +{ >> + struct page *pml_pg; >> + u32 exec_control; >> + >> + pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); >> + if (!pml_pg) >> + return -ENOMEM; >> + >> + vmx->pml_pg = pml_pg; > (It's safe to use vmx->pml_pg directly.) > >> + >> + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); >> + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); >> + >> + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); >> + exec_control |= SECONDARY_EXEC_ENABLE_PML; >> + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); >> + >> + return 0; >> +} >> +static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) >> +{ >> + struct kvm *kvm = vmx->vcpu.kvm; >> + u64 *pml_buf; >> + u16 pml_idx; >> + >> + pml_idx = vmcs_read16(GUEST_PML_INDEX); >> + >> + /* Do nothing if PML buffer is empty */ >> + if (pml_idx == (PML_ENTITY_NUM - 1)) >> + return; >> + >> + /* PML index always points to next available PML buffer entity */ >> + if (pml_idx >= PML_ENTITY_NUM) >> + pml_idx = 0; >> + else >> + pml_idx++; > If the pml_idx is >= PML_ENTITY_NUM and < 0xffff, the log is empty, In this case, the log is full, actually. PML index is set to PML_ENTITY_NUM - 1 initially, and hardware decreases it after a GPA is logged. Below is the pseudocode of PML hardware logic. IF (PML Index[15:9] ≠ 0) THEN VM exit; FI; set accessed and dirty flags for EPT; IF (a dirty flag was updated from 0 to 1) THEN PML address[PML index] ← 4-KByte-aligned guest-physical address; PML index is decremented; FI; Thanks, -Kai > so it would be better to use just 'pml_idx++' and unsigned modulo. > > (Could also 'assert(pml_idx < PML_ENTITY_NUM)' then.) > >> + >> + pml_buf = page_address(vmx->pml_pg); >> + for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { >> + u64 gpa; >> + >> + gpa = pml_buf[pml_idx]; >> + WARN_ON(gpa & (PAGE_SIZE - 1)); >> + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); >> + } >> + >> + /* reset PML index */ >> + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); >> +} >> @@ -9492,6 +9655,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) >> +static void vmx_slot_enable_log_dirty(struct kvm *kvm, >> + struct kvm_memory_slot *slot) >> +{ >> + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); > (New slot contains dirty pages?) > >> + kvm_mmu_slot_largepage_remove_write_access(kvm, slot); >> +} > Thanks.
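Rendered as C for clarity, the hardware behaviour Kai's pseudocode describes is roughly the following; an illustrative model with hypothetical helpers, not kernel code:

	u16 pml_index;			/* mirrors GUEST_PML_INDEX, initialized to 511 */
	u64 pml_page[PML_ENTITY_NUM];	/* the 4-KByte PML buffer page */

	/* on each EPT dirty-flag 0 -> 1 transition: */
	if (pml_index & ~0x1ffu)	/* PML index[15:9] != 0, i.e. index > 511 */
		vm_exit(EXIT_REASON_PML_FULL);
	pml_page[pml_index--] = gpa & ~0xfffull;	/* log the 4-KByte-aligned GPA */

	/* After the 512th entry is written at index 0, the u16 decrement
	 * underflows to 0xffff, which is the value a full buffer leaves in
	 * GUEST_PML_INDEX at the next PML-full exit. */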
2015-02-05 14:23+0800, Kai Huang: > On 02/03/2015 11:18 PM, Radim Krčmář wrote: > >You have it protected by CONFIG_X86_64, but use it unconditionally. > Thanks for catching. This has been fixed by another patch, and the fix has > also been merged by Paolo. (And I haven't noticed the followup either ...) I don't know of any present bugs in this patch and all questions have been cleared, Thanks. > >>+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML; > >What is the harm of enabling it here? > > > >(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES seems similar and does it.) > Because the PML feature detection is unconditional (meaning > SECONDARY_EXEC_ENABLE_PML is always in vmcs_config.cpu_based_2nd_exec_ctrl), > but the PML buffer is only created when vcpu is created, and it is > controlled by 'enable_pml' module parameter. If we always enable > SECONDARY_EXEC_ENABLE_PML here, no PML buffer will be created if PML is > disabled by 'enable_pml' parameter, I meant if (!enable_pml) exec_control &= ~SECONDARY_EXEC_ENABLE_PML here and no exec_control operations in vmx_create_vcpu(). > so it's better to enable it along with > creating PML buffer. I think the reason why KVM split the setup into vmx_create_vcpu() and vmx_secondary_exec_control() is to simplify nested virtualization. > >>+ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && > >>+ cpu_has_virtual_nmis() && > >>+ (exit_qualification & INTR_INFO_UNBLOCK_NMI)) > >>+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, > >>+ GUEST_INTR_STATE_NMI); > >The relevant part of the specification is pasted from 27.2.2 (Information > >for VM Exits Due to Vectored Events), which makes me think that this bit > >is mirrored to the IDT-vectoring information field ... > > > >Isn't vmx_recover_nmi_blocking() enough? > > > >(I see the same code in handle_ept_violation(), but wasn't that needed > > just because of a hardware error?) > This needs to be handled in both EPT violation and PML. If you look at the > PML specification (the link is in the cover letter), you can see the definition > of bit 12 of exit_qualification (section 1.5), which explains the above code. > The same definition of exit_qualification is in the EPT violation part of > Intel's SDM. True ... IDT-vectoring doesn't set bit 12 because PML exit isn't a fault and 27.2.2: "— For all other relevant VM exits, bit 12 is cleared to 0." (This was humorously pasted to PML as well.) > >>+ /* PML index always points to next available PML buffer entity */ > >>+ if (pml_idx >= PML_ENTITY_NUM) > >>+ pml_idx = 0; > >>+ else > >>+ pml_idx++; > >If the pml_idx is >= PML_ENTITY_NUM and < 0xffff, the log is empty, > In this case, the log is full, actually. PML index is set to PML_ENTITY_NUM > - 1 initially, and hardware decreases it after a GPA is logged. > > Below is the pseudocode of PML hardware logic. > > IF (PML Index[15:9] ≠ 0) > THEN VM exit; > FI; pml_idx >= PML_ENTITY_NUM exits without modifying the buffer, > set accessed and dirty flags for EPT; > IF (a dirty flag was updated from 0 to 1) > THEN > PML address[PML index] ← 4-KByte-aligned guest-physical address; > PML index is decremented; 0xffff is the only value that specifies full buffer, the rest means that we incorrectly set up the initial index and VMX exited without changing the buffer => we shouldn't read it. (I should have said "untouched" instead of "empty".) > FI;
On 02/05/2015 11:04 PM, Radim Krčmář wrote: > 2015-02-05 14:23+0800, Kai Huang: >> On 02/03/2015 11:18 PM, Radim Krčmář wrote: >>> You have it protected by CONFIG_X86_64, but use it unconditionally. >> Thanks for catching. This has been fixed by another patch, and the fix has >> also been merged by Paolo. > (And I haven't noticed the followup either ...) > > I don't know of any present bugs in this patch and all questions have > been cleared, > > Thanks. > >>>> + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; >>> What is the harm of enabling it here? >>> >>> (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES seems similar and does it.) >> Because the PML feature detection is unconditional (meaning >> SECONDARY_EXEC_ENABLE_PML is always in vmcs_config.cpu_based_2nd_exec_ctrl), >> but the PML buffer is only created when vcpu is created, and it is >> controlled by 'enable_pml' module parameter. If we always enable >> SECONDARY_EXEC_ENABLE_PML here, no PML buffer will be created if PML is >> disabled by 'enable_pml' parameter, > I meant > > if (!enable_pml) > exec_control &= ~SECONDARY_EXEC_ENABLE_PML > > here and no exec_control operations in vmx_create_vcpu(). This also works. I originally thought putting the enabling part and buffer allocation together was more straightforward. > >> so it's better to enable it along with >> creating PML buffer. > I think the reason why KVM split the setup into vmx_create_vcpu() and > vmx_secondary_exec_control() is to simplify nested virtualization. This is a good point and I'll think about it :) > >>>> + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && >>>> + cpu_has_virtual_nmis() && >>>> + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) >>>> + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, >>>> + GUEST_INTR_STATE_NMI); >>> The relevant part of the specification is pasted from 27.2.2 (Information >>> for VM Exits Due to Vectored Events), which makes me think that this bit >>> is mirrored to the IDT-vectoring information field ... >>> >>> Isn't vmx_recover_nmi_blocking() enough? >>> >>> (I see the same code in handle_ept_violation(), but wasn't that needed >>> just because of a hardware error?) >> This needs to be handled in both EPT violation and PML. If you look at the >> PML specification (the link is in the cover letter), you can see the definition >> of bit 12 of exit_qualification (section 1.5), which explains the above code. >> The same definition of exit_qualification is in the EPT violation part of >> Intel's SDM. > True ... IDT-vectoring doesn't set bit 12 because PML exit isn't a fault > and 27.2.2: "— For all other relevant VM exits, bit 12 is cleared to 0." > (This was humorously pasted to PML as well.) > >>>> + /* PML index always points to next available PML buffer entity */ >>>> + if (pml_idx >= PML_ENTITY_NUM) >>>> + pml_idx = 0; >>>> + else >>>> + pml_idx++; >>> If the pml_idx is >= PML_ENTITY_NUM and < 0xffff, the log is empty, >> In this case, the log is full, actually. PML index is set to PML_ENTITY_NUM >> - 1 initially, and hardware decreases it after a GPA is logged. >> >> Below is the pseudocode of PML hardware logic. >> >> IF (PML Index[15:9] ≠ 0) >> THEN VM exit; >> FI; > pml_idx >= PML_ENTITY_NUM exits without modifying the buffer, Yes. > >> set accessed and dirty flags for EPT; >> IF (a dirty flag was updated from 0 to 1) >> THEN >> PML address[PML index] ← 
4-KByte-aligned guest-physical address; >> PML index is decremented; > 0xffff is the only value that specifies full buffer, the rest means > that we incorrectly set up the initial index and VMX exited without > changing the buffer => we shouldn't read it. > (I should have said "untouched" instead of "empty".) Yes theoretically, according to the pseudocode, 0xffff is the only possible value that specifies full buffer. Probably a better way is to check if pml_idx is in range [-1, 511] at the beginning and give a warning if it's not, or just ASSERT it. Then we can simply do a ++pml_idx (with pml_idx changed to a signed short type). But the current code should also work anyway. Thanks, -Kai > >> FI;
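A sketch of the simpler variant discussed here, relying on u16 wrap-around rather than a signed type; an illustration of the reviewers' idea, not code from the series:

	u16 pml_idx = vmcs_read16(GUEST_PML_INDEX);

	/* anything in [PML_ENTITY_NUM, 0xfffe] would mean the initial index
	 * was set up incorrectly and the buffer was never touched */
	WARN_ON(pml_idx >= PML_ENTITY_NUM && pml_idx != 0xffff);

	/* 0xffff (full) wraps to 0; 511 (untouched) becomes 512, which the
	 * existing 'for (; pml_idx < PML_ENTITY_NUM; pml_idx++)' loop then
	 * skips; a partially filled buffer starts at its most recent entry */
	pml_idx++;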
On 02/05/2015 11:04 PM, Radim Krčmář wrote: > 2015-02-05 14:23+0800, Kai Huang: >> On 02/03/2015 11:18 PM, Radim Krčmář wrote: >>> You have it protected by CONFIG_X86_64, but use it unconditionally. >> Thanks for catching. This has been fixed by another patch, and the fix has >> also been merged by Paolo. > (And I haven't noticed the followup either ...) > > I don't know of any present bugs in this patch and all questions have > been cleared, So far I see no other bug reported :) Thanks, -Kai > > Thanks. > >>>> + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; >>> What is the harm of enabling it here? >>> >>> (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES seems similar and does it.) >> Because the PML feature detection is unconditional (meaning >> SECONDARY_EXEC_ENABLE_PML is always in vmcs_config.cpu_based_2nd_exec_ctrl), >> but the PML buffer is only created when vcpu is created, and it is >> controlled by 'enable_pml' module parameter. If we always enable >> SECONDARY_EXEC_ENABLE_PML here, no PML buffer will be created if PML is >> disabled by 'enable_pml' parameter, > I meant > > if (!enable_pml) > exec_control &= ~SECONDARY_EXEC_ENABLE_PML > > here and no exec_control operations in vmx_create_vcpu(). > >> so it's better to enable it along with >> creating PML buffer. > I think the reason why KVM split the setup into vmx_create_vcpu() and > vmx_secondary_exec_control() is to simplify nested virtualization. > >>>> + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && >>>> + cpu_has_virtual_nmis() && >>>> + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) >>>> + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, >>>> + GUEST_INTR_STATE_NMI); >>> The relevant part of the specification is pasted from 27.2.2 (Information >>> for VM Exits Due to Vectored Events), which makes me think that this bit >>> is mirrored to the IDT-vectoring information field ... >>> >>> Isn't vmx_recover_nmi_blocking() enough? >>> >>> (I see the same code in handle_ept_violation(), but wasn't that needed >>> just because of a hardware error?) >> This needs to be handled in both EPT violation and PML. If you look at the >> PML specification (the link is in the cover letter), you can see the definition >> of bit 12 of exit_qualification (section 1.5), which explains the above code. >> The same definition of exit_qualification is in the EPT violation part of >> Intel's SDM. > True ... IDT-vectoring doesn't set bit 12 because PML exit isn't a fault > and 27.2.2: "— For all other relevant VM exits, bit 12 is cleared to 0." > (This was humorously pasted to PML as well.) > >>>> + /* PML index always points to next available PML buffer entity */ >>>> + if (pml_idx >= PML_ENTITY_NUM) >>>> + pml_idx = 0; >>>> + else >>>> + pml_idx++; >>> If the pml_idx is >= PML_ENTITY_NUM and < 0xffff, the log is empty, >> In this case, the log is full, actually. PML index is set to PML_ENTITY_NUM >> - 1 initially, and hardware decreases it after a GPA is logged. >> >> Below is the pseudocode of PML hardware logic. >> >> IF (PML Index[15:9] ≠ 0) >> THEN VM exit; >> FI; > pml_idx >= PML_ENTITY_NUM exits without modifying the buffer, > >> set accessed and dirty flags for EPT; >> IF (a dirty flag was updated from 0 to 1) >> THEN >> PML address[PML index] ← 4-KByte-aligned guest-physical address; >> PML index is decremented; > 0xffff is the only value that specifies full buffer, the rest means > that we incorrectly set up the initial index and VMX exited without > changing the buffer => we shouldn't read it. > (I should have said "untouched" instead of "empty".) 
> >> FI;
On 05/02/2015 07:23, Kai Huang wrote: >>> >>> + /* PML is enabled/disabled in creating/destroying vcpu */ >>> + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; >> What is the harm of enabling it here? >> >> (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES seems similar and does it.) > > Because the PML feature detection is unconditional (meaning > SECONDARY_EXEC_ENABLE_PML is always in > vmcs_config.cpu_based_2nd_exec_ctrl), but the PML buffer is only created > when vcpu is created, and it is controlled by 'enable_pml' module > parameter. If we always enable SECONDARY_EXEC_ENABLE_PML here, no PML > buffer will be created if PML is disabled by 'enable_pml' parameter, so > it's better to enable it along with creating PML buffer. I guess this is the most interesting comment from Radim. Paolo
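Concretely, the shape Radim suggests is a single check in vmx_secondary_exec_control(); a sketch of the suggestion in this thread, not necessarily what was merged:

	if (!enable_pml)
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

vmx_enable_pml() would then only have to allocate the buffer and program PML_ADDRESS and GUEST_PML_INDEX, with no SECONDARY_VM_EXEC_CONTROL read-modify-write in the vcpu creation path.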
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 45afaee..da772ed 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -69,6 +69,7 @@ #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 @@ -121,6 +122,7 @@ enum vmcs_field { GUEST_LDTR_SELECTOR = 0x0000080c, GUEST_TR_SELECTOR = 0x0000080e, GUEST_INTR_STATUS = 0x00000810, + GUEST_PML_INDEX = 0x00000812, HOST_ES_SELECTOR = 0x00000c00, HOST_CS_SELECTOR = 0x00000c02, HOST_SS_SELECTOR = 0x00000c04, @@ -140,6 +142,8 @@ enum vmcs_field { VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + PML_ADDRESS = 0x0000200e, + PML_ADDRESS_HIGH = 0x0000200f, TSC_OFFSET = 0x00002010, TSC_OFFSET_HIGH = 0x00002011, VIRTUAL_APIC_PAGE_ADDR = 0x00002012, diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index ff2b8e2..c5f1a1d 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -73,6 +73,7 @@ #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 #define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 587149b..a139977 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -846,6 +846,24 @@ TRACE_EVENT(kvm_track_tsc, __print_symbolic(__entry->host_clock, host_clocks)) ); +/* + * Tracepoint for PML full VMEXIT. + */ +TRACE_EVENT(kvm_pml_full, + TP_PROTO(unsigned int vcpu_id), + TP_ARGS(vcpu_id), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + ), + + TP_printk("vcpu %d: PML full", __entry->vcpu_id) +); + #endif /* CONFIG_X86_64 */ TRACE_EVENT(kvm_ple_window, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c987374..de5ce82 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -101,6 +101,9 @@ module_param(nested, bool, S_IRUGO); static u64 __read_mostly host_xss; +static bool __read_mostly enable_pml = 1; +module_param_named(pml, enable_pml, bool, S_IRUGO); + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -516,6 +519,10 @@ struct vcpu_vmx { /* Dynamic PLE window. 
*/ int ple_window; bool ple_window_dirty; + + /* Support for PML */ +#define PML_ENTITY_NUM 512 + struct page *pml_pg; }; enum segment_cache_field { @@ -1068,6 +1075,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void) SECONDARY_EXEC_SHADOW_VMCS; } +static inline bool cpu_has_vmx_pml(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -2924,7 +2936,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_PML; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -4355,6 +4368,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + /* PML is enabled/disabled in creating/destroying vcpu */ + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + return exec_control; } @@ -5942,6 +5958,20 @@ static __init int hardware_setup(void) update_ple_window_actual_max(); + /* + * Only enable PML when hardware supports PML feature, and both EPT + * and EPT A/D bit features are enabled -- PML depends on them to work. + */ + if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) + enable_pml = 0; + + if (!enable_pml) { + kvm_x86_ops->slot_enable_log_dirty = NULL; + kvm_x86_ops->slot_disable_log_dirty = NULL; + kvm_x86_ops->flush_log_dirty = NULL; + kvm_x86_ops->enable_log_dirty_pt_masked = NULL; + } + return alloc_kvm_area(); out7: @@ -6971,6 +7001,31 @@ static bool vmx_test_pir(struct kvm_vcpu *vcpu, int vector) return pi_test_pir(vector, &vmx->pi_desc); } +static int handle_pml_full(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + + trace_kvm_pml_full(vcpu->vcpu_id); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + /* + * PML buffer FULL happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + cpu_has_virtual_nmis() && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + /* + * PML buffer already flushed at beginning of VMEXIT. Nothing to do + * here, and there's no userspace involvement needed for PML. + */ + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs @@ -7019,6 +7074,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_invvpid, [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, + [EXIT_REASON_PML_FULL] = handle_pml_full, }; static const int kvm_vmx_max_exit_handlers = @@ -7325,6 +7381,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } +static int vmx_enable_pml(struct vcpu_vmx *vmx) +{ + struct page *pml_pg; + u32 exec_control; + + pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!pml_pg) + return -ENOMEM; + + vmx->pml_pg = pml_pg; + + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control |= SECONDARY_EXEC_ENABLE_PML; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + + return 0; +} + +static void vmx_disable_pml(struct vcpu_vmx *vmx) +{ + u32 exec_control; + + ASSERT(vmx->pml_pg); + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); +} + +static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) +{ + struct kvm *kvm = vmx->vcpu.kvm; + u64 *pml_buf; + u16 pml_idx; + + pml_idx = vmcs_read16(GUEST_PML_INDEX); + + /* Do nothing if PML buffer is empty */ + if (pml_idx == (PML_ENTITY_NUM - 1)) + return; + + /* PML index always points to next available PML buffer entity */ + if (pml_idx >= PML_ENTITY_NUM) + pml_idx = 0; + else + pml_idx++; + + pml_buf = page_address(vmx->pml_pg); + for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { + u64 gpa; + + gpa = pml_buf[pml_idx]; + WARN_ON(gpa & (PAGE_SIZE - 1)); + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + } + + /* reset PML index */ + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); +} + +/* + * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. + * Called before reporting dirty_bitmap to userspace. + */ +static void kvm_flush_pml_buffers(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + /* + * We only need to kick vcpu out of guest mode here, as PML buffer + * is flushed at beginning of all VMEXITs, and only vcpus running in + * guest mode can have unflushed GPAs in the PML buffer. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7335,6 +7474,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + /* + * Flush the PML buffer of logged GPAs; this keeps dirty_bitmap up to + * date. Another good thing is that in kvm_vm_ioctl_get_dirty_log, + * before querying dirty_bitmap, we only need to kick all vcpus out of + * guest mode, as a vcpu in root mode must have flushed its PML buffer + * already. 
+ */ + if (enable_pml) + vmx_flush_pml_buffer(vmx); + /* If guest state is invalid, start emulating */ if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); @@ -7981,6 +8130,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_pml) + vmx_disable_pml(vmx); free_vpid(vmx); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); @@ -8051,6 +8202,18 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; + /* + * If PML is turned on, failure to enable PML just results in failure + * of creating the vcpu, therefore we can simplify PML logic (by + * avoiding dealing with cases, such as enabling PML partially on vcpus + * for the guest, etc.). + */ + if (enable_pml) { + err = vmx_enable_pml(vmx); + if (err) + goto free_vmcs; + } + return &vmx->vcpu; free_vmcs: @@ -9492,6 +9655,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) shrink_ple_window(vcpu); } +static void vmx_slot_enable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); + kvm_mmu_slot_largepage_remove_write_access(kvm, slot); +} + +static void vmx_slot_disable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_set_dirty(kvm, slot); +} + +static void vmx_flush_log_dirty(struct kvm *kvm) +{ + kvm_flush_pml_buffers(kvm); +} + +static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *memslot, + gfn_t offset, unsigned long mask) +{ + kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -9601,6 +9789,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_nested_events = vmx_check_nested_events, .sched_in = vmx_sched_in, + + .slot_enable_log_dirty = vmx_slot_enable_log_dirty, + .slot_disable_log_dirty = vmx_slot_disable_log_dirty, + .flush_log_dirty = vmx_flush_log_dirty, + .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 442ee7d..1373e04 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7880,3 +7880,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
This patch adds PML support in VMX. A new module parameter 'enable_pml' is added to allow the user to enable/disable it manually. Signed-off-by: Kai Huang <kai.huang@linux.intel.com> --- arch/x86/include/asm/vmx.h | 4 + arch/x86/include/uapi/asm/vmx.h | 1 + arch/x86/kvm/trace.h | 18 ++++ arch/x86/kvm/vmx.c | 195 +++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 1 + 5 files changed, 218 insertions(+), 1 deletion(-)
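Since the parameter is registered with module_param_named(pml, enable_pml, bool, S_IRUGO), PML can be disabled at module load time; a usage sketch, assuming the usual kvm-intel module name:

	modprobe kvm-intel pml=0

The S_IRUGO permission then makes the setting readable under /sys/module/kvm_intel/parameters/pml.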