Message ID | 20201214083905.2017260-18-dwmw2@infradead.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: Add minimal support for Xen HVM guests | expand |
On 12/14/20 8:39 AM, David Woodhouse wrote: > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index df44d9e50adc..e627139cf8cd 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -8896,7 +8896,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > kvm_x86_ops.msr_filter_changed(vcpu); > } > > - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { > + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || > + kvm_xen_has_interrupt(vcpu)) { > ++vcpu->stat.req_event; > kvm_apic_accept_events(vcpu); > if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c > index 17cbb4462b7e..4bc9da9fcfb8 100644 > --- a/arch/x86/kvm/xen.c > +++ b/arch/x86/kvm/xen.c > @@ -176,6 +176,45 @@ void kvm_xen_setup_runstate_page(struct kvm_vcpu *v) > kvm_xen_update_runstate(v, RUNSTATE_running, steal_time); > } > > +int kvm_xen_has_interrupt(struct kvm_vcpu *v) > +{ > + u8 rc = 0; > + > + /* > + * If the global upcall vector (HVMIRQ_callback_vector) is set and > + * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending. > + */ > + if (v->arch.xen.vcpu_info_set && v->kvm->arch.xen.upcall_vector) { > + struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache; > + struct kvm_memslots *slots = kvm_memslots(v->kvm); > + unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending); > + To have less nesting, wouldn't it be better to invert the logic? say: u8 rc = 0; struct gfn_to_hva_cache *ghc; struct kvm_memslots *slots; unsigned int offset; if (!v->arch.xen.vcpu_info_set || !v->kvm->arch.xen.upcall_vector) return 0; BUILD_BUG_ON(...) ghc = &v->arch.xen.vcpu_info_cache; slots = kvm_memslots(v->kvm); offset = offsetof(struct vcpu_info, evtchn_upcall_pending); But I think there's a flaw here. That is handling the case where you don't have a vcpu_info registered, and only shared info. The vcpu_info is then placed elsewhere, i.e. 
another offset out of shared_info -- which is *I think* the case for PVHVM Windows guests. Perhaps introducing a helper which adds xen_vcpu_info() and returns you the hva (picking the right cache) similar to the RFC patch. Albeit that was with page pinning, but borrowing an older version I had with hva_to_gfn_cache incantation would probably look like: if (v->arch.xen.vcpu_info_set) { ghc = &v->arch.xen.vcpu_info_cache; } else { ghc = &v->kvm->arch.xen.shinfo_cache; offset += offsetof(struct shared_info, vcpu_info); offset += (v - kvm_get_vcpu_by_id(0)) * sizeof(struct vcpu_info); } if (likely(slots->generation == ghc->generation && !kvm_is_error_hva(ghc->hva) && ghc->memslot)) { /* Fast path */ __get_user(rc, (u8 __user *)ghc->hva + offset); } else { /* Slow path */ kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, sizeof(rc)); } ? Joao
> -----Original Message----- > From: Joao Martins <joao.m.martins@oracle.com> > Sent: 14 December 2020 13:20 > To: David Woodhouse <dwmw2@infradead.org> > Cc: Paolo Bonzini <pbonzini@redhat.com>; Ankur Arora <ankur.a.arora@oracle.com>; Boris Ostrovsky > <boris.ostrovsky@oracle.com>; Sean Christopherson <seanjc@google.com>; Graf (AWS), Alexander > <graf@amazon.de>; Aslanidis (AWS), Ioannis <iaslan@amazon.de>; Durrant, Paul <pdurrant@amazon.co.uk>; > Agache, Alexandru <aagch@amazon.com>; Florescu, Andreea <fandree@amazon.com>; kvm@vger.kernel.org > Subject: RE: [EXTERNAL] [PATCH v3 17/17] KVM: x86/xen: Add event channel interrupt vector upcall > > CAUTION: This email originated from outside of the organization. Do not click links or open > attachments unless you can confirm the sender and know the content is safe. > > > > On 12/14/20 8:39 AM, David Woodhouse wrote: > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index df44d9e50adc..e627139cf8cd 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -8896,7 +8896,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > > kvm_x86_ops.msr_filter_changed(vcpu); > > } > > > > - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { > > + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || > > + kvm_xen_has_interrupt(vcpu)) { > > ++vcpu->stat.req_event; > > kvm_apic_accept_events(vcpu); > > if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > > diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c > > index 17cbb4462b7e..4bc9da9fcfb8 100644 > > --- a/arch/x86/kvm/xen.c > > +++ b/arch/x86/kvm/xen.c > > @@ -176,6 +176,45 @@ void kvm_xen_setup_runstate_page(struct kvm_vcpu *v) > > kvm_xen_update_runstate(v, RUNSTATE_running, steal_time); > > } > > > > +int kvm_xen_has_interrupt(struct kvm_vcpu *v) > > +{ > > + u8 rc = 0; > > + > > + /* > > + * If the global upcall vector (HVMIRQ_callback_vector) is set and > > + * the vCPU's evtchn_upcall_pending flag is set, the IRQ is 
pending. > > + */ > > + if (v->arch.xen.vcpu_info_set && v->kvm->arch.xen.upcall_vector) { > > + struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache; > > + struct kvm_memslots *slots = kvm_memslots(v->kvm); > > + unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending); > > + > > To have less nesting, wouldn't it be better to invert the logic? > > say: > > u8 rc = 0; > struct gfn_to_hva_cache *ghc > struct kvm_memslots *slots; > unsigned int offset; > > > if (!v->arch.xen.vcpu_info_set || !v->kvm->arch.xen.upcall_vector) > return 0; > > BUILD_BUG_ON(...) > > ghc = &v->arch.xen.vcpu_info_cache; > slots = kvm_memslots(v->kvm); > offset = offsetof(struct vcpu_info, evtchn_upcall_pending); > > But I think there's a flaw here. That is handling the case where you don't have a > vcpu_info registered, and only shared info. The vcpu_info is then placed elsewhere, i.e. > another offset out of shared_info -- which is *I think* the case for PVHVM Windows guests. Only been true for Windows for about a week ;-) Paul > > Perhaps introducing a helper which adds xen_vcpu_info() and returns you the hva (picking > the right cache) similar to the RFC patch. Albeit that was with page pinning, but > borrowing an older version I had with hva_to_gfn_cache incantation would probably look like: > > > if (v->arch.xen.vcpu_info_set) { > ghc = &v->arch.xen.vcpu_info_cache; > } else { > ghc = &v->arch.xen.vcpu_info_cache; > offset += offsetof(struct shared_info, vcpu_info); > offset += (v - kvm_get_vcpu_by_id(0)) * sizeof(struct vcpu_info); > } > > if (likely(slots->generation == ghc->generation && > !kvm_is_error_hva(ghc->hva) && ghc->memslot)) { > /* Fast path */ > __get_user(rc, (u8 __user *)ghc->hva + offset); > } else { > /* Slow path */ > kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, > sizeof(rc)); > } > > ? > > Joao
On Mon, 2020-12-14 at 13:19 +0000, Joao Martins wrote: > But I think there's a flaw here. That is handling the case where you don't have a > vcpu_info registered, and only shared info. The vcpu_info is then placed elsewhere, i.e. > another offset out of shared_info -- which is *I think* the case for PVHVM Windows guests. There is no such case any more. In my v3 patch set I *stopped* the kernel from attempting to use the vcpu_info embedded in the shinfo, and went to *requiring* that the VMM explicitly tell the kernel where it is. $ git diff xenpv-post-2..xenpv-post-3 -- Documentation diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index d98c2ff90880..d1c30105e6fd 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4834,7 +4834,7 @@ experience inconsistent filtering behavior on MSR accesses. __u64 gfn; } shared_info; struct { - __u32 vcpu; + __u32 vcpu_id; __u64 gpa; } vcpu_attr; __u64 pad[4]; @@ -4849,9 +4849,13 @@ KVM_XEN_ATTR_TYPE_LONG_MODE KVM_XEN_ATTR_TYPE_SHARED_INFO Sets the guest physical frame number at which the Xen "shared info" - page resides. It is the default location for the vcpu_info for the - first 32 vCPUs, and contains other VM-wide data structures shared - between the VM and the host. + page resides. Note that although Xen places vcpu_info for the first + 32 vCPUs in the shared_info page, KVM does not automatically do so + and requires that KVM_XEN_ATTR_TYPE_VCPU_INFO be used explicitly + even when the vcpu_info for a given vCPU resides at the "default" + location in the shared_info page. This is because KVM is not aware + of the Xen CPU id which is used as the index into the vcpu_info[] + array, so cannot know the correct default location. KVM_XEN_ATTR_TYPE_VCPU_INFO Sets the guest physical address of the vcpu_info for a given vCPU. > Perhaps introducing a helper which adds xen_vcpu_info() and returns you the hva (picking > the right cache) similar to the RFC patch. 
Albeit that was with page pinning, but > borrowing an older version I had with hva_to_gfn_cache incantation would probably look like: > > > if (v->arch.xen.vcpu_info_set) { > ghc = &v->arch.xen.vcpu_info_cache; > } else { > ghc = &v->arch.xen.vcpu_info_cache; > offset += offsetof(struct shared_info, vcpu_info); > offset += (v - kvm_get_vcpu_by_id(0)) * sizeof(struct vcpu_info); > } The problem is that we *can't* just use shared_info->vcpu_info[N] because we don't know what N is. This is what I spent half of last week chasing down, because whatever I tried, the original version just ended up delivering interrupts to the wrong CPUs. The N we want there is actually the *XEN* vcpu_id, which Xen puts into CPUID leaf 0x40000004.ebx and which is also the ACPI ID. (Those two had better match up since Linux guests use CPUID 0x40000004 for the BSP and ACPI for the APs). The kernel knows nothing of those numbers. In particular they do *not* match up to the indices in kvm->vcpus[M] (which is vcpu->vcpu_idx and means nothing except the chronological order in which each vCPU's userspace thread happened to create it during startup), and they do not match up to vcpu->vcpu_id which is the APIC (not ACPI) ID. The kernel doesn't know. Let userspace tell it, since we already needed that API for the separate vcpu_info case anyway.
On 12/14/20 2:57 PM, David Woodhouse wrote: > On Mon, 2020-12-14 at 13:19 +0000, Joao Martins wrote: >> But I think there's a flaw here. That is handling the case where you don't have a >> vcpu_info registered, and only shared info. The vcpu_info is then placed elsewhere, i.e. >> another offset out of shared_info -- which is *I think* the case for PVHVM Windows guests. > > There is no such case any more. In my v3 patch set I *stopped* the > kernel from attempting to use the vcpu_info embedded in the shinfo, and > went to *requiring* that the VMM explicitly tell the kernel where it > is. > Sigh yes, I forgot about that -- and you did mentioned it in earlier posts. > $ git diff xenpv-post-2..xenpv-post-3 -- Documentation > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst > index d98c2ff90880..d1c30105e6fd 100644 > --- a/Documentation/virt/kvm/api.rst > +++ b/Documentation/virt/kvm/api.rst > @@ -4834,7 +4834,7 @@ experience inconsistent filtering behavior on MSR accesses. > __u64 gfn; > } shared_info; > struct { > - __u32 vcpu; > + __u32 vcpu_id; > __u64 gpa; > } vcpu_attr; > __u64 pad[4]; > @@ -4849,9 +4849,13 @@ KVM_XEN_ATTR_TYPE_LONG_MODE > > KVM_XEN_ATTR_TYPE_SHARED_INFO > Sets the guest physical frame number at which the Xen "shared info" > - page resides. It is the default location for the vcpu_info for the > - first 32 vCPUs, and contains other VM-wide data structures shared > - between the VM and the host. > + page resides. Note that although Xen places vcpu_info for the first > + 32 vCPUs in the shared_info page, KVM does not automatically do so > + and requires that KVM_XEN_ATTR_TYPE_VCPU_INFO be used explicitly > + even when the vcpu_info for a given vCPU resides at the "default" > + location in the shared_info page. This is because KVM is not aware > + of the Xen CPU id which is used as the index into the vcpu_info[] > + array, so cannot know the correct default location. 
> /me nods > KVM_XEN_ATTR_TYPE_VCPU_INFO > Sets the guest physical address of the vcpu_info for a given vCPU. > >> Perhaps introducing a helper which adds xen_vcpu_info() and returns you the hva (picking >> the right cache) similar to the RFC patch. Albeit that was with page pinning, but >> borrowing an older version I had with hva_to_gfn_cache incantation would probably look like: >> >> >> if (v->arch.xen.vcpu_info_set) { >> ghc = &v->arch.xen.vcpu_info_cache; >> } else { >> ghc = &v->arch.xen.vcpu_info_cache; >> offset += offsetof(struct shared_info, vcpu_info); >> offset += (v - kvm_get_vcpu_by_id(0)) * sizeof(struct vcpu_info); >> } > > The problem is that we *can't* just use shared_info->vcpu_info[N] > because we don't know what N is. > > This is what I spent half of last week chasing down, because whatever I > tried, the original version just ended up delivering interrupts to the > wrong CPUs. > > The N we want there is actually the *XEN* vcpu_id, which Xen puts into > CPUID leaf 0x40000004.ebx and which is also the ACPI ID. (Those two had > better match up since Linux guests use CPUID 0x40000004 for the BSP and > ACPI for the APs). > > The kernel knows nothing of those numbers. In particular they do *not* > match up to the indices in kvm->vcpus[M] (which is vcpu->vcpu_idx and > means nothing except the chronological order in which each vCPU's > userspace thread happened to create it during startup), and they do not > match up to vcpu->vcpu_id which is the APIC (not ACPI) ID. > > The kernel doesn't know. Let userspace tell it, since we already needed > that API for the separate vcpu_info case anyway. > All this time, I was only considering the guest hypercalls in XENMEM_populate_physmap() for shared_info, and register vcpu_info as the VMM usage for setting the corresponding attributes. 
So you register the vcpu_info regardless of guest placement, and I didn't correlate that and your earlier comment (and also forgot about it) that you used to remove that complexity. Joao
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4b345a8945ea..68a66b98fd5f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -903,6 +903,7 @@ struct msr_bitmap_range { struct kvm_xen { bool long_mode; bool shinfo_set; + u8 upcall_vector; struct gfn_to_hva_cache shinfo_cache; }; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 814698e5b152..24668b51b5c8 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -14,6 +14,7 @@ #include "irq.h" #include "i8254.h" #include "x86.h" +#include "xen.h" /* * check if there are pending timer events @@ -56,6 +57,9 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v) if (!lapic_in_kernel(v)) return v->arch.interrupt.injected; + if (kvm_xen_has_interrupt(v)) + return 1; + if (!kvm_apic_accept_pic_intr(v)) return 0; @@ -110,6 +114,9 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v) if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; + if (kvm_xen_has_interrupt(v)) + return v->kvm->arch.xen.upcall_vector; + if (irqchip_split(v->kvm)) { int vector = v->arch.pending_external_vector; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index df44d9e50adc..e627139cf8cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8896,7 +8896,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops.msr_filter_changed(vcpu); } - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || + kvm_xen_has_interrupt(vcpu)) { ++vcpu->stat.req_event; kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 17cbb4462b7e..4bc9da9fcfb8 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -176,6 +176,45 @@ void kvm_xen_setup_runstate_page(struct kvm_vcpu *v) kvm_xen_update_runstate(v, RUNSTATE_running, steal_time); } +int kvm_xen_has_interrupt(struct kvm_vcpu *v) +{ + u8 rc = 0; + + /* + * If the 
global upcall vector (HVMIRQ_callback_vector) is set and + * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending. + */ + if (v->arch.xen.vcpu_info_set && v->kvm->arch.xen.upcall_vector) { + struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache; + struct kvm_memslots *slots = kvm_memslots(v->kvm); + unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending); + + /* No need for compat handling here */ + BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) != + offsetof(struct compat_vcpu_info, evtchn_upcall_pending)); + BUILD_BUG_ON(sizeof(rc) != + sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending)); + BUILD_BUG_ON(sizeof(rc) != + sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending)); + + /* + * For efficiency, this mirrors the checks for using the valid + * cache in kvm_read_guest_offset_cached(), but just uses + * __get_user() instead. And falls back to the slow path. + */ + if (likely(slots->generation == ghc->generation && + !kvm_is_error_hva(ghc->hva) && ghc->memslot)) { + /* Fast path */ + __get_user(rc, (u8 __user *)ghc->hva + offset); + } else { + /* Slow path */ + kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, + sizeof(rc)); + } + } + return rc; +} + int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { struct kvm_vcpu *v; @@ -245,6 +284,14 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) v->arch.xen.last_state_ns = ktime_get_ns(); break; + case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: + if (data->u.vector < 0x10) + return -EINVAL; + + kvm->arch.xen.upcall_vector = data->u.vector; + r = 0; + break; + default: break; } @@ -303,6 +350,11 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) } break; + case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: + data->u.vector = kvm->arch.xen.upcall_vector; + r = 0; + break; + default: break; } diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 407e717476d6..d64916ac4a12 100644 --- 
a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -11,6 +11,7 @@ void kvm_xen_setup_runstate_page(struct kvm_vcpu *vcpu); void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu); +int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu); int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_hypercall(struct kvm_vcpu *vcpu); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 749a7112df99..33609bc25021 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1587,6 +1587,7 @@ struct kvm_xen_hvm_attr { union { __u8 long_mode; + __u8 vector; struct { __u64 gfn; } shared_info; @@ -1604,6 +1605,7 @@ struct kvm_xen_hvm_attr { #define KVM_XEN_ATTR_TYPE_VCPU_INFO 0x2 #define KVM_XEN_ATTR_TYPE_VCPU_TIME_INFO 0x3 #define KVM_XEN_ATTR_TYPE_VCPU_RUNSTATE 0x4 +#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x5 /* Secure Encrypted Virtualization command */ enum sev_cmd_id {