[v1,1/2] KVM: x86/xen: Reconcile fast and slow paths for runstate update

Message ID	20221127122210.248427-2-dwmw2@infradead.org (mailing list archive)
State	New, archived
Headers	show Return-Path: <kvm-owner@kernel.org> From: David Woodhouse <dwmw2@infradead.org> To: Sean Christopherson <seanjc@google.com>, Paolo Bonzini <pbonzini@redhat.com> Cc: Paul Durrant <paul@xen.org>, Michal Luczaj <mhal@rbox.co>, kvm@vger.kernel.org Subject: [PATCH v1 1/2] KVM: x86/xen: Reconcile fast and slow paths for runstate update Date: Sun, 27 Nov 2022 12:22:09 +0000 Message-Id: <20221127122210.248427-2-dwmw2@infradead.org> In-Reply-To: <20221127122210.248427-1-dwmw2@infradead.org> References: <20221127122210.248427-1-dwmw2@infradead.org> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sender: David Woodhouse <dwmw2@infradead.org> Precedence: bulk
Series	KVM: x86/xen: Runstate cleanups on top of kvm/queue \| expand [v1,0/2] KVM: x86/xen: Runstate cleanups on top of kvm/queue [v1,1/2] KVM: x86/xen: Reconcile fast and slow paths for runstate update [v1,2/2] KVM: x86/xen: Allow XEN_RUNSTATE_UPDATE flag behaviour to be configured

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index b246decb53a9..d1a506986a32 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -177,44 +177,78 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache; size_t user_len, user_len1, user_len2; struct vcpu_runstate_info rs; - int *rs_state = &rs.state; unsigned long flags; size_t times_ofs; - u8 *update_bit; + uint8_t *update_bit; + uint64_t *rs_times; + int *rs_state; /* * The only difference between 32-bit and 64-bit versions of the * runstate struct is the alignment of uint64_t in 32-bit, which * means that the 64-bit version has an additional 4 bytes of - * padding after the first field 'state'. + * padding after the first field 'state'. Let's be really really + * paranoid about that, and matching it with our internal data + * structures that we memcpy into it... */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0); BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); #ifdef CONFIG_X86_64 + /* + * The 64-bit structure has 4 bytes of padding before 'state_entry_time' + * so each subsequent field is shifted by 4, and it's 4 bytes longer. + */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != offsetof(struct compat_vcpu_runstate_info, time) + 4); + BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4); #endif + /* + * The state field is in the same place at the start of both structs, + * and is the same size (int) as vx->current_runstate. + */ + BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != + offsetof(struct compat_vcpu_runstate_info, state)); + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) != + sizeof(vx->current_runstate)); + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != + sizeof(vx->current_runstate)); + + /* + * The state_entry_time field is 64 bits in both versions, and the + * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86 + * is little-endian means that it's in the last *byte* of the word. + * That detail is important later. + */ + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != + sizeof(uint64_t)); + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != + sizeof(uint64_t)); + BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80); + + /* + * The time array is four 64-bit quantities in both versions, matching + * the vx->runstate_times and immediately following state_entry_time. + */ + BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != + offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t)); + BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) != + offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t)); + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != + sizeof_field(struct compat_vcpu_runstate_info, time)); + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != + sizeof(vx->runstate_times)); if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) { user_len = sizeof(struct vcpu_runstate_info); times_ofs = offsetof(struct vcpu_runstate_info, state_entry_time); -#ifdef CONFIG_X86_64 - /* - * Don't leak kernel memory through the padding in the 64-bit - * struct if we take the split-page path. - */ - memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time)); -#endif } else { user_len = sizeof(struct compat_vcpu_runstate_info); times_ofs = offsetof(struct compat_vcpu_runstate_info, state_entry_time); - /* Start of struct for compat mode (qv). */ - rs_state++; } /* @@ -251,156 +285,123 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) read_lock_irqsave(&gpc1->lock, flags); } - /* - * The common case is that it all fits on a page and we can - * just do it the simple way. - */ if (likely(!user_len2)) { /* - * We use 'int *user_state' to point to the state field, and - * 'u64 *user_times' for runstate_entry_time. So the actual - * array of time[] in each state starts at user_times[1]. + * Set up three pointers directly to the runstate_info + * struct in the guest (via the GPC). + * + * • @rs_state → state field + * • @rs_times → state_entry_time field. + * • @update_bit → last byte of state_entry_time, which + * contains the XEN_RUNSTATE_UPDATE bit. */ - int *user_state = gpc1->khva; - u64 *user_times = gpc1->khva + times_ofs; - + rs_state = gpc1->khva; + rs_times = gpc1->khva + times_ofs; + update_bit = ((void *)(&rs_times[1])) - 1; + } else { /* - * The XEN_RUNSTATE_UPDATE bit is the top bit of the state_entry_time - * field. We need to set it (and write-barrier) before writing to the - * the rest of the structure, and clear it last. Just as Xen does, we - * address the single *byte* in which it resides because it might be - * in a different cache line to the rest of the 64-bit word, due to - * the (lack of) alignment constraints. + * The guest's runstate_info is split across two pages and we + * need to hold and validate both GPCs simultaneously. We can + * declare a lock ordering GPC1 > GPC2 because nothing else + * takes them more than one at a time. */ - BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != - sizeof(uint64_t)); - BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != - sizeof(uint64_t)); - BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80); + read_lock(&gpc2->lock); - update_bit = ((u8 *)(&user_times[1])) - 1; - *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56; - smp_wmb(); + if (!kvm_gpc_check(v->kvm, gpc2, gpc2->gpa, user_len2)) { + read_unlock(&gpc2->lock); + read_unlock_irqrestore(&gpc1->lock, flags); - /* - * Next, write the new runstate. This is in the *same* place - * for 32-bit and 64-bit guests, asserted here for paranoia. - */ - BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != - offsetof(struct compat_vcpu_runstate_info, state)); - BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) != - sizeof(vx->current_runstate)); - BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != - sizeof(vx->current_runstate)); - *user_state = vx->current_runstate; + /* When invoked from kvm_sched_out() we cannot sleep */ + if (atomic) + return; - /* - * Then the actual runstate_entry_time (with the UPDATE bit - * still set). - */ - *user_times = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE; + /* + * Use kvm_gpc_activate() here because if the runstate + * area was configured in 32-bit mode and only extends + * to the second page now because the guest changed to + * 64-bit mode, the second GPC won't have been set up. + */ + if (kvm_gpc_activate(v->kvm, gpc2, NULL, KVM_HOST_USES_PFN, + gpc1->gpa + user_len1, user_len2)) + return; + + /* + * We dropped the lock on GPC1 so we have to go all the + * way back and revalidate that too. + */ + goto retry; + } /* - * Write the actual runstate times immediately after the - * runstate_entry_time. + * In this case, the runstate_info struct will be assembled on + * the kernel stack (compat or not as appropriate) and will + * be copied to GPC1/GPC2 with a dual memcpy. Set up the three + * rs pointers accordingly. */ - BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != - offsetof(struct vcpu_runstate_info, time) - sizeof(u64)); - BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) != - offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64)); - BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != - sizeof_field(struct compat_vcpu_runstate_info, time)); - BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != - sizeof(vx->runstate_times)); - memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); - - smp_wmb(); + rs_times = &rs.state_entry_time; /* - * Finally, clear the 'updating' bit. Don't use &= here because - * the compiler may not realise that update_bit and user_times - * point to the same place. That's a classic pointer-aliasing - * problem. + * The rs_state pointer points to the start of what we'll + * copy to the guest, which in the case of a compat guest + * is the 32-bit field that the compiler thinks is padding. */ - *update_bit = vx->runstate_entry_time >> 56; - smp_wmb(); - - goto done_1; - } - - /* - * The painful code path. It's split across two pages and we need to - * hold and validate both GPCs simultaneously. Thankfully we can get - * away with declaring a lock ordering GPC1 > GPC2 because nothing - * else takes them more than one at a time. - */ - read_lock(&gpc2->lock); - - if (!kvm_gpc_check(v->kvm, gpc2, gpc2->gpa, user_len2)) { - read_unlock(&gpc2->lock); - read_unlock_irqrestore(&gpc1->lock, flags); - - /* When invoked from kvm_sched_out() we cannot sleep */ - if (atomic) - return; + rs_state = ((void *)rs_times) - times_ofs; /* - * Use kvm_gpc_activate() here because if the runstate - * area was configured in 32-bit mode and only extends - * to the second page now because the guest changed to - * 64-bit mode, the second GPC won't have been set up. + * The update_bit is still directly in the guest memory, + * via one GPC or the other. */ - if (kvm_gpc_activate(v->kvm, gpc2, NULL, KVM_HOST_USES_PFN, - gpc1->gpa + user_len1, user_len2)) - return; + if (user_len1 >= times_ofs + sizeof(uint64_t)) + update_bit = gpc1->khva + times_ofs + + sizeof(uint64_t) - 1; + else + update_bit = gpc2->khva + times_ofs + + sizeof(uint64_t) - 1 - user_len1; +#ifdef CONFIG_X86_64 /* - * We dropped the lock on GPC1 so we have to go all the way - * back and revalidate that too. + * Don't leak kernel memory through the padding in the 64-bit + * version of the struct. */ - goto retry; + memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time)); +#endif } /* - * Work out where the byte containing the XEN_RUNSTATE_UPDATE bit is. + * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the + * state_entry_time field, directly in the guest. We need to set + * that (and write-barrier) before writing to the rest of the + * structure, and clear it last. Just as Xen does, we address the + * single *byte* in which it resides because it might be in a + * different cache line to the rest of the 64-bit word, due to + * the (lack of) alignment constraints. */ - if (user_len1 >= times_ofs + sizeof(uint64_t)) - update_bit = ((u8 *)gpc1->khva) + times_ofs + sizeof(u64) - 1; - else - update_bit = ((u8 *)gpc2->khva) + times_ofs + sizeof(u64) - 1 - - user_len1; + *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56; + smp_wmb(); /* - * Create a structure on our stack with everything in the right place. - * The rs_state pointer points to the start of it, which in the case - * of a compat guest on a 64-bit host is the 32 bit field that the - * compiler thinks is padding. + * Now assemble the actual structure, either on our kernel stack + * or directly in the guest according to how the rs_state and + * rs_times pointers were set up above. */ *rs_state = vx->current_runstate; - rs.state_entry_time = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE; - memcpy(rs.time, vx->runstate_times, sizeof(vx->runstate_times)); - - *update_bit = rs.state_entry_time >> 56; - smp_wmb(); + rs_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE; + memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); - /* - * Having constructed the structure starting at *rs_state, copy it - * into the first and second pages as appropriate using user_len1 - * and user_len2. - */ - memcpy(gpc1->khva, rs_state, user_len1); - memcpy(gpc2->khva, ((u8 *)rs_state) + user_len1, user_len2); + /* For the split case, we have to then copy it to the guest. */ + if (user_len2) { + memcpy(gpc1->khva, rs_state, user_len1); + memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2); + } smp_wmb(); - /* - * Finally, clear the XEN_RUNSTATE_UPDATE bit. - */ + /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */ *update_bit = vx->runstate_entry_time >> 56; smp_wmb(); if (user_len2) read_unlock(&gpc2->lock); - done_1: + read_unlock_irqrestore(&gpc1->lock, flags); mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);

[v1,1/2] KVM: x86/xen: Reconcile fast and slow paths for runstate update

Commit Message

Comments

Patch