@@ -294,8 +294,14 @@ data:
steal:
the amount of time in which this vCPU did not run, in
- nanoseconds. Time during which the vcpu is idle, will not be
- reported as steal time.
+ nanoseconds. This includes the time during which the host is
+ suspended. Time during which the vcpu is idle might not be
+ reported as steal time. Time during which the host suspends
+ during a VM migration might not be accounted if vCPUs aren't
+ entered post-resume, because KVM does not currently support
+ suspend/resuming the associated metadata. A workaround would
+ be for the VMM to ensure that the guest is entered with
+ KVM_RUN after resuming from suspend.
preempted:
indicate the vCPU who owns this struct is running or
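As a rough sketch of the VMM-side workaround mentioned above (not something
this patch adds), the vCPU thread only needs to reach KVM_RUN once after the
host resumes so that the accumulated suspend time is reported to the guest on
the next entry. The pause gate and exit handler below are hypothetical VMM
helpers, assumed to exist elsewhere in the VMM:

    #include <errno.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Hypothetical VMM hooks. */
    void vmm_wait_while_paused(void);   /* must return once the host has resumed */
    void vmm_handle_exit(int vcpu_fd);  /* dispatches on the mmap'ed struct kvm_run */

    static void vcpu_thread(int vcpu_fd)
    {
            for (;;) {
                    vmm_wait_while_paused();

                    /*
                     * Entering the guest here lets KVM fold any pending
                     * suspend time into this vCPU's steal time.
                     */
                    if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
                            if (errno == EINTR)
                                    continue;   /* kicked by a signal, re-enter */
                            break;              /* real error, let the VMM handle it */
                    }

                    vmm_handle_exit(vcpu_fd);
            }
    }

Any equivalent mechanism works; the only requirement is that paused vCPU
threads issue KVM_RUN again after resume instead of staying parked until the
migration completes.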
@@ -124,6 +124,7 @@
#define KVM_REQ_HV_TLB_FLUSH \
KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34)
+#define KVM_REQ_WAIT_FOR_RESUME KVM_ARCH_REQ(35)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -916,8 +917,13 @@ struct kvm_vcpu_arch {
struct {
u8 preempted;
+ bool host_suspended;
u64 msr_val;
u64 last_steal;
+ u64 last_suspend;
+ u64 suspend_ns;
+ u64 last_suspend_ns;
+ wait_queue_head_t resume_waitq;
struct gfn_to_hva_cache cache;
} st;
@@ -3717,6 +3717,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ steal += vcpu->arch.st.suspend_ns - vcpu->arch.st.last_suspend_ns;
+ vcpu->arch.st.last_suspend_ns = vcpu->arch.st.suspend_ns;
unsafe_put_user(steal, &st->steal, out);
version += 1;
@@ -6930,6 +6932,19 @@ long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
}
#endif
+static void wait_for_resume(struct kvm_vcpu *vcpu)
+{
+ wait_event_interruptible(vcpu->arch.st.resume_waitq,
+ vcpu->arch.st.host_suspended == 0);
+
+ /*
+ * If we blocked here before tasks were frozen, the freezer wakes us
+ * with a signal; re-arm the request so we wait again on re-entry.
+ */
+ if (vcpu->arch.st.host_suspended)
+ kvm_make_request(KVM_REQ_WAIT_FOR_RESUME, vcpu);
+}
+
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
static int kvm_arch_suspend_notifier(struct kvm *kvm)
{
@@ -6939,6 +6954,19 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
mutex_lock(&kvm->lock);
kvm_for_each_vcpu(i, vcpu, kvm) {
+ vcpu->arch.st.last_suspend = ktime_get_boottime_ns();
+ /*
+ * Tasks get thawed before the resume notifier has been called
+ * so we need to block vCPUs until the resume notifier has run.
+ * Otherwise, suspend steal time might get applied too late,
+ * and get accounted to the wrong guest task.
+ * This also ensures that the guest paused bit set below
+ * doesn't get checked and cleared before the host actually
+ * suspends.
+ */
+ vcpu->arch.st.host_suspended = 1;
+ kvm_make_request(KVM_REQ_WAIT_FOR_RESUME, vcpu);
+
if (!vcpu->arch.pv_time.active)
continue;
@@ -6954,12 +6982,32 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
return ret ? NOTIFY_BAD : NOTIFY_DONE;
}
+static int kvm_arch_resume_notifier(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ mutex_lock(&kvm->lock);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ vcpu->arch.st.host_suspended = 0;
+ vcpu->arch.st.suspend_ns += ktime_get_boottime_ns() -
+ vcpu->arch.st.last_suspend;
+ wake_up_interruptible(&vcpu->arch.st.resume_waitq);
+ }
+ mutex_unlock(&kvm->lock);
+
+ return NOTIFY_DONE;
+}
+
int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
{
switch (state) {
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
return kvm_arch_suspend_notifier(kvm);
+ case PM_POST_HIBERNATION:
+ case PM_POST_SUSPEND:
+ return kvm_arch_resume_notifier(kvm);
}
return NOTIFY_DONE;
@@ -10813,6 +10861,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 1;
goto out;
}
+ if (kvm_check_request(KVM_REQ_WAIT_FOR_RESUME, vcpu))
+ wait_for_resume(vcpu);
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
record_steal_time(vcpu);
if (kvm_check_request(KVM_REQ_PMU, vcpu))
@@ -12341,6 +12391,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (r)
goto free_guest_fpu;
+ init_waitqueue_head(&vcpu->arch.st.resume_waitq);
kvm_xen_init_vcpu(vcpu);
vcpu_load(vcpu);
kvm_vcpu_after_set_cpuid(vcpu);
When the host resumes from a suspend, the guest thinks any task that was
running during the suspend ran for a long time, even though the effective
run time was much shorter, which can have negative effects on scheduling.

To mitigate this issue, the time that the host was suspended is included in
steal time, which lets the guest subtract the duration from the tasks'
runtime.

In order to implement this behavior, once the suspend notifier fires, vCPUs
trying to run block until the resume notifier finishes. This is because the
freezing of userspace tasks happens between these two points, which means
that vCPUs could otherwise run and get their suspend steal time misaccounted,
particularly if a vCPU were to run after resume but before the resume
notifier has run. Incidentally, doing this also addresses a potential race
with the suspend notifier setting PVCLOCK_GUEST_STOPPED, which could
otherwise get cleared before the suspend actually happened.

One potential caveat is that in the case of a suspend happening during a VM
migration, the suspend time might not be accounted. A workaround would be for
the VMM to ensure that the guest is entered with KVM_RUN after resuming from
suspend.

Signed-off-by: Suleiman Souhlal <suleiman@google.com>
---
 Documentation/virt/kvm/x86/msr.rst | 10 ++++--
 arch/x86/include/asm/kvm_host.h    |  6 ++++
 arch/x86/kvm/x86.c                 | 51 ++++++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 2 deletions(-)
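For context, the guest consumes the accumulated value through the existing
MSR_KVM_STEAL_TIME record; no guest change is needed for the suspend duration
to be subtracted from task runtime. The sketch below paraphrases the guest's
existing seqcount-style reader (kvm_steal_clock() in arch/x86/kernel/kvm.c);
it is not part of this patch, and the helper name is made up:

    #include <linux/types.h>
    #include <asm/barrier.h>        /* virt_rmb() */
    #include <asm/kvm_para.h>       /* struct kvm_steal_time */

    /*
     * Read a stable snapshot of the per-vCPU steal counter. The host makes
     * ->version odd while it updates the record (see record_steal_time()),
     * so retry until an even, unchanged version is observed.
     */
    static u64 guest_read_steal_ns(struct kvm_steal_time *st)
    {
            u32 version;
            u64 steal;

            do {
                    version = st->version;
                    virt_rmb();             /* read version before steal */
                    steal = st->steal;
                    virt_rmb();             /* read steal before re-checking version */
            } while ((version & 1) || version != st->version);

            return steal;
    }

Because the host simply folds the suspend duration into ->steal, this reader
picks it up the same way it picks up run-delay based steal time.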