Message ID | 20240710074410.770409-1-suleiman@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: x86: Include host suspended time in steal time. | expand |
Hello, On Wed, Jul 10, 2024 at 4:44 PM Suleiman Souhlal <suleiman@google.com> wrote: > > When the host resumes from a suspend, the guest thinks any task > that was running during the suspend ran for a long time, even though > the effective run time was much shorter, which can end up having > negative effects with scheduling. This can be particularly noticeable > if the guest task was RT, as it can end up getting throttled for a > long time. > > To mitigate this issue, we include the time that the host was > suspended in steal time, which lets the guest can subtract the > duration from the tasks' runtime. > > Signed-off-by: Suleiman Souhlal <suleiman@google.com> > --- > arch/x86/kvm/x86.c | 23 ++++++++++++++++++++++- > include/linux/kvm_host.h | 4 ++++ > 2 files changed, 26 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 0763a0f72a067f..94bbdeef843863 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -3669,7 +3669,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > struct kvm_steal_time __user *st; > struct kvm_memslots *slots; > gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; > - u64 steal; > + u64 steal, suspend_duration; > u32 version; > > if (kvm_xen_msr_enabled(vcpu->kvm)) { > @@ -3696,6 +3696,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > return; > } > > + suspend_duration = 0; > + if (READ_ONCE(vcpu->suspended)) { > + suspend_duration = vcpu->kvm->last_suspend_duration; > + vcpu->suspended = 0; > + } > + > st = (struct kvm_steal_time __user *)ghc->hva; > /* > * Doing a TLB flush here, on the guest's behalf, can avoid > @@ -3749,6 +3755,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > unsafe_get_user(steal, &st->steal, out); > steal += current->sched_info.run_delay - > vcpu->arch.st.last_steal; > + steal += suspend_duration; > vcpu->arch.st.last_steal = current->sched_info.run_delay; > unsafe_put_user(steal, &st->steal, out); > > @@ -6920,6 +6927,7 @@ 
static int kvm_arch_suspend_notifier(struct kvm *kvm) > > mutex_lock(&kvm->lock); > kvm_for_each_vcpu(i, vcpu, kvm) { > + WRITE_ONCE(vcpu->suspended, 1); > if (!vcpu->arch.pv_time.active) > continue; > > @@ -6932,15 +6940,28 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) > } > mutex_unlock(&kvm->lock); > > + kvm->suspended_time = ktime_get_boottime_ns(); > + > return ret ? NOTIFY_BAD : NOTIFY_DONE; > } > > +static int > +kvm_arch_resume_notifier(struct kvm *kvm) > +{ > + kvm->last_suspend_duration = ktime_get_boottime_ns() - > + kvm->suspended_time; > + return NOTIFY_DONE; > +} > + > int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) > { > switch (state) { > case PM_HIBERNATION_PREPARE: > case PM_SUSPEND_PREPARE: > return kvm_arch_suspend_notifier(kvm); > + case PM_POST_HIBERNATION: > + case PM_POST_SUSPEND: > + return kvm_arch_resume_notifier(kvm); > } > > return NOTIFY_DONE; > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 692c01e41a18ef..2d37af9a348648 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -366,6 +366,8 @@ struct kvm_vcpu { > } async_pf; > #endif > > + bool suspended; > + > #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT > /* > * Cpu relax intercept or pause loop exit optimization > @@ -840,6 +842,8 @@ struct kvm { > struct xarray mem_attr_array; > #endif > char stats_id[KVM_STATS_NAME_SIZE]; > + u64 last_suspend_duration; > + u64 suspended_time; > }; > > #define kvm_err(fmt, ...) \ > -- > 2.45.2.993.g49e7a77208-goog > Gentle ping. Thanks, -- Suleiman
Hi, On Wed, Jul 10, 2024 at 04:44:10PM +0900, Suleiman Souhlal wrote: >When the host resumes from a suspend, the guest thinks any task >that was running during the suspend ran for a long time, even though >the effective run time was much shorter, which can end up having >negative effects with scheduling. This can be particularly noticeable >if the guest task was RT, as it can end up getting throttled for a >long time. > >To mitigate this issue, we include the time that the host was >suspended in steal time, which lets the guest can subtract the >duration from the tasks' runtime. > >Signed-off-by: Suleiman Souhlal <suleiman@google.com> >--- > arch/x86/kvm/x86.c | 23 ++++++++++++++++++++++- > include/linux/kvm_host.h | 4 ++++ > 2 files changed, 26 insertions(+), 1 deletion(-) > >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >index 0763a0f72a067f..94bbdeef843863 100644 >--- a/arch/x86/kvm/x86.c >+++ b/arch/x86/kvm/x86.c >@@ -3669,7 +3669,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > struct kvm_steal_time __user *st; > struct kvm_memslots *slots; > gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; >- u64 steal; >+ u64 steal, suspend_duration; > u32 version; > > if (kvm_xen_msr_enabled(vcpu->kvm)) { >@@ -3696,6 +3696,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > return; > } > >+ suspend_duration = 0; >+ if (READ_ONCE(vcpu->suspended)) { >+ suspend_duration = vcpu->kvm->last_suspend_duration; >+ vcpu->suspended = 0; Can you explain why READ_ONCE() is necessary here, but WRITE_ONCE() isn't used for clearing vcpu->suspended? 
>+ } >+ > st = (struct kvm_steal_time __user *)ghc->hva; > /* > * Doing a TLB flush here, on the guest's behalf, can avoid >@@ -3749,6 +3755,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > unsafe_get_user(steal, &st->steal, out); > steal += current->sched_info.run_delay - > vcpu->arch.st.last_steal; >+ steal += suspend_duration; > vcpu->arch.st.last_steal = current->sched_info.run_delay; > unsafe_put_user(steal, &st->steal, out); > >@@ -6920,6 +6927,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) > > mutex_lock(&kvm->lock); > kvm_for_each_vcpu(i, vcpu, kvm) { >+ WRITE_ONCE(vcpu->suspended, 1); > if (!vcpu->arch.pv_time.active) > continue; > >@@ -6932,15 +6940,28 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) > } > mutex_unlock(&kvm->lock); > >+ kvm->suspended_time = ktime_get_boottime_ns(); >+ > return ret ? NOTIFY_BAD : NOTIFY_DONE; > } > >+static int >+kvm_arch_resume_notifier(struct kvm *kvm) >+{ >+ kvm->last_suspend_duration = ktime_get_boottime_ns() - >+ kvm->suspended_time; Is it possible that a vCPU doesn't get any chance to run (i.e., update steal time) between two suspends? In this case, only the second suspend would be recorded. Maybe we need an infrastructure in the PM subsystem to record accumulated suspended time. When updating steal time, KVM can add the additional suspended time since the last update into steal_time (as how KVM deals with current->sched_info.run_deley). This way, the scenario I mentioned above won't be a problem and KVM needn't calculate the suspend duration for each guest. And this approach can potentially benefit RISC-V and ARM as well, since they have the same logic as x86 regarding steal_time. Additionally, it seems that if a guest migrates to another system after a suspend and before updating steal time, the suspended time is lost during migration. I'm not sure if this is a practical issue.
On Tue, Jul 30, 2024 at 10:26:30AM +0800, Chao Gao wrote: > Hi, > > On Wed, Jul 10, 2024 at 04:44:10PM +0900, Suleiman Souhlal wrote: > >When the host resumes from a suspend, the guest thinks any task > >that was running during the suspend ran for a long time, even though > >the effective run time was much shorter, which can end up having > >negative effects with scheduling. This can be particularly noticeable > >if the guest task was RT, as it can end up getting throttled for a > >long time. > > > >To mitigate this issue, we include the time that the host was > >suspended in steal time, which lets the guest can subtract the > >duration from the tasks' runtime. > > > >Signed-off-by: Suleiman Souhlal <suleiman@google.com> > >--- > > arch/x86/kvm/x86.c | 23 ++++++++++++++++++++++- > > include/linux/kvm_host.h | 4 ++++ > > 2 files changed, 26 insertions(+), 1 deletion(-) > > > >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > >index 0763a0f72a067f..94bbdeef843863 100644 > >--- a/arch/x86/kvm/x86.c > >+++ b/arch/x86/kvm/x86.c > >@@ -3669,7 +3669,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > > struct kvm_steal_time __user *st; > > struct kvm_memslots *slots; > > gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; > >- u64 steal; > >+ u64 steal, suspend_duration; > > u32 version; > > > > if (kvm_xen_msr_enabled(vcpu->kvm)) { > >@@ -3696,6 +3696,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > > return; > > } > > > >+ suspend_duration = 0; > >+ if (READ_ONCE(vcpu->suspended)) { > >+ suspend_duration = vcpu->kvm->last_suspend_duration; > >+ vcpu->suspended = 0; > > Can you explain why READ_ONCE() is necessary here, but WRITE_ONCE() isn't used > for clearing vcpu->suspended? Because this part of the code is essentially single-threaded, it didn't seem like WRITE_ONCE() was needed. I can add it in an eventual future version of the patch if it makes things less confusing (if this code still exists). 
> > >+ } > >+ > > st = (struct kvm_steal_time __user *)ghc->hva; > > /* > > * Doing a TLB flush here, on the guest's behalf, can avoid > >@@ -3749,6 +3755,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > > unsafe_get_user(steal, &st->steal, out); > > steal += current->sched_info.run_delay - > > vcpu->arch.st.last_steal; > >+ steal += suspend_duration; > > vcpu->arch.st.last_steal = current->sched_info.run_delay; > > unsafe_put_user(steal, &st->steal, out); > > > >@@ -6920,6 +6927,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) > > > > mutex_lock(&kvm->lock); > > kvm_for_each_vcpu(i, vcpu, kvm) { > >+ WRITE_ONCE(vcpu->suspended, 1); > > if (!vcpu->arch.pv_time.active) > > continue; > > > >@@ -6932,15 +6940,28 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) > > } > > mutex_unlock(&kvm->lock); > > > >+ kvm->suspended_time = ktime_get_boottime_ns(); > >+ > > return ret ? NOTIFY_BAD : NOTIFY_DONE; > > } > > > >+static int > >+kvm_arch_resume_notifier(struct kvm *kvm) > >+{ > >+ kvm->last_suspend_duration = ktime_get_boottime_ns() - > >+ kvm->suspended_time; > > Is it possible that a vCPU doesn't get any chance to run (i.e., update steal > time) between two suspends? In this case, only the second suspend would be > recorded. Good point. I'll address this. > > Maybe we need an infrastructure in the PM subsystem to record accumulated > suspended time. When updating steal time, KVM can add the additional suspended > time since the last update into steal_time (as how KVM deals with > current->sched_info.run_deley). This way, the scenario I mentioned above won't > be a problem and KVM needn't calculate the suspend duration for each guest. And > this approach can potentially benefit RISC-V and ARM as well, since they have > the same logic as x86 regarding steal_time. Thanks for the suggestion. 
I'm a bit wary of making a whole PM subsystem addition for such a counter, but maybe I can make an architecture-independent KVM change for it, with a PM notifier in kvm_main.c. > > Additionally, it seems that if a guest migrates to another system after a suspend > and before updating steal time, the suspended time is lost during migration. I'm > not sure if this is a practical issue. The systems where the host suspends don't usually do VM migrations. Or at least the ones where we're encountering the problem this patch is trying to address don't (laptops). But even if they did, it doesn't seem that likely that the migration would happen over a host suspend. If it's ok with you, I'll put this issue aside for the time being. Thanks for the comments. -- Suleiman
>> >+static int >> >+kvm_arch_resume_notifier(struct kvm *kvm) >> >+{ >> >+ kvm->last_suspend_duration = ktime_get_boottime_ns() - >> >+ kvm->suspended_time; >> >> Is it possible that a vCPU doesn't get any chance to run (i.e., update steal >> time) between two suspends? In this case, only the second suspend would be >> recorded. > >Good point. I'll address this. > >> >> Maybe we need an infrastructure in the PM subsystem to record accumulated >> suspended time. When updating steal time, KVM can add the additional suspended >> time since the last update into steal_time (as how KVM deals with >> current->sched_info.run_deley). This way, the scenario I mentioned above won't >> be a problem and KVM needn't calculate the suspend duration for each guest. And >> this approach can potentially benefit RISC-V and ARM as well, since they have >> the same logic as x86 regarding steal_time. > >Thanks for the suggestion. >I'm a bit wary of making a whole PM subsystem addition for such a counter, but >maybe I can make a architecture-independent KVM change for it, with a PM >notifier in kvm_main.c. Sounds good. > >> >> Additionally, it seems that if a guest migrates to another system after a suspend >> and before updating steal time, the suspended time is lost during migration. I'm >> not sure if this is a practical issue. > >The systems where the host suspends don't usually do VM migrations. Or at least >the ones where we're encountering the problem this patch is trying to address >don't (laptops). >But even if they did, it doesn't seem that likely that the migration would >happen over a host suspend. >If it's ok with you, I'll put this issue aside for the time being. I am fine with putting this issue aside.
On Tue, Jul 30, 2024, Suleiman Souhlal wrote: > On Tue, Jul 30, 2024 at 10:26:30AM +0800, Chao Gao wrote: > > Hi, > > > > On Wed, Jul 10, 2024 at 04:44:10PM +0900, Suleiman Souhlal wrote: > > >When the host resumes from a suspend, the guest thinks any task > > >that was running during the suspend ran for a long time, even though > > >the effective run time was much shorter, which can end up having > > >negative effects with scheduling. This can be particularly noticeable > > >if the guest task was RT, as it can end up getting throttled for a > > >long time. > > > > > >To mitigate this issue, we include the time that the host was > > >suspended in steal time, which lets the guest can subtract the > > >duration from the tasks' runtime. > > > > > >Signed-off-by: Suleiman Souhlal <suleiman@google.com> > > >--- > > > arch/x86/kvm/x86.c | 23 ++++++++++++++++++++++- > > > include/linux/kvm_host.h | 4 ++++ > > > 2 files changed, 26 insertions(+), 1 deletion(-) > > > > > >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > >index 0763a0f72a067f..94bbdeef843863 100644 > > >--- a/arch/x86/kvm/x86.c > > >+++ b/arch/x86/kvm/x86.c > > >@@ -3669,7 +3669,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > > > struct kvm_steal_time __user *st; > > > struct kvm_memslots *slots; > > > gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; > > >- u64 steal; > > >+ u64 steal, suspend_duration; > > > u32 version; > > > > > > if (kvm_xen_msr_enabled(vcpu->kvm)) { > > >@@ -3696,6 +3696,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > > > return; > > > } > > > > > >+ suspend_duration = 0; > > >+ if (READ_ONCE(vcpu->suspended)) { > > >+ suspend_duration = vcpu->kvm->last_suspend_duration; > > >+ vcpu->suspended = 0; > > > > Can you explain why READ_ONCE() is necessary here, but WRITE_ONCE() isn't used > > for clearing vcpu->suspended? > > Because this part of the code is essentially single-threaded, it didn't seem > like WRITE_ONCE() was needed. 
I can add it in an eventual future version of > the patch if it makes things less confusing (if this code still exists). {READ,WRITE}_ONCE() don't provide ordering, they only ensure that the compiler emits a load/store. But (a) forcing the compiler to emit a load/store is only necessary when it's possible for the compiler to know/prove that the load/store won't affect functionality, and (b) this code doesn't have the correct ordering even if there were the appropriate smp_wmb() and smp_rmb() annotations. The writer needs to ensure the write to last_suspend_duration is visible before vcpu->suspended is set, and the reader needs to ensure it reads last_suspend_duration after vcpu->suspended. That said, it's likely a moot point, because I assume the PM notifier runs while tasks are frozen, i.e. its writes are guaranteed to be ordered before record_steal_time(). And _that_ said, I don't see any reason for two fields, nor do I see any reason to track the suspended duration in the VM instead of the vCPU. Similar to what Chao pointed out, per-VM tracking falls apart if only some vCPUs consume the suspend duration. I.e. just accumulate the suspend duration directly in the vCPU. 
> > >+ } > > >+ > > > st = (struct kvm_steal_time __user *)ghc->hva; > > > /* > > > * Doing a TLB flush here, on the guest's behalf, can avoid > > >@@ -3749,6 +3755,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > > > unsafe_get_user(steal, &st->steal, out); > > > steal += current->sched_info.run_delay - > > > vcpu->arch.st.last_steal; > > >+ steal += suspend_duration; > > > vcpu->arch.st.last_steal = current->sched_info.run_delay; > > > unsafe_put_user(steal, &st->steal, out); > > > > > >@@ -6920,6 +6927,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) > > > > > > mutex_lock(&kvm->lock); > > > kvm_for_each_vcpu(i, vcpu, kvm) { > > >+ WRITE_ONCE(vcpu->suspended, 1); > > > if (!vcpu->arch.pv_time.active) > > > continue; > > > > > >@@ -6932,15 +6940,28 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) > > > } > > > mutex_unlock(&kvm->lock); > > > > > >+ kvm->suspended_time = ktime_get_boottime_ns(); > > >+ > > > return ret ? NOTIFY_BAD : NOTIFY_DONE; > > > } > > > > > >+static int > > >+kvm_arch_resume_notifier(struct kvm *kvm) Do not wrap before the function name. Linus has a nice explanation/rant on this[*]. [*] https://lore.kernel.org/all/CAHk-=wjoLAYG446ZNHfg=GhjSY6nFmuB_wA8fYd5iLBNXjo9Bw@mail.gmail.com > > >+{ > > >+ kvm->last_suspend_duration = ktime_get_boottime_ns() - > > >+ kvm->suspended_time; > > > > Is it possible that a vCPU doesn't get any chance to run (i.e., update steal > > time) between two suspends? In this case, only the second suspend would be > > recorded. > > Good point. I'll address this. > > > > > Maybe we need an infrastructure in the PM subsystem to record accumulated > > suspended time. When updating steal time, KVM can add the additional suspended > > time since the last update into steal_time (as how KVM deals with > > current->sched_info.run_deley). This way, the scenario I mentioned above won't > > be a problem and KVM needn't calculate the suspend duration for each guest. 
And > > this approach can potentially benefit RISC-V and ARM as well, since they have > > the same logic as x86 regarding steal_time. > > Thanks for the suggestion. > I'm a bit wary of making a whole PM subsystem addition for such a counter, but > maybe I can make a architecture-independent KVM change for it, with a PM > notifier in kvm_main.c. Yes. Either that, or the fields need to be in kvm_vcpu_arch, not kvm_vcpu. > > Additionally, it seems that if a guest migrates to another system after a > > suspend and before updating steal time, the suspended time is lost during > > migration. I'm not sure if this is a practical issue. > > The systems where the host suspends don't usually do VM migrations. Or at > least the ones where we're encountering the problem this patch is trying to > address don't (laptops). > > But even if they did, it doesn't seem that likely that the migration would > happen over a host suspend. I think we want to account for this straightaway, or at least have defined and documented behavior, else we risk rehashing the issues with marking a vCPU as preempted when it's loaded, but not running. Which causes problems for live migration as it results in KVM marking the steal-time page as dirty after vCPUs have been paused. [*] https://lkml.kernel.org/r/20240503181734.1467938-4-dmatlack%40google.com
On Wed, Aug 14, 2024 at 2:06 AM Sean Christopherson <seanjc@google.com> wrote: > > On Tue, Jul 30, 2024, Suleiman Souhlal wrote: > > On Tue, Jul 30, 2024 at 10:26:30AM +0800, Chao Gao wrote: > > > Hi, > > > > > > On Wed, Jul 10, 2024 at 04:44:10PM +0900, Suleiman Souhlal wrote: > > > >When the host resumes from a suspend, the guest thinks any task > > > >that was running during the suspend ran for a long time, even though > > > >the effective run time was much shorter, which can end up having > > > >negative effects with scheduling. This can be particularly noticeable > > > >if the guest task was RT, as it can end up getting throttled for a > > > >long time. > > > > > > > >To mitigate this issue, we include the time that the host was > > > >suspended in steal time, which lets the guest can subtract the > > > >duration from the tasks' runtime. > > > > > > > >Signed-off-by: Suleiman Souhlal <suleiman@google.com> > > > >--- > > > > arch/x86/kvm/x86.c | 23 ++++++++++++++++++++++- > > > > include/linux/kvm_host.h | 4 ++++ > > > > 2 files changed, 26 insertions(+), 1 deletion(-) > > > > > > > >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > >index 0763a0f72a067f..94bbdeef843863 100644 > > > >--- a/arch/x86/kvm/x86.c > > > >+++ b/arch/x86/kvm/x86.c > > > >@@ -3669,7 +3669,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > > > > struct kvm_steal_time __user *st; > > > > struct kvm_memslots *slots; > > > > gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; > > > >- u64 steal; > > > >+ u64 steal, suspend_duration; > > > > u32 version; > > > > > > > > if (kvm_xen_msr_enabled(vcpu->kvm)) { > > > >@@ -3696,6 +3696,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > > > > return; > > > > } > > > > > > > >+ suspend_duration = 0; > > > >+ if (READ_ONCE(vcpu->suspended)) { > > > >+ suspend_duration = vcpu->kvm->last_suspend_duration; > > > >+ vcpu->suspended = 0; > > > > > > Can you explain why READ_ONCE() is necessary here, but WRITE_ONCE() 
isn't used > > > for clearing vcpu->suspended? > > > > Because this part of the code is essentially single-threaded, it didn't seem > > like WRITE_ONCE() was needed. I can add it in an eventual future version of > > the patch if it makes things less confusing (if this code still exists). > > {READ,WRITE}_ONCE() don't provide ordering, they only ensure that the compiler > emits a load/store. But (a) forcing the compiler to emit a load/store is only > necessary when it's possible for the compiler to know/prove that the load/store > won't affect functionalty, and (b) this code doesn't have have the correct > ordering even if there were the appropriate smp_wmb() and smp_rmb() annotations. > > The writer needs to ensure the write to last_suspend_duration is visible before > vcpu->suspended is set, and the reader needs to ensure it reads last_suspend_duration > after vcpu->suspended. > > That said, it's likely a moot point, because I assume the PM notifier runs while > tasks are frozen, i.e. its writes are guaranteed to be ordered before > record_steal_time(). > > And _that_ said, I don't see any reason for two fields, nor do I see any reason > to track the suspended duration in the VM instead of the vCPU. Similar to what > Chao pointed out, per-VM tracking falls apart if only some vCPUs consume the > suspend duration. I.e. just accumulate the suspend duration directly in the > vCPU. v2 will do this. 
> > > > >+ } > > > >+ > > > > st = (struct kvm_steal_time __user *)ghc->hva; > > > > /* > > > > * Doing a TLB flush here, on the guest's behalf, can avoid > > > >@@ -3749,6 +3755,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) > > > > unsafe_get_user(steal, &st->steal, out); > > > > steal += current->sched_info.run_delay - > > > > vcpu->arch.st.last_steal; > > > >+ steal += suspend_duration; > > > > vcpu->arch.st.last_steal = current->sched_info.run_delay; > > > > unsafe_put_user(steal, &st->steal, out); > > > > > > > >@@ -6920,6 +6927,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) > > > > > > > > mutex_lock(&kvm->lock); > > > > kvm_for_each_vcpu(i, vcpu, kvm) { > > > >+ WRITE_ONCE(vcpu->suspended, 1); > > > > if (!vcpu->arch.pv_time.active) > > > > continue; > > > > > > > >@@ -6932,15 +6940,28 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) > > > > } > > > > mutex_unlock(&kvm->lock); > > > > > > > >+ kvm->suspended_time = ktime_get_boottime_ns(); > > > >+ > > > > return ret ? NOTIFY_BAD : NOTIFY_DONE; > > > > } > > > > > > > >+static int > > > >+kvm_arch_resume_notifier(struct kvm *kvm) > > Do not wrap before the function name. Linus has a nice explanation/rant on this[*]. > > [*] https://lore.kernel.org/all/CAHk-=wjoLAYG446ZNHfg=GhjSY6nFmuB_wA8fYd5iLBNXjo9Bw@mail.gmail.com > > > > >+{ > > > >+ kvm->last_suspend_duration = ktime_get_boottime_ns() - > > > >+ kvm->suspended_time; > > > > > > Is it possible that a vCPU doesn't get any chance to run (i.e., update steal > > > time) between two suspends? In this case, only the second suspend would be > > > recorded. > > > > Good point. I'll address this. > > > > > > > > Maybe we need an infrastructure in the PM subsystem to record accumulated > > > suspended time. When updating steal time, KVM can add the additional suspended > > > time since the last update into steal_time (as how KVM deals with > > > current->sched_info.run_deley). 
This way, the scenario I mentioned above won't > > > be a problem and KVM needn't calculate the suspend duration for each guest. And > > > this approach can potentially benefit RISC-V and ARM as well, since they have > > > the same logic as x86 regarding steal_time. > > > > Thanks for the suggestion. > > I'm a bit wary of making a whole PM subsystem addition for such a counter, but > > maybe I can make a architecture-independent KVM change for it, with a PM > > notifier in kvm_main.c. > > Yes. Either that, or the fields need to be in kvm_vcpu_arch, not kvm_vcpu. > > > > Additionally, it seems that if a guest migrates to another system after a > > > suspend and before updating steal time, the suspended time is lost during > > > migration. I'm not sure if this is a practical issue. > > > > The systems where the host suspends don't usually do VM migrations. Or at > > least the ones where we're encountering the problem this patch is trying to > > address don't (laptops). > > > > But even if they did, it doesn't seem that likely that the migration would > > happen over a host suspend. > > I think we want to account for this straightaway, or at least have defined and > documented behavior, else we risk rehashing the issues with marking a vCPU as > preempted when it's loaded, but not running. Which causes problems for live > migration as it results in KVM marking the steal-time page as dirty after vCPUs > have been paused. > > [*] https://lkml.kernel.org/r/20240503181734.1467938-4-dmatlack%40google.com Can you explain how the steal-time page could get marked as dirty after VCPUs have been paused? From what I can tell, record_steal_time() gets called from vcpu_enter_guest(), which shouldn't happen when the VCPU has been paused, but I have to admit I don't really know anything about how live migration works. The series you linked is addressing an issue when the steal-time page gets written to outside of record_steal_time(), but we aren't doing this for this proposed patch. 
With the proposed approach, the steal time page would get copied to the new host and everything would keep working correctly, with the exception of a possible host suspend happening between when the migration started and when it finishes, not being reflected post-migration. That seems like a reasonable compromise. Please let me know if you want more. I am planning on sending v2 within the next few days. Thanks, -- Suleiman
On Wed, Aug 14, 2024, Suleiman Souhlal wrote: > On Wed, Aug 14, 2024 at 2:06 AM Sean Christopherson <seanjc@google.com> wrote: > > > > Additionally, it seems that if a guest migrates to another system after a > > > > suspend and before updating steal time, the suspended time is lost during > > > > migration. I'm not sure if this is a practical issue. > > > > > > The systems where the host suspends don't usually do VM migrations. Or at > > > least the ones where we're encountering the problem this patch is trying to > > > address don't (laptops). > > > > > > But even if they did, it doesn't seem that likely that the migration would > > > happen over a host suspend. > > > > I think we want to account for this straightaway, or at least have defined and > > documented behavior, else we risk rehashing the issues with marking a vCPU as > > preempted when it's loaded, but not running. Which causes problems for live > > migration as it results in KVM marking the steal-time page as dirty after vCPUs > > have been paused. > > > > [*] https://lkml.kernel.org/r/20240503181734.1467938-4-dmatlack%40google.com > > Can you explain how the steal-time page could get marked as dirty after VCPUs > have been paused? From what I can tell, record_steal_time() gets called from > vcpu_enter_guest(), which shouldn't happen when the VCPU has been paused, but > I have to admit I don't really know anything about how live migration works. It's not record_steal_time(), it's kvm_steal_time_set_preempted(). The flag KVM uses to tell the guest that the vCPU has been scheduled out, KVM_VCPU_PREEMPTED, resides in the kvm_steal_time structure, i.e. in the steal-time page. Userspace "pauses" vCPUs when it enters blackout to complete live migration. After pausing vCPUs, the VMM invokes various KVM_GET_* ioctls to retrieve vCPU state so that it can be transfered to the destination. 
Without the above series, KVM marks vCPUs as preempted when the associated task is scheduled out and the vCPU is "loaded", even if the vCPU is not actively running. This results in KVM writing to kvm_steal_time.preempted and dirtying the page, after userspace thinks it should be impossible for KVM to dirty guest memory (because vCPUs are no longer being run). > The series you linked is addressing an issue when the steal-time page gets > written to outside of record_steal_time(), but we aren't doing this for this > proposed patch. I know. What I am saying is that I don't want to punt on the issue Chao raised, because _if_ we want to properly account suspend time when a vCPU is migrated (or saved/restored for any reason) without doing KVM_RUN after suspend, then that either requires updating the steal-time information outside of KVM_RUN, or it requires new uAPI to explicitly migrate the unaccounted suspend time. Given that new uAPI is generally avoided when possible, that makes updating steal-time outside of KVM_RUN the default choice (which isn't necessarily the best choice), which in turn means KVM now has to worry about the above scenario of writing to guest memory after vCPUs have been paused by userspace. > With the proposed approach, the steal time page would get copied to the new > host and everything would keep working correctly, with the exception of a > possible host suspend happening between when the migration started and when > it finishes, not being reflected post-migration. That seems like a > reasonable compromise. Maybe, but I'm not keen on sweeping this under the rug. Ignoring issues because they'll "never" happen has bitten KVM more than once. At the absolute bare minimum, the flaw needs to be documented, with a suggested workaround provided (do KVM_RUN on all vCPUs before migrating after suspend), e.g. so that userspace can work around the issue in the unlikely scenario userspace does suspend+resume, saves/restores a VM, *and* cares about steal-time. 
Even better would be if we can figure out a way to effectively require KVM_RUN after suspend+resume, but I can't think of a way to do that without breaking userspace or adding new uAPI, and adding new uAPI for this feels like overkill.
On Thu, Aug 15, 2024 at 12:35 AM Sean Christopherson <seanjc@google.com> wrote: > > On Wed, Aug 14, 2024, Suleiman Souhlal wrote: > > > With the proposed approach, the steal time page would get copied to the new > > host and everything would keep working correctly, with the exception of a > > possible host suspend happening between when the migration started and when > > it finishes, not being reflected post-migration. That seems like a > > reasonable compromise. > > Maybe, but I'm not keen on sweeping this under the rug. Ignoring issues because > they'll "never" happen has bitten KVM more than once. > > At the absolute bare minimum, the flaw needs to be documented, with a suggested > workaround provided (do KVM on all vCPUs before migrating after suspend), e.g. > so that userspace can workaround the issue in the unlikely scenario userspace > does suspend+resume, saves/restores a VM, *and* cares about steal-time. I can write a comment in record_steal_time() that describes the scenario, mention it in the commit message and add something to the steal time part of Documentation/virt/kvm/x86/msr.rst. -- Suleiman
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0763a0f72a067f..94bbdeef843863 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3669,7 +3669,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) struct kvm_steal_time __user *st; struct kvm_memslots *slots; gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; - u64 steal; + u64 steal, suspend_duration; u32 version; if (kvm_xen_msr_enabled(vcpu->kvm)) { @@ -3696,6 +3696,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) return; } + suspend_duration = 0; + if (READ_ONCE(vcpu->suspended)) { + suspend_duration = vcpu->kvm->last_suspend_duration; + vcpu->suspended = 0; + } + st = (struct kvm_steal_time __user *)ghc->hva; /* * Doing a TLB flush here, on the guest's behalf, can avoid @@ -3749,6 +3755,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) unsafe_get_user(steal, &st->steal, out); steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; + steal += suspend_duration; vcpu->arch.st.last_steal = current->sched_info.run_delay; unsafe_put_user(steal, &st->steal, out); @@ -6920,6 +6927,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) mutex_lock(&kvm->lock); kvm_for_each_vcpu(i, vcpu, kvm) { + WRITE_ONCE(vcpu->suspended, 1); if (!vcpu->arch.pv_time.active) continue; @@ -6932,15 +6940,28 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) } mutex_unlock(&kvm->lock); + kvm->suspended_time = ktime_get_boottime_ns(); + return ret ? 
NOTIFY_BAD : NOTIFY_DONE; } +static int +kvm_arch_resume_notifier(struct kvm *kvm) +{ + kvm->last_suspend_duration = ktime_get_boottime_ns() - + kvm->suspended_time; + return NOTIFY_DONE; +} + int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) { switch (state) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: return kvm_arch_suspend_notifier(kvm); + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + return kvm_arch_resume_notifier(kvm); } return NOTIFY_DONE; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 692c01e41a18ef..2d37af9a348648 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -366,6 +366,8 @@ struct kvm_vcpu { } async_pf; #endif + bool suspended; + #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT /* * Cpu relax intercept or pause loop exit optimization @@ -840,6 +842,8 @@ struct kvm { struct xarray mem_attr_array; #endif char stats_id[KVM_STATS_NAME_SIZE]; + u64 last_suspend_duration; + u64 suspended_time; }; #define kvm_err(fmt, ...) \
When the host resumes from a suspend, the guest thinks any task that was running during the suspend ran for a long time, even though the effective run time was much shorter, which can end up having negative effects with scheduling. This can be particularly noticeable if the guest task was RT, as it can end up getting throttled for a long time. To mitigate this issue, we include the time that the host was suspended in steal time, which lets the guest subtract the duration from the tasks' runtime. Signed-off-by: Suleiman Souhlal <suleiman@google.com> --- arch/x86/kvm/x86.c | 23 ++++++++++++++++++++++- include/linux/kvm_host.h | 4 ++++ 2 files changed, 26 insertions(+), 1 deletion(-)