@@ -994,6 +994,8 @@ struct kvm_vcpu_arch {
*/
struct {
enum kvm_vcpu_boost_state boost_status;
+ int boost_policy;
+ int boost_prio;
u64 msr_val;
struct gfn_to_hva_cache data;
} pv_sched;
@@ -2230,6 +2232,13 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1)
#ifdef CONFIG_PARAVIRT_SCHED_KVM
+/*
+ * Default policy and priority used for boosting
+ * VCPU threads.
+ */
+#define VCPU_BOOST_DEFAULT_PRIO 8
+#define VCPU_BOOST_DEFAULT_POLICY SCHED_RR
+
static inline bool kvm_arch_vcpu_pv_sched_enabled(struct kvm_vcpu_arch *arch)
{
return arch->pv_sched.msr_val;
@@ -2240,6 +2249,39 @@ static inline void kvm_arch_vcpu_set_boost_status(struct kvm_vcpu_arch *arch,
{
arch->pv_sched.boost_status = boost_status;
}
+
+static inline bool kvm_arch_vcpu_boosted(struct kvm_vcpu_arch *arch)
+{
+ return arch->pv_sched.boost_status == VCPU_BOOST_BOOSTED;
+}
+
+static inline int kvm_arch_vcpu_boost_policy(struct kvm_vcpu_arch *arch)
+{
+ return arch->pv_sched.boost_policy;
+}
+
+static inline int kvm_arch_vcpu_boost_prio(struct kvm_vcpu_arch *arch)
+{
+ return arch->pv_sched.boost_prio;
+}
+
+static inline int kvm_arch_vcpu_set_boost_prio(struct kvm_vcpu_arch *arch, u64 prio)
+{
+ if (prio >= MAX_RT_PRIO)
+ return -EINVAL;
+
+ arch->pv_sched.boost_prio = prio;
+ return 0;
+}
+
+static inline int kvm_arch_vcpu_set_boost_policy(struct kvm_vcpu_arch *arch, u64 policy)
+{
+ if (policy != SCHED_FIFO && policy != SCHED_RR)
+ return -EINVAL;
+
+ arch->pv_sched.boost_policy = policy;
+ return 0;
+}
#endif
#endif /* _ASM_X86_KVM_HOST_H */
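As a usage illustration of the accessors above, here is a minimal, hypothetical caller sketch. The function name and calling context are assumptions (the patch itself only applies the defaults at vCPU creation, shown further below); only the two setters and their validation come from this hunk.

/* Hypothetical caller sketch; only the two setters exist in this patch. */
static int example_configure_boost(struct kvm_vcpu *vcpu, u64 policy, u64 prio)
{
        int r;

        /* Rejected unless policy is SCHED_FIFO or SCHED_RR. */
        r = kvm_arch_vcpu_set_boost_policy(&vcpu->arch, policy);
        if (r)
                return r;

        /* Rejected if prio >= MAX_RT_PRIO. */
        return kvm_arch_vcpu_set_boost_prio(&vcpu->arch, prio);
}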
@@ -167,11 +167,30 @@ enum kvm_vcpu_boost_state {
VCPU_BOOST_BOOSTED
};
+/*
+ * Boost request from the guest to the host for lazy boosting.
+ */
+enum kvm_vcpu_boost_request {
+ VCPU_REQ_NONE = 0,
+ VCPU_REQ_UNBOOST,
+ VCPU_REQ_BOOST,
+};
+
+union guest_schedinfo {
+ struct {
+ __u8 boost_req;
+ __u8 preempt_disabled;
+ };
+ __u64 pad;
+};
+
/*
* Structure passed in via MSR_KVM_PV_SCHED
*/
struct pv_sched_data {
__u64 boost_status;
+ union guest_schedinfo schedinfo;
};
#endif /* _UAPI_ASM_X86_KVM_PARA_H */
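For context, a minimal guest-side sketch of how this shared structure might be used, assuming the guest registers a per-CPU pv_sched_data area through MSR_KVM_PV_SCHED in the same spirit as the steal-time MSR. The per-CPU variable, the helper names, and the registration details below are assumptions, not part of this patch.

/* Guest-side sketch; registration details are assumed, see note above. */
static DEFINE_PER_CPU_ALIGNED(struct pv_sched_data, pv_sched);

static void pv_sched_register(void)
{
        /* Tell the host where this vCPU's pv_sched_data lives. */
        wrmsrl(MSR_KVM_PV_SCHED, slow_virt_to_phys(this_cpu_ptr(&pv_sched)));
}

static void pv_sched_lazy_request(bool boost)
{
        /* Lazy request: the host samples this field on its next VM-exit. */
        this_cpu_write(pv_sched.schedinfo.boost_req,
                       boost ? VCPU_REQ_BOOST : VCPU_REQ_UNBOOST);
}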
@@ -2148,6 +2148,37 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
xfer_to_guest_mode_work_pending();
}
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+static inline bool __vcpu_needs_boost(struct kvm_vcpu *vcpu, union guest_schedinfo schedinfo)
+{
+ bool pending_event = kvm_cpu_has_pending_timer(vcpu) || kvm_cpu_has_interrupt(vcpu);
+
+ /*
+ * vcpu needs a boost if:
+ * - a lazy boost request is active, or
+ * - a latency-sensitive event is pending, or
+ * - preemption is disabled in this vcpu.
+ */
+ return (schedinfo.boost_req == VCPU_REQ_BOOST || pending_event || schedinfo.preempt_disabled);
+}
+
+static inline void kvm_vcpu_do_pv_sched(struct kvm_vcpu *vcpu)
+{
+ union guest_schedinfo schedinfo;
+
+ if (!kvm_vcpu_sched_enabled(vcpu))
+ return;
+
+ if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.pv_sched.data,
+ &schedinfo, offsetof(struct pv_sched_data, schedinfo), sizeof(schedinfo)))
+ return;
+
+ kvm_vcpu_set_sched(vcpu, __vcpu_needs_boost(vcpu, schedinfo));
+}
+#else
+static inline void kvm_vcpu_do_pv_sched(struct kvm_vcpu *vcpu) { }
+#endif
+
/*
* The fast path for frequent and performance sensitive wrmsr emulation,
* i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
@@ -2201,6 +2232,15 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
ret = EXIT_FASTPATH_REENTER_GUEST;
}
break;
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+ case MSR_KVM_PV_SCHED:
+ data = kvm_read_edx_eax(vcpu);
+ if (data == ULLONG_MAX) {
+ kvm_skip_emulated_instruction(vcpu);
+ ret = EXIT_FASTPATH_EXIT_HANDLED;
+ }
+ break;
+#endif
default:
break;
}
@@ -10919,6 +10959,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
guest_timing_exit_irqoff();
local_irq_enable();
+
+ kvm_vcpu_do_pv_sched(vcpu);
+
preempt_enable();
kvm_vcpu_srcu_read_lock(vcpu);
@@ -11990,6 +12033,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (r)
goto free_guest_fpu;
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+ kvm_arch_vcpu_set_boost_prio(&vcpu->arch, VCPU_BOOST_DEFAULT_PRIO);
+ kvm_arch_vcpu_set_boost_policy(&vcpu->arch, VCPU_BOOST_DEFAULT_POLICY);
+#endif
+
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_xen_init_vcpu(vcpu);
@@ -2290,6 +2290,17 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
#ifdef CONFIG_PARAVIRT_SCHED_KVM
void kvm_set_vcpu_boosted(struct kvm_vcpu *vcpu, bool boosted);
+int kvm_vcpu_set_sched(struct kvm_vcpu *vcpu, bool boost);
+
+static inline bool kvm_vcpu_sched_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_vcpu_pv_sched_enabled(&vcpu->arch);
+}
+#else
+static inline int kvm_vcpu_set_sched(struct kvm_vcpu *vcpu, bool boost)
+{
+ return 0;
+}
#endif
#endif
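As an illustration of how these helpers are meant to be consumed: in this series the x86 exit path calls kvm_vcpu_set_sched() from kvm_vcpu_do_pv_sched(), shown earlier; the event-delivery hook below is an assumption for illustration only.

/* Illustrative caller; the hook itself is an assumption, not from this patch. */
static inline void example_boost_for_pending_event(struct kvm_vcpu *vcpu)
{
        /* Boost the vCPU thread so the host scheduler runs it promptly. */
        if (kvm_vcpu_sched_enabled(vcpu))
                kvm_vcpu_set_sched(vcpu, true);
}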
@@ -57,6 +57,9 @@
#include <asm/ioctl.h>
#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <uapi/linux/sched/types.h>
+
#include "coalesced_mmio.h"
#include "async_pf.h"
#include "kvm_mm.h"
@@ -3602,6 +3605,77 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+/*
+ * Check whether the boost/unboost request can be ignored.
+ * Returns true if:
+ * - the caller requests a boost and the vcpu is already boosted, or
+ * - the caller requests an unboost and the vcpu is not boosted.
+ */
+static inline bool __can_ignore_set_sched(struct kvm_vcpu *vcpu, bool boost)
+{
+ return ((boost && kvm_arch_vcpu_boosted(&vcpu->arch)) ||
+ (!boost && !kvm_arch_vcpu_boosted(&vcpu->arch)));
+}
+
+int kvm_vcpu_set_sched(struct kvm_vcpu *vcpu, bool boost)
+{
+ int policy;
+ int ret = 0;
+ struct pid *pid;
+ struct sched_param param = { 0 };
+ struct task_struct *vcpu_task = NULL;
+
+ /*
+ * We can ignore the request if a boost is requested while
+ * the vcpu is already boosted, or an unboost is requested
+ * while it is not boosted.
+ */
+ if (__can_ignore_set_sched(vcpu, boost))
+ goto set_boost_status;
+
+ if (boost) {
+ policy = kvm_arch_vcpu_boost_policy(&vcpu->arch);
+ param.sched_priority = kvm_arch_vcpu_boost_prio(&vcpu->arch);
+ } else {
+ /*
+ * TODO: here we just unboost to SCHED_NORMAL. Ideally we
+ * should either
+ * - revert to the initial priority before boost, or
+ * - introduce tunables for unboost priority.
+ */
+ policy = SCHED_NORMAL;
+ param.sched_priority = 0;
+ }
+
+ rcu_read_lock();
+ pid = rcu_dereference(vcpu->pid);
+ if (pid)
+ vcpu_task = get_pid_task(pid, PIDTYPE_PID);
+ rcu_read_unlock();
+ if (vcpu_task == NULL)
+ return -KVM_EINVAL;
+
+ /*
+ * This might be called from interrupt context.
+ * Since we do not use rt-mutexes, we can safely call
+ * sched_setscheduler_pi_nocheck() with pi = false.
+ * NOTE: if rt-mutexes are used in the future, this should
+ * be changed to do the boost/unboost from a tasklet.
+ */
+ WARN_ON_ONCE(vcpu_task->pi_top_task);
+ ret = sched_setscheduler_pi_nocheck(vcpu_task, policy,
+ &param, false);
+ put_task_struct(vcpu_task);
+set_boost_status:
+ if (!ret)
+ kvm_set_vcpu_boosted(vcpu, boost);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_set_sched);
+#endif
+
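One possible shape of the "revert to the initial priority" alternative mentioned in the TODO above, sketched purely as an assumption; none of the structures or helpers below are part of this patch.

/* Sketch: capture the task's attributes on first boost, restore on unboost. */
struct saved_sched_attr {
        unsigned int policy;
        unsigned int rt_priority;
        bool valid;
};

static void save_sched_attr(struct task_struct *p, struct saved_sched_attr *s)
{
        if (!s->valid) {
                s->policy = p->policy;
                s->rt_priority = p->rt_priority;
                s->valid = true;
        }
}

static int restore_sched_attr(struct task_struct *p, struct saved_sched_attr *s)
{
        struct sched_param param = { .sched_priority = s->rt_priority };

        if (!s->valid)
                return 0;
        return sched_setscheduler_pi_nocheck(p, s->policy, &param, false);
}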
#ifndef CONFIG_S390
/*
* Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.