@@ -1169,6 +1169,8 @@ struct kvm_x86_ops {
bool (*rdtscp_supported)(void);
bool (*invpcid_supported)(void);
+ int (*set_cpu_freq_scaling)(struct kvm_vcpu *vcpu, u8 freq_100mhz);
+
void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
@@ -1699,6 +1699,95 @@ static bool vmx_invpcid_supported(void)
return cpu_has_vmx_invpcid();
}
+static int vmx_query_cpu_freq_valid_freq(u8 freq)
+{
+#define MASK_PERF 0xFF
+#define CAP_HIGHEST_SHIFT 0
+#define CAP_LOWEST_SHIFT 24
+#define CAP_HIGHEST_MASK (MASK_PERF << CAP_HIGHEST_SHIFT)
+#define CAP_LOWEST_MASK (MASK_PERF << CAP_LOWEST_SHIFT)
+ u64 cap_msr;
+ u8 highest, lowest;
+
+ /* Query highest and lowest supported scaling. */
+ rdmsrl(MSR_HWP_CAPABILITIES, cap_msr);
+ highest = (u8)(cap_msr & CAP_HIGHEST_MASK);
+ lowest = (u8)((cap_msr & CAP_LOWEST_MASK) >> CAP_LOWEST_SHIFT);
+
+ if (freq < lowest || freq > highest)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void vmx_set_cpu_freq_uncapped(void)
+{
+#define SHIFT_DESIRED_PERF 16
+#define SHIFT_MAX_PERF 8
+#define SHIFT_MIN_PERF 0
+
+ u64 cap_msr, req_msr;
+ u8 highest, lowest;
+
+ /* Query the capabilities. */
+ rdmsrl(MSR_HWP_CAPABILITIES, cap_msr);
+ highest = (u8)(cap_msr & CAP_HIGHEST_MASK);
+ lowest = (u8)((cap_msr & CAP_LOWEST_MASK) >> CAP_LOWEST_SHIFT);
+
+ /* Set the desired to highest performance. */
+ req_msr = ((highest & MASK_PERF) << SHIFT_DESIRED_PERF) |
+ ((highest & MASK_PERF) << SHIFT_MAX_PERF) |
+ ((lowest & MASK_PERF) << SHIFT_MIN_PERF);
+ wrmsrl(MSR_HWP_REQUEST, req_msr);
+}
+
+static void vmx_set_cpu_freq_capped(u8 freq_100mhz)
+{
+ u64 req_msr;
+
+ /* Populate the variable used for setting the HWP request. */
+ req_msr = ((freq_100mhz & MASK_PERF) << SHIFT_DESIRED_PERF) |
+ ((freq_100mhz & MASK_PERF) << SHIFT_MAX_PERF) |
+ ((freq_100mhz & MASK_PERF) << SHIFT_MIN_PERF);
+
+ wrmsrl(MSR_HWP_REQUEST, req_msr);
+}
+
+static int vmx_set_cpu_freq_scaling(struct kvm_vcpu *vcpu, u8 freq_100mhz)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 pm_before, req_msr;
+ int rc;
+
+ /* Is HWP scaling supported? */
+ if (!this_cpu_has(X86_FEATURE_HWP))
+ return -ENODEV;
+
+ /*
+ * HWP needs to be enabled to query & use capabilities.
+ * This bit is W1Once so cannot be cleared after.
+ */
+ rdmsrl(MSR_PM_ENABLE, pm_before);
+ if ((pm_before & 1) == 0)
+ wrmsrl(MSR_PM_ENABLE, pm_before | 1);
+
+ /*
+ * Check if setting to a specific value, if being set
+ * to zero this means return to uncapped frequency.
+ */
+ if (freq_100mhz) {
+ rc = vmx_query_cpu_freq_valid_freq(freq_100mhz);
+
+ if (rc)
+ return rc;
+
+ vmx_set_cpu_freq_capped(freq_100mhz);
+ } else
+ vmx_set_cpu_freq_uncapped();
+
+ return 0;
+}
+
/*
* Swap MSR entry in host/guest MSR entry array.
*/
@@ -8124,6 +8213,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.rdtscp_supported = vmx_rdtscp_supported,
.invpcid_supported = vmx_invpcid_supported,
+ .set_cpu_freq_scaling = vmx_set_cpu_freq_scaling,
+
.set_supported_cpuid = vmx_set_supported_cpuid,
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
@@ -3669,6 +3669,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SET_VAR_MTRR_COUNT:
case KVM_CAP_X86_USER_SPACE_MSR:
case KVM_CAP_X86_MSR_FILTER:
+ case KVM_CAP_CPU_FREQ_SCALING:
r = 1;
break;
#ifdef CONFIG_KVM_XEN
@@ -4499,6 +4500,19 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
return r;
}
+static int kvm_cap_set_cpu_freq(struct kvm_vcpu *vcpu,
+ struct kvm_enable_cap *cap)
+{
+ u8 freq = (u8)cap->args[0];
+
+ /* Query whether this platform (Intel or AMD) support setting. */
+ if (!kvm_x86_ops.set_cpu_freq_scaling)
+ return -ENODEV;
+
+ /* Attempt to set to the frequency specified. */
+ return kvm_x86_ops.set_cpu_freq_scaling(vcpu, freq);
+}
+
/*
* kvm_set_guest_paused() indicates to the guest kernel that it has been
* stopped by the hypervisor. This function will be called from the host only.
@@ -4553,6 +4567,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return kvm_x86_ops.enable_direct_tlbflush(vcpu);
case KVM_CAP_SET_VAR_MTRR_COUNT:
return kvm_mtrr_set_var_mtrr_count(vcpu, cap->args[0]);
+ case KVM_CAP_CPU_FREQ_SCALING:
+ return kvm_cap_set_cpu_freq(vcpu, cap);
default:
return -EINVAL;
@@ -874,6 +874,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_NO_POLL_ON_HLT 100003
#define KVM_CAP_MMU_USE_VMA_CAPMEM 100004
#define KVM_CAP_MMU_SUPPORT_DYNAMIC_CAPMEM 100005
+#define KVM_CAP_CPU_FREQ_SCALING 100006
#define KVM_CAP_IRQCHIP 0
#define KVM_CAP_HLT 1
A VMM can control a vCPU's CPU frequency by interfacing with KVM via the vCPU file descriptor to enable/set CPU frequency scaling for a guest. Instead of creating a separate IOCTL to this this, KVM capabil- ities are extended to include a capability called KVM_CAP_CPU_FREQ_SCALING. A generic set_cpu_freq interface is added to kvm_x86_ops to allow for architecture (AMD/Intel) independent CPU frequency scaling setting. For Intel platforms, Hardware-Controlled Performance States (HWP) are used to implement CPU scaling within the guest. Further information on this mechanism can be seen in Intel SDM Vol 3B (section 14.4). The CPU frequency is set as soon as this function is called and is kept running until explicitly reset or set again. Currently the AMD frequency setting interface is left unimplemented. Please note that CPU frequency scaling will have an effect on host processing in it's current form. To change back to full performance when running in host context an IOCTL with a frequency value of 0 is needed to run back at uncapped speed. Signed-off-by: Jack Allister <jalliste@amazon.com> --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/vmx/vmx.c | 91 +++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 16 ++++++ include/uapi/linux/kvm.h | 1 + 4 files changed, 110 insertions(+)