=============Guest-wide profiling with domain-switch, for Linux-2.6.32==================
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -96,6 +96,7 @@ struct thread_info {
#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
+#define TIF_VPMU_CTXSW 29 /* KVM thread: switch guest PMU state with it */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -119,6 +120,7 @@ struct thread_info {
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_VPMU_CTXSW (1 << TIF_VPMU_CTXSW)
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
@@ -146,8 +148,9 @@ struct thread_info {
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
- (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
-
+ (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC| \
+ _TIF_VPMU_CTXSW)
+
#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
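For context (existing 2.6.32 code, shown for reference, not part of this patch): adding
_TIF_VPMU_CTXSW to _TIF_WORK_CTXSW is what routes the tagged KVM threads through the
slow switch path, since __switch_to() in arch/x86/kernel/process_64.c only calls
__switch_to_xtra() when one of these masks matches, roughly:

	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);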
@@ -178,6 +178,53 @@ int set_tsc_mode(unsigned int val)
return 0;
}
+static const u32 vmx_pmu_msr_index[] = {
+ MSR_P6_EVNTSEL0, MSR_P6_EVNTSEL1, MSR_P6_PERFCTR0, MSR_P6_PERFCTR1,
+};
+#define NR_VMX_PMU_MSR ARRAY_SIZE(vmx_pmu_msr_index)
+static u64 vpmu_msr_list[NR_VMX_PMU_MSR];
+
+static void vpmu_load_msrs(u64 *msr_list)
+{
+ u64 *p = msr_list;
+ int i;
+
+ for (i = 0; i < NR_VMX_PMU_MSR; ++i) {
+ wrmsrl(vmx_pmu_msr_index[i], *p);
+ p++;
+ }
+}
+
+static void vpmu_save_msrs(u64 *msr_list)
+{
+ u64 *p = msr_list;
+ int i;
+
+ for (i = 0; i < NR_VMX_PMU_MSR; ++i) {
+ rdmsrl(vmx_pmu_msr_index[i], *p);
+ p++;
+ }
+}
+
+#define P6_EVENTSEL0_ENABLE (1 << 22)
+static void enable_perf(void)
+{
+ u64 val;
+
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val |= P6_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void disable_perf(void)
+{
+ u64 val;
+
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val &= ~P6_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
{
@@ -186,6 +233,21 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
prev = &prev_p->thread;
next = &next_p->thread;
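+ /*
+ * Domain switch: while switching between two tagged KVM threads the
+ * guest's counters keep running untouched. Only when switching to an
+ * untagged host task are they disabled and saved, and only when a
+ * tagged thread is scheduled back in are they restored and re-enabled.
+ */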
+ if (test_tsk_thread_flag(prev_p, TIF_VPMU_CTXSW) &&
+ test_tsk_thread_flag(next_p, TIF_VPMU_CTXSW)) {
+ /* do nothing, still in KVM context */
+ } else {
+ if (test_tsk_thread_flag(prev_p, TIF_VPMU_CTXSW)) {
+ disable_perf();
+ vpmu_save_msrs(vpmu_msr_list);
+ }
+
+ if (test_tsk_thread_flag(next_p, TIF_VPMU_CTXSW)) {
+ vpmu_load_msrs(vpmu_msr_list);
+ enable_perf();
+ }
+ }
+
if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
ds_switch_to(prev_p, next_p);
@@ -34,6 +34,7 @@
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
+#include <linux/kdebug.h>
#include "trace.h"
@@ -127,6 +128,7 @@ static u64 construct_eptp(unsigned long root_hpa);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct kvm_vcpu *, cur_exit_vcpu);
static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
@@ -3603,6 +3605,7 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu = raw_smp_processor_id();
if (enable_ept && is_paging(vcpu)) {
vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
@@ -3639,6 +3642,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->arch.switch_db_regs)
set_debugreg(vcpu->arch.dr6, 6);
+ /* record the exited vcpu */
+ per_cpu(cur_exit_vcpu, cpu) = vcpu;
+
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
@@ -3985,6 +3991,43 @@ static struct kvm_x86_ops vmx_x86_ops = {
.gb_page_enable = vmx_gb_page_enable,
};
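+/*
+ * Unmask the local APIC performance counter LVT entry and set it to NMI
+ * delivery. The APIC masks LVTPC when a counter overflow interrupt is
+ * delivered, so it must be re-armed before the next PMI can be taken.
+ */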
+static void guest_set_apic(void *info)
+{
+ unsigned int v;
+
+ v = apic_read(APIC_LVTERR);
+ apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ apic_write(APIC_LVTERR, v);
+}
+
+static int vmx_vcpu_nmi_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ int cpu = raw_smp_processor_id();
+ struct kvm_vcpu *vcpu = per_cpu(cur_exit_vcpu, cpu);
+ int ret = NOTIFY_DONE;
+
+ switch (val) {
+ case DIE_NMI:
+ case DIE_NMI_IPI:
+ /* no vcpu has run on this cpu yet: not our NMI */
+ if (!vcpu)
+ break;
+ guest_set_apic(NULL);
+ vcpu->cntr_overflow = 1;
+ vcpu->nmi_nr++;
+ ret = NOTIFY_STOP;
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static struct notifier_block vmx_vcpu_nb = {
+ .notifier_call = vmx_vcpu_nmi_notify,
+ .next = NULL,
+ .priority = 3
+};
+
static int __init vmx_init(void)
{
int r;
@@ -4036,6 +4079,17 @@ static int __init vmx_init(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+ vmx_disable_intercept_for_msr(MSR_P6_PERFCTR0, false);
+ vmx_disable_intercept_for_msr(MSR_P6_PERFCTR1, false);
+ vmx_disable_intercept_for_msr(MSR_P6_EVNTSEL0, false);
+ vmx_disable_intercept_for_msr(MSR_P6_EVNTSEL1, false);
+
+ if (register_die_notifier(&vmx_vcpu_nb)) {
+ printk(KERN_ALERT "[hw_vpmu]: Register NMI handler failed..\n");
+ } else {
+ printk(KERN_ALERT "[hw_vpmu]: Register NMI handler succeeded..\n");
+ }
+
if (enable_ept) {
bypass_guest_pf = 0;
kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
@@ -4071,6 +4125,9 @@ static void __exit vmx_exit(void)
free_page((unsigned long)vmx_io_bitmap_b);
free_page((unsigned long)vmx_io_bitmap_a);
+ unregister_die_notifier(&vmx_vcpu_nb);
+ printk(KERN_ALERT "[hw_vpmu]: Remove NMI handler module..\n");
+
kvm_exit();
}
@@ -3615,6 +3615,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
goto out;
}
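+ /* The host NMI notifier flagged a guest counter overflow; forward it
+ * to the guest as an NMI so its PMI handler can collect the sample. */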
+ if (vcpu->cntr_overflow) {
+ vcpu->arch.nmi_pending = 1;
+ vcpu->cntr_overflow = 0;
+ }
+
inject_pending_event(vcpu, kvm_run);
/* enable NMI/IRQ window open exits if needed */
@@ -99,6 +99,9 @@ struct kvm_vcpu {
gpa_t mmio_phys_addr;
#endif
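+ /* guest PMU counter overflowed: set by the host NMI notifier,
+ * consumed in vcpu_enter_guest() to inject a guest NMI */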
+ int cntr_overflow;
+ int nmi_nr;
+
struct kvm_vcpu_arch arch;
};
@@ -225,6 +225,9 @@ extern int flush_work(struct work_struct *work);
extern int cancel_work_sync(struct work_struct *work);
+extern struct task_struct *thread_of_workqueue(struct workqueue_struct *wq,
+ int cpu);
+
/*
* Kill off a pending schedule_delayed_work(). Note that the work callback
* function may still be running on return from cancel_delayed_work(), unless
@@ -150,6 +150,15 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
spin_unlock_irqrestore(&cwq->lock, flags);
}
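+/*
+ * Return the worker thread backing @wq on @cpu, so that callers (KVM here)
+ * can tag it, e.g. with TIF_VPMU_CTXSW.
+ */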
+struct task_struct *thread_of_workqueue(struct workqueue_struct *wq,
+ int cpu)
+{
+ struct cpu_workqueue_struct *cwq = wq_per_cpu(wq, cpu);
+
+ return cwq->thread;
+}
+EXPORT_SYMBOL_GPL(thread_of_workqueue);
+
/**
* queue_work - queue work on a workqueue
* @wq: workqueue to use
@@ -318,10 +318,18 @@ kvm_irqfd_release(struct kvm *kvm)
*/
static int __init irqfd_module_init(void)
{
+ int cpu = raw_smp_processor_id();
+ struct task_struct *thread;
+
irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
if (!irqfd_cleanup_wq)
return -ENOMEM;
+ thread = thread_of_workqueue(irqfd_cleanup_wq, cpu);
+ set_tsk_thread_flag(thread, TIF_VPMU_CTXSW);
+ printk(KERN_ALERT "[hw_vpmu]: monitored irqfd thread id = %d\n",
+ (int)thread->pid);
+
return 0;
}
@@ -1809,6 +1809,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
kvm->bsp_vcpu = vcpu;
#endif
mutex_unlock(&kvm->lock);
+
+ set_tsk_thread_flag(current, TIF_VPMU_CTXSW);
+ printk(KERN_ALERT "[hw_vpmu]: monitored vcpu thread id = %d\n",
+ (int)current->pid);
+
return r;
vcpu_destroy:
@@ -2360,6 +2365,10 @@ static int kvm_dev_ioctl_create_vm(void)
if (fd < 0)
kvm_put_kvm(kvm);
+ set_tsk_thread_flag(current, TIF_VPMU_CTXSW);
+ printk(KERN_ALERT "[hw_vpmu]: monitored main thread id = %d\n",
+ (int)current->pid);
+
return fd;
}
=============Guest-wide profiling with cpu-switch, for Linux-2.6.32==================
@@ -34,6 +34,7 @@
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
+#include <linux/kdebug.h>
#include "trace.h"
@@ -114,6 +115,9 @@ struct vcpu_vmx {
ktime_t entry_time;
s64 vnmi_blocked_time;
u32 exit_reason;
+
+ unsigned long *msr_host_load_store;
+ unsigned long *msr_guest_load_store;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -127,12 +131,18 @@ static u64 construct_eptp(unsigned long root_hpa);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct kvm_vcpu *, cur_exit_vcpu);
static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
+static const u32 vmx_pmu_msr_index[] = {
+ MSR_P6_EVNTSEL0, MSR_P6_EVNTSEL1, MSR_P6_PERFCTR0, MSR_P6_PERFCTR1,
+};
+#define NR_VMX_PMU_MSR ARRAY_SIZE(vmx_pmu_msr_index)
+
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -2272,6 +2282,14 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
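+ /* Guest PMU MSRs are stored to and reloaded from msr_guest_load_store
+ * automatically on every VM exit/entry, while the host values are
+ * restored from msr_host_load_store on exit. */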
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, NR_VMX_PMU_MSR);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_guest_load_store));
+
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, NR_VMX_PMU_MSR);
+ vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_guest_load_store));
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, NR_VMX_PMU_MSR);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_host_load_store));
+
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
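For reference, each entry in these VMX MSR load/store areas is 16 bytes: a 32-bit MSR
index, 32 reserved bits and the 64-bit MSR value. That is why save_host_msrs() and
vmx_create_vpmu_msrs() further down walk the areas with a u32 pointer and advance it
by two, twice per entry. An equivalent layout, with a purely illustrative name since
the patch uses a raw pointer:

	struct vmx_msr_area_entry {
		u32 index;	/* MSR number, e.g. MSR_P6_PERFCTR0 */
		u32 reserved;	/* must be zero */
		u64 value;	/* loaded/stored by the CPU on VM entry/exit */
	};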
@@ -2340,9 +2358,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
- vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
@@ -3600,9 +3615,34 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
#define Q "l"
#endif
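+/*
+ * Unmask the local APIC performance counter LVT entry and set it to NMI
+ * delivery. The APIC masks LVTPC when a counter overflow interrupt is
+ * delivered, so it must be re-armed before the next PMI can be taken.
+ */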
+static void guest_set_apic(void *info)
+{
+ unsigned int v;
+
+ v = apic_read(APIC_LVTERR);
+ apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ apic_write(APIC_LVTERR, v);
+}
+
+static void save_host_msrs(struct vcpu_vmx *vmx)
+{
+ u32 *p;
+ int i;
+
+ p = (u32 *)vmx->msr_host_load_store;
+ for (i = 0; i < NR_VMX_PMU_MSR; ++i) {
+ *p = vmx_pmu_msr_index[i];
+ p += 2;
+ rdmsrl(vmx_pmu_msr_index[i], *((u64 *)p));
+ p += 2;
+ }
+}
+
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu = raw_smp_processor_id();
if (enable_ept && is_paging(vcpu)) {
vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
@@ -3639,6 +3679,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->arch.switch_db_regs)
set_debugreg(vcpu->arch.dr6, 6);
+ /* record the exited vcpu */
+ per_cpu(cur_exit_vcpu, cpu) = vcpu;
+
+ /* The guest counters are reloaded by the hardware later. */
+ save_host_msrs(vmx);
+
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
@@ -3750,6 +3796,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vmx->launched = 1;
vmx_complete_interrupts(vmx);
+
+ /* re-arm LVTPC: the APIC sets its mask bit when a PMI is delivered */
+ guest_set_apic(NULL);
+
}
#undef R
@@ -3766,6 +3816,59 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
}
}
+static int vmx_create_vpmu_msrs(struct kvm_vcpu *vcpu)
+{
+ int i, r = 0;
+ u32 *p;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmx->msr_host_load_store = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx->msr_host_load_store)
+ return -ENOMEM;
+
+ vmx->msr_guest_load_store = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx->msr_guest_load_store) {
+ r = -ENOMEM;
+ goto free_msr_host;
+ }
+
+ memset(vmx->msr_host_load_store, 0x00, PAGE_SIZE);
+ memset(vmx->msr_guest_load_store, 0x00, PAGE_SIZE);
+
+ /* Initialize both load/store areas with the current host MSR
+ * contents as initial values. */
+ p = (u32 *)vmx->msr_host_load_store;
+ for (i = 0; i < NR_VMX_PMU_MSR; ++i) {
+ *p = vmx_pmu_msr_index[i];
+ p += 2;
+ rdmsrl(vmx_pmu_msr_index[i], *((u64 *)p));
+ p += 2;
+ }
+
+ p = (u32 *)vmx->msr_guest_load_store;
+ for (i = 0; i < NR_VMX_PMU_MSR; ++i) {
+ *p = vmx_pmu_msr_index[i];
+ p += 2;
+ rdmsrl(vmx_pmu_msr_index[i], *((u64 *)p));
+ p += 2;
+ }
+
+ return r;
+
+free_msr_host:
+ free_page((unsigned long)vmx->msr_host_load_store);
+ return r;
+}
+
+static void vmx_free_vpmu_msrs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ free_page((unsigned long)vmx->msr_host_load_store);
+ free_page((unsigned long)vmx->msr_guest_load_store);
+}
+
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3777,6 +3880,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
vmx_free_vmcs(vcpu);
kfree(vmx->host_msrs);
kfree(vmx->guest_msrs);
+ vmx_free_vpmu_msrs(vcpu);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vmx);
}
@@ -3812,6 +3916,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmcs_clear(vmx->vmcs);
+ if (vmx_create_vpmu_msrs(&vmx->vcpu))
+ goto free_vmcs;
+
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
err = vmx_vcpu_setup(vmx);
@@ -3985,6 +4092,33 @@ static struct kvm_x86_ops vmx_x86_ops = {
.gb_page_enable = vmx_gb_page_enable,
};
+static int vmx_vcpu_nmi_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ int cpu = raw_smp_processor_id();
+ struct kvm_vcpu *vcpu = per_cpu(cur_exit_vcpu, cpu);
+ int ret = NOTIFY_DONE;
+
+ switch (val) {
+ case DIE_NMI:
+ case DIE_NMI_IPI:
+ /* no vcpu has run on this cpu yet: not our NMI */
+ if (!vcpu)
+ break;
+ guest_set_apic(NULL);
+ vcpu->cntr_overflow = 1;
+ vcpu->nmi_nr++;
+ ret = NOTIFY_STOP;
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static struct notifier_block vmx_vcpu_nb = {
+ .notifier_call = vmx_vcpu_nmi_notify,
+ .next = NULL,
+ .priority = 3
+};
+
static int __init vmx_init(void)
{
int r;
@@ -4036,6 +4170,17 @@ static int __init vmx_init(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+ vmx_disable_intercept_for_msr(MSR_P6_PERFCTR0, false);
+ vmx_disable_intercept_for_msr(MSR_P6_PERFCTR1, false);
+ vmx_disable_intercept_for_msr(MSR_P6_EVNTSEL0, false);
+ vmx_disable_intercept_for_msr(MSR_P6_EVNTSEL1, false);
+
+ if (register_die_notifier(&vmx_vcpu_nb)) {
+ printk(KERN_ALERT "[hw_vpmu]: Register NMI handler failed..\n");
+ } else {
+ printk(KERN_ALERT "[hw_vpmu]: Register NMI handler succeeded..\n");
+ }
+
if (enable_ept) {
bypass_guest_pf = 0;
kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
@@ -4071,6 +4216,9 @@ static void __exit vmx_exit(void)
free_page((unsigned long)vmx_io_bitmap_b);
free_page((unsigned long)vmx_io_bitmap_a);
+ unregister_die_notifier(&vmx_vcpu_nb);
+ printk(KERN_ALERT "[hw_vpmu]: Remove NMI handler module..\n");
+
kvm_exit();
}
@@ -3615,6 +3615,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
goto out;
}
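+ /* The host NMI notifier flagged a guest counter overflow; forward it
+ * to the guest as an NMI so its PMI handler can collect the sample. */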
+ if (vcpu->cntr_overflow) {
+ vcpu->arch.nmi_pending = 1;
+ vcpu->cntr_overflow = 0;
+ }
+
inject_pending_event(vcpu, kvm_run);
/* enable NMI/IRQ window open exits if needed */
@@ -99,6 +99,9 @@ struct kvm_vcpu {
gpa_t mmio_phys_addr;
#endif
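+ /* guest PMU counter overflowed: set by the host NMI notifier,
+ * consumed in vcpu_enter_guest() to inject a guest NMI */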
+ int cntr_overflow;
+ int nmi_nr;
+
struct kvm_vcpu_arch arch;
};