@@ -2,6 +2,7 @@
#define _ASM_X86_KVM_PARA_H
#include <linux/types.h>
+#include <linux/list.h>
#include <asm/hyperv.h>
/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
@@ -19,7 +20,8 @@
/* This indicates that the new set of kvmclock msrs
* are available. The use of 0x11 and 0x12 is deprecated
*/
-#define KVM_FEATURE_CLOCKSOURCE2 3
+#define KVM_FEATURE_CLOCKSOURCE2 3
+#define KVM_FEATURE_PV_PERF 4
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
@@ -33,7 +35,14 @@
#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
-#define KVM_MAX_MMU_OP_BATCH 32
+#define KVM_MAX_MMU_OP_BATCH 32
+
+/* Operations for KVM_PERF_OP */
+#define KVM_PERF_OP_OPEN 1
+#define KVM_PERF_OP_CLOSE 2
+#define KVM_PERF_OP_ENABLE 3
+#define KVM_PERF_OP_DISABLE 4
+#define KVM_PERF_OP_READ 5
/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1
@@ -64,6 +73,85 @@ struct kvm_mmu_op_release_pt {
#ifdef __KERNEL__
#include <asm/processor.h>
+/*
+ * Data area used to communicate perf_event state between the
+ * host kernel and the guest kernel
+ */
+struct guest_perf_event {
+ u64 count;
+ atomic_t overflows;
+};
+
+/*
+ * In the host kernel, perf_event->host_perf_shadow points to a
+ * host_perf_shadow, which records information about the
+ * corresponding guest event.
+ */
+struct host_perf_shadow {
+ /* guest perf_event id passed from guest os */
+ int id;
+ /*
+ * The host kernel first saves data into the counter member.
+ * Before the next entry into the guest, kvm reads this counter
+ * and copies or adds the data back to the guest os.
+ */
+ struct guest_perf_event counter;
+ /* guest_event_addr is a gpa_t pointing to the guest os guest_perf_event */
+ __u64 guest_event_addr;
+
+ /*
+ * Link into kvm.kvm_arch.shadow_hash_table
+ */
+ struct list_head shadow_entry;
+ struct kvm_vcpu *vcpu;
+
+ struct perf_event *host_event;
+ /*
+ * This reference counter prevents a malicious guest os from
+ * closing and enabling the event at the same time.
+ */
+ atomic_t ref_counter;
+};
+
+/*
+ * In the guest kernel, perf_event->guest_shadow points to a
+ * guest_perf_shadow, which records the data that host kvm
+ * copies back to the guest.
+ */
+struct guest_perf_shadow {
+ /* guest perf_event id passed from guest os */
+ int id;
+ /*
+ * Host kvm writes its data into the counter member.
+ */
+ struct guest_perf_event counter;
+};
+
+/*
+ * guest_perf_attr is used when the guest calls the hypercall to
+ * open a new perf_event on the host side. It is mostly a copy of
+ * perf_event_attr, with fields the host kernel does not use removed.
+ */
+struct guest_perf_attr {
+ __u32 type;
+ __u64 config;
+ __u64 sample_period;
+ __u64 sample_type;
+ __u64 read_format;
+ __u64 flags;
+ __u32 bp_type;
+ __u64 bp_addr;
+ __u64 bp_len;
+};
+
+struct guest_perf_event_param {
+ __u64 attr_addr;
+ __u64 guest_event_addr;
+ /* Put id last to avoid potential alignment issues */
+ int id;
+};
+
extern void kvmclock_init(void);
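As an illustrative sketch (not part of this patch), a guest-side open would presumably fill guest_perf_attr in guest memory, describe it with guest_perf_event_param, and pass the param GPA through the KVM_PERF_OP hypercall; the helper name guest_pv_perf_open_example and the use of __pa()/kvm_hypercall3() here are assumptions about guest-side code that this patch does not include:

static long guest_pv_perf_open_example(int id, struct guest_perf_attr *attr,
				       struct guest_perf_event *event)
{
	struct guest_perf_event_param param = {
		.attr_addr        = (__u64)__pa(attr),   /* GPA of the attr the host reads */
		.guest_event_addr = (__u64)__pa(event),  /* GPA the host writes counts back to */
		.id               = id,                  /* guest-chosen event id */
	};
	__u64 gpa = (__u64)__pa(&param);

	/*
	 * A 64-bit guest passes the whole GPA in the first argument; a
	 * 32-bit guest passes the low 32 bits there and the high bits in
	 * the second argument, matching hc_gpa() on the host side.
	 */
	return kvm_hypercall3(KVM_PERF_OP, KVM_PERF_OP_OPEN,
			      (unsigned long)gpa, (unsigned long)(gpa >> 32));
}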
@@ -24,6 +24,7 @@
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/msr-index.h>
+#include <asm/perf_event.h>
#define KVM_MAX_VCPUS 64
#define KVM_MEMORY_SLOTS 32
@@ -360,6 +361,18 @@ struct kvm_vcpu_arch {
/* fields used by HYPER-V emulation */
u64 hv_vapic;
+
+ /*
+ * Fields used by PARAVIRT perf interface:
+ *
+ * kvm checks overflow_events before entering the guest os
+ * and copies the data back to the guest os.
+ * event_mutex avoids races between the NMI perf event overflow
+ * handler, event close, and enable/disable.
+ */
+ struct mutex event_mutex;
+ int overflows;
+ struct perf_event *overflow_events[X86_PMC_IDX_MAX];
};
struct kvm_mem_alias {
@@ -377,6 +390,9 @@ struct kvm_mem_aliases {
int naliases;
};
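+/* 1 << 10 = 1024 hash buckets mapping guest event ids to host_perf_shadow entries */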
+#define KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS (10)
+#define KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM (1<<KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS)
+
struct kvm_arch {
struct kvm_mem_aliases *aliases;
@@ -415,6 +431,15 @@ struct kvm_arch {
/* fields used by HYPER-V emulation */
u64 hv_guest_os_id;
u64 hv_hypercall;
+
+ /*
+ * Fields used by the PARAVIRT perf interface:
+ * they organize all host perf_events that represent guest
+ * perf_events on a specific kvm instance.
+ */
+ atomic_t kvm_pv_event_num;
+ spinlock_t shadow_lock;
+ struct list_head *shadow_hash_table;
};
struct kvm_vm_stat {
@@ -561,6 +586,9 @@ int emulator_write_phys(struct kvm_vcpu
const void *val, int bytes);
int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
gpa_t addr, unsigned long *ret);
+int kvm_pv_perf_op(struct kvm_vcpu *vcpu, int op_code, unsigned long a1,
+ unsigned long a2, unsigned long *result);
+
u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
extern bool tdp_enabled;
@@ -17,6 +17,7 @@
#define KVM_HC_VAPIC_POLL_IRQ 1
#define KVM_HC_MMU_OP 2
+#define KVM_PERF_OP 3
/*
* hypercalls use architecture specific
@@ -3647,6 +3647,7 @@ static int vmx_handle_exit(struct kvm_vc
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
+ int ret;
trace_kvm_exit(exit_reason, vcpu);
@@ -3694,12 +3695,17 @@ static int vmx_handle_exit(struct kvm_vc
if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason])
- return kvm_vmx_exit_handlers[exit_reason](vcpu);
+ ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
else {
vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
vcpu->run->hw.hardware_exit_reason = exit_reason;
+ ret = 0;
}
- return 0;
+
+ /* sync paravirt perf events to guest */
+ kvm_sync_events_to_guest(vcpu);
+
+ return ret;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -6,12 +6,14 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2008 Qumranet, Inc.
* Copyright IBM Corporation, 2008
+ * Copyright Intel Corporation, 2010
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
* Amit Shah <amit.shah@qumranet.com>
* Ben-Ami Yassour <benami@il.ibm.com>
+ * Yanmin Zhang <yanmin.zhang@intel.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
@@ -1618,6 +1620,7 @@ int kvm_dev_ioctl_check_extension(long e
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
+ case KVM_CAP_PV_PERF:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1993,7 +1996,9 @@ static void do_cpuid_ent(struct kvm_cpui
entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
(1 << KVM_FEATURE_NOP_IO_DELAY) |
(1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_PV_PERF) |
(1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+
entry->ebx = 0;
entry->ecx = 0;
entry->edx = 0;
@@ -4052,10 +4057,21 @@ static unsigned long kvm_get_guest_ip(vo
return ip;
}
+int kvm_notify_event_overflow(void)
+{
+ if (percpu_read(current_vcpu)) {
+ kvm_inject_nmi(percpu_read(current_vcpu));
+ return 0;
+ }
+
+ return -1;
+}
+
static struct perf_guest_info_callbacks kvm_guest_cbs = {
.is_in_guest = kvm_is_in_guest,
.is_user_mode = kvm_is_user_mode,
.get_guest_ip = kvm_get_guest_ip,
+ .copy_event_to_shadow = kvm_copy_event_to_shadow,
};
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
@@ -4138,15 +4154,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vc
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
- unsigned long a1)
-{
- if (is_long_mode(vcpu))
- return a0;
- else
- return a0 | ((gpa_t)a1 << 32);
-}
-
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
u64 param, ingpa, outgpa, ret;
@@ -4245,6 +4252,9 @@ int kvm_emulate_hypercall(struct kvm_vcp
case KVM_HC_MMU_OP:
r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
break;
+ case KVM_PERF_OP:
+ r = kvm_pv_perf_op(vcpu, a0, a1, a2, &ret);
+ break;
default:
ret = -KVM_ENOSYS;
break;
@@ -5334,6 +5344,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+ mutex_init(&vcpu->arch.event_mutex);
+
return 0;
fail_free_lapic:
kvm_free_lapic(vcpu);
@@ -5360,6 +5372,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcp
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ struct list_head *hash_table;
+ int i;
if (!kvm)
return ERR_PTR(-ENOMEM);
@@ -5369,6 +5383,18 @@ struct kvm *kvm_arch_create_vm(void)
kfree(kvm);
return ERR_PTR(-ENOMEM);
}
+ hash_table = kmalloc(sizeof(struct list_head) *
+ KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM,
+ GFP_KERNEL);
+ if (!hash_table) {
+ kfree(kvm->arch.aliases);
+ kfree(kvm);
+ return ERR_PTR(-ENOMEM);
+ }
+ for (i = 0; i < KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM; i++)
+ INIT_LIST_HEAD(&hash_table[i]);
+ kvm->arch.shadow_hash_table = hash_table;
+ spin_lock_init(&kvm->arch.shadow_lock);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -5416,6 +5442,8 @@ void kvm_arch_sync_events(struct kvm *kv
void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ kvm_remove_all_perf_events(kvm);
+
kvm_iommu_unmap_guest(kvm);
kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
@@ -5427,6 +5455,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
cleanup_srcu_struct(&kvm->srcu);
+ kfree(kvm->arch.shadow_hash_table);
kfree(kvm->arch.aliases);
kfree(kvm);
}
@@ -72,7 +72,20 @@ static inline struct kvm_mem_aliases *kv
|| lockdep_is_held(&kvm->slots_lock));
}
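+/*
+ * Reassemble the hypercall GPA arguments: a 64-bit guest passes the full
+ * GPA in a0 (a1 is ignored); a 32-bit guest passes the low 32 bits in a0
+ * and the high bits in a1, e.g. GPA 0x123456000 arrives as a0 = 0x23456000,
+ * a1 = 0x1.
+ */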
+static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
+ unsigned long a1)
+{
+ if (is_long_mode(vcpu))
+ return a0;
+ else
+ return a0 | ((gpa_t)a1 << 32);
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+int kvm_notify_event_overflow(void);
+void kvm_copy_event_to_shadow(struct perf_event *event, int overflows);
+void kvm_sync_events_to_guest(struct kvm_vcpu *vcpu);
+void kvm_remove_all_perf_events(struct kvm *kvm);
#endif
@@ -11,7 +11,7 @@ kvm-y += $(addprefix ../../../virt/kvm
kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o
+ i8254.o timer.o kvmperf_event.o
kvm-intel-y += vmx.o
kvm-amd-y += svm.o
@@ -0,0 +1,471 @@
+/*
+ * Performance events x86 kvm para architecture code
+ *
+ * Copyright (C) 2010 Intel Inc.
+ * Zhang Yanmin <yanmin.zhang@intel.com>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/file.h>
+#include <linux/syscalls.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+#include <asm/compat.h>
+
+#include "x86.h"
+
+#define KVM_MAX_PARAVIRT_PERF_EVENT (1024)
+
+static inline u32 shadow_hash_id(int id)
+{
+ u32 hash_value = id;
+
+ hash_value = hash_32(hash_value, KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS);
+ return hash_value;
+}
+
+static int kvm_add_host_event(struct kvm_vcpu *vcpu,
+ struct host_perf_shadow *host_shadow)
+{
+ unsigned long flags;
+ u32 index = shadow_hash_id(host_shadow->id);
+ struct kvm_arch *arch = &vcpu->kvm->arch;
+ struct list_head *head = &arch->shadow_hash_table[index];
+ struct list_head *pos;
+ struct host_perf_shadow *tmp;
+
+ spin_lock_irqsave(&arch->shadow_lock, flags);
+ list_for_each(pos, head) {
+ tmp = container_of(pos, struct host_perf_shadow, shadow_entry);
+ WARN(tmp->id == host_shadow->id, "%s called while an item"
+ " with the same id [%d] is already in the hash table\n",
+ __func__, host_shadow->id);
+ }
+ list_add(&host_shadow->shadow_entry, head);
+ spin_unlock_irqrestore(&arch->shadow_lock, flags);
+ return 0;
+}
+
+static struct perf_event *
+kvm_find_get_host_event(struct kvm_vcpu *vcpu, int id, int need_delete)
+{
+ unsigned long flags;
+ u32 index = shadow_hash_id(id);
+ struct kvm_arch *arch = &vcpu->kvm->arch;
+ struct list_head *head = &arch->shadow_hash_table[index];
+ struct list_head *pos;
+ struct host_perf_shadow *tmp = NULL;
+ int found = 0;
+
+ spin_lock_irqsave(&arch->shadow_lock, flags);
+ list_for_each(pos, head) {
+ tmp = container_of(pos, struct host_perf_shadow, shadow_entry);
+ if (tmp->id == id) {
+ found = 1;
+ if (need_delete)
+ list_del_init(&tmp->shadow_entry);
+ else
+ atomic_inc(&tmp->ref_counter);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&arch->shadow_lock, flags);
+
+ if (found)
+ return tmp->host_event;
+ else
+ return NULL;
+}
+
+static void kvm_vcpu_add_event_overflow_ref(struct perf_event *event)
+{
+ struct host_perf_shadow *host_shadow = event->host_perf_shadow;
+ struct kvm_vcpu *vcpu = host_shadow->vcpu;
+ int ret;
+
+ /*
+ * Use trylock since this runs in the NMI handler. Losing one
+ * overflow report to the guest os is acceptable, because the host
+ * accumulates the overflow counter in host_perf_shadow.
+ * The next time this event overflows, if there is no contention,
+ * the host pushes the accumulated overflows to the guest and the
+ * guest processes the saved overflows as well.
+ */
+ ret = mutex_trylock(&vcpu->arch.event_mutex);
+ if (!ret)
+ return;
+ if (vcpu->arch.overflows < X86_PMC_IDX_MAX) {
+ vcpu->arch.overflow_events[vcpu->arch.overflows] = event;
+ vcpu->arch.overflows++;
+ }
+ mutex_unlock(&vcpu->arch.event_mutex);
+}
+
+static int kvm_vcpu_remove_event_overflow_ref(struct host_perf_shadow *shadow)
+{
+ struct kvm_vcpu *vcpu = shadow->vcpu;
+ int i;
+
+ if (!vcpu || !vcpu->arch.overflows)
+ return -1;
+
+ mutex_lock(&vcpu->arch.event_mutex);
+ for (i = 0; i < vcpu->arch.overflows; i++) {
+ if (vcpu->arch.overflow_events[i] == shadow->host_event)
+ vcpu->arch.overflow_events[i] = NULL;
+ }
+ mutex_unlock(&vcpu->arch.event_mutex);
+ return 0;
+}
+
+void kvm_copy_event_to_shadow(struct perf_event *event, int overflows)
+{
+ struct host_perf_shadow *shadow = event->host_perf_shadow;
+
+ shadow->counter.count = local64_read(&event->count);
+ atomic_add(overflows, &shadow->counter.overflows);
+ kvm_vcpu_add_event_overflow_ref(event);
+ /* Inject NMI to guest os */
+ kvm_notify_event_overflow();
+}
+
+static void kvm_perf_event_overflow(struct perf_event *event, int nmi,
+ struct perf_sample_data *data, struct pt_regs *regs)
+{
+ BUG_ON(event->host_perf_shadow == NULL);
+ kvm_copy_event_to_shadow(event, 1);
+}
+
+static void kvm_put_host_event(struct perf_event *host_event)
+{
+ struct host_perf_shadow *shadow = host_event->host_perf_shadow;
+ if (!atomic_dec_return(&shadow->ref_counter)) {
+ /*
+ * detach it in case the guest os did not disable it
+ * before closing
+ */
+ perf_event_detach(host_event);
+ kvm_vcpu_remove_event_overflow_ref(shadow);
+
+ perf_event_release_kernel(host_event);
+ atomic_dec(&shadow->vcpu->kvm->arch.kvm_pv_event_num);
+ kfree(shadow);
+ }
+}
+
+static void kvm_copy_event_to_guest(struct kvm_vcpu *vcpu,
+ struct perf_event *host_event)
+{
+ struct host_perf_shadow *shadow = host_event->host_perf_shadow;
+ struct guest_perf_event counter;
+ int ret;
+ s32 overflows;
+
+ ret = kvm_read_guest(vcpu->kvm, shadow->guest_event_addr,
+ &counter, sizeof(counter));
+ if (ret < 0)
+ return;
+
+again:
+ overflows = atomic_read(&shadow->counter.overflows);
+ if (atomic_cmpxchg(&shadow->counter.overflows, overflows, 0) !=
+ overflows)
+ goto again;
+
+ counter.count = shadow->counter.count;
+ atomic_add(overflows, &counter.overflows);
+
+ kvm_write_guest(vcpu->kvm,
+ shadow->guest_event_addr,
+ &counter,
+ sizeof(counter));
+ return;
+}
+
+/*
+ * called by KVM to copy both perf_event->count and overflows to guest
+ * after host NMI handler detects guest perf_event overflows
+ */
+void kvm_sync_events_to_guest(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ if (vcpu->arch.overflows == 0)
+ return;
+
+ mutex_lock(&vcpu->arch.event_mutex);
+ for (i = 0; i < vcpu->arch.overflows; i++) {
+ if (vcpu->arch.overflow_events[i]) {
+ kvm_copy_event_to_guest(vcpu,
+ vcpu->arch.overflow_events[i]);
+ }
+ }
+ vcpu->arch.overflows = 0;
+ mutex_unlock(&vcpu->arch.event_mutex);
+}
+EXPORT_SYMBOL_GPL(kvm_sync_events_to_guest);
+
+/* Just copy perf_event->count to guest. Don't copy overflows to guest */
+static void
+kvm_copy_count_to_guest(struct kvm_vcpu *vcpu, struct perf_event *host_event)
+{
+ struct host_perf_shadow *shadow = host_event->host_perf_shadow;
+
+ shadow->counter.count = local64_read(&host_event->count);
+ kvm_write_guest(vcpu->kvm,
+ shadow->guest_event_addr,
+ &shadow->counter.count,
+ sizeof(shadow->counter.count));
+ return;
+}
+
+static int
+kvm_pv_perf_op_open(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+ int ret = 0;
+ struct perf_event *host_event = NULL;
+ struct host_perf_shadow *shadow = NULL;
+ struct guest_perf_event_param param;
+ struct guest_perf_attr *guest_attr = NULL;
+ struct perf_event_attr *attr = NULL;
+ int next_count;
+
+ next_count = atomic_read(&vcpu->kvm->arch.kvm_pv_event_num);
+ if (next_count >= KVM_MAX_PARAVIRT_PERF_EVENT) {
+ WARN_ONCE(1, "guest os wants to open more than %d events\n",
+ KVM_MAX_PARAVIRT_PERF_EVENT);
+ return -ENOENT;
+ }
+ atomic_inc(&vcpu->kvm->arch.kvm_pv_event_num);
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ guest_attr = kzalloc(sizeof(*guest_attr), GFP_KERNEL);
+ if (!guest_attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = kvm_read_guest(vcpu->kvm, addr, &param, sizeof(param));
+ if (ret < 0)
+ goto out;
+
+ host_event = kvm_find_get_host_event(vcpu, param.id, 0);
+ if (host_event) {
+ kvm_put_host_event(host_event);
+ /* go through out so attr/guest_attr are freed and the count is dropped */
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = kvm_read_guest(vcpu->kvm, param.attr_addr,
+ guest_attr, sizeof(*guest_attr));
+ if (ret < 0)
+ goto out;
+
+ attr->type = guest_attr->type;
+ attr->config = guest_attr->config;
+ attr->sample_period = guest_attr->sample_period;
+ attr->read_format = guest_attr->read_format;
+ attr->flags = guest_attr->flags;
+ attr->bp_type = guest_attr->bp_type;
+ attr->bp_addr = guest_attr->bp_addr;
+ attr->bp_len = guest_attr->bp_len;
+ /*
+ * By default, we disable the host event. Later on, the guest os
+ * triggers a perf_event_attach to enable it.
+ */
+ attr->disabled = 1;
+ attr->inherit = 0;
+ attr->enable_on_exec = 0;
+ /*
+ * We don't support user/kernel exclude modes for the guest os,
+ * which means we always collect both user and kernel events for it.
+ */
+ attr->exclude_user = 0;
+ attr->exclude_kernel = 0;
+
+ shadow = kzalloc(sizeof(*shadow), GFP_KERNEL);
+ if (!shadow) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ shadow->id = param.id;
+ shadow->guest_event_addr = param.guest_event_addr;
+ shadow->vcpu = vcpu;
+ INIT_LIST_HEAD(&shadow->shadow_entry);
+
+ /* We always create a cpu context host perf event */
+ host_event = perf_event_create_kernel_counter(attr, -1,
+ current->pid, kvm_perf_event_overflow);
+
+ if (IS_ERR(host_event)) {
+ host_event = NULL;
+ ret = -1;
+ goto out;
+ }
+ host_event->host_perf_shadow = shadow;
+ shadow->host_event = host_event;
+ atomic_set(&shadow->ref_counter, 1);
+ kvm_add_host_event(vcpu, shadow);
+
+out:
+ if (!host_event)
+ kfree(shadow);
+
+ kfree(attr);
+ kfree(guest_attr);
+
+ if (ret)
+ atomic_dec(&vcpu->kvm->arch.kvm_pv_event_num);
+
+ return ret;
+}
+
+static int kvm_pv_perf_op_close(struct kvm_vcpu *vcpu, int id)
+{
+ struct perf_event *host_event;
+
+ /* Find and delete the event from the hashtable */
+ host_event = kvm_find_get_host_event(vcpu, id, 1);
+ if (!host_event)
+ return -1;
+ kvm_put_host_event(host_event);
+ return 0;
+}
+
+static int kvm_pv_perf_op_enable(struct kvm_vcpu *vcpu, int id)
+{
+ struct perf_event *event;
+ struct host_perf_shadow *shadow;
+
+ event = kvm_find_get_host_event(vcpu, id, 0);
+ if (!event)
+ return -1;
+
+ shadow = event->host_perf_shadow;
+ if (shadow->vcpu != vcpu) {
+ kvm_vcpu_remove_event_overflow_ref(event->host_perf_shadow);
+ shadow->vcpu = vcpu;
+ }
+
+ perf_event_attach(event);
+ kvm_put_host_event(event);
+
+ return 0;
+}
+
+static int kvm_pv_perf_op_disable(struct kvm_vcpu *vcpu, int id)
+{
+ struct perf_event *host_event = kvm_find_get_host_event(vcpu, id, 0);
+ if (!host_event)
+ return -1;
+ perf_event_detach(host_event);
+ /* Sync the count to the guest, since guest count updates are delayed */
+ kvm_copy_count_to_guest(vcpu, host_event);
+ kvm_put_host_event(host_event);
+
+ return 0;
+}
+
+static int kvm_pv_perf_op_read(struct kvm_vcpu *vcpu, int id)
+{
+ u64 enabled, running;
+ struct perf_event *host_event = kvm_find_get_host_event(vcpu, id, 0);
+
+ if (!host_event)
+ return -1;
+ if (host_event->state == PERF_EVENT_STATE_ACTIVE)
+ perf_event_read_value(host_event, &enabled, &running);
+ kvm_copy_count_to_guest(vcpu, host_event);
+ kvm_put_host_event(host_event);
+ return 0;
+}
+
+int kvm_pv_perf_op(struct kvm_vcpu *vcpu, int op_code, unsigned long a1,
+ unsigned long a2, unsigned long *result)
+{
+ unsigned long ret;
+ gpa_t addr;
+ int id;
+
+ switch (op_code) {
+ case KVM_PERF_OP_OPEN:
+ addr = hc_gpa(vcpu, a1, a2);
+ ret = (unsigned long) kvm_pv_perf_op_open(vcpu, addr);
+ break;
+ case KVM_PERF_OP_CLOSE:
+ id = (int) a1;
+ ret = kvm_pv_perf_op_close(vcpu, id);
+ break;
+ case KVM_PERF_OP_ENABLE:
+ id = (int) a1;
+ ret = kvm_pv_perf_op_enable(vcpu, id);
+ break;
+ case KVM_PERF_OP_DISABLE:
+ id = (int) a1;
+ ret = kvm_pv_perf_op_disable(vcpu, id);
+ break;
+ case KVM_PERF_OP_READ:
+ id = (int) a1;
+ ret = kvm_pv_perf_op_read(vcpu, id);
+ break;
+ default:
+ ret = -KVM_ENOSYS;
+ }
+
+ *result = ret;
+ return 0;
+}
+
+void kvm_remove_all_perf_events(struct kvm *kvm)
+{
+ unsigned long flags;
+ struct kvm_arch *arch = &kvm->arch;
+ LIST_HEAD(total_events);
+ struct list_head *head;
+ struct list_head *pos, *next;
+ struct host_perf_shadow *tmp;
+ int i;
+
+ spin_lock_irqsave(&arch->shadow_lock, flags);
+ for (i = 0; i < KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM; i++) {
+ head = &arch->shadow_hash_table[i];
+ list_for_each_safe(pos, next, head) {
+ tmp = container_of(pos, struct host_perf_shadow,
+ shadow_entry);
+ list_del(&tmp->shadow_entry);
+ list_add(&tmp->shadow_entry, &total_events);
+ }
+ }
+ spin_unlock_irqrestore(&arch->shadow_lock, flags);
+ head = &total_events;
+ list_for_each_safe(pos, next, head) {
+ tmp = container_of(pos, struct host_perf_shadow, shadow_entry);
+ list_del(&tmp->shadow_entry);
+ kvm_put_host_event(tmp->host_event);
+ }
+
+ return;
+}
+
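As a further illustrative sketch (again not part of this patch), the remaining ops take the guest event id directly in a1, so a guest would presumably drive an already opened event roughly as follows; the helper name is hypothetical:

static void guest_pv_perf_lifecycle_example(int id)
{
	kvm_hypercall2(KVM_PERF_OP, KVM_PERF_OP_ENABLE, id);	/* attach the host event */
	/* ... profiled work runs; overflows arrive as injected NMIs ... */
	kvm_hypercall2(KVM_PERF_OP, KVM_PERF_OP_READ, id);	/* host copies count to guest_event_addr */
	kvm_hypercall2(KVM_PERF_OP, KVM_PERF_OP_DISABLE, id);	/* detach and sync the final count */
	kvm_hypercall2(KVM_PERF_OP, KVM_PERF_OP_CLOSE, id);	/* drop the host shadow and perf_event */
}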
@@ -524,6 +524,7 @@ struct kvm_enable_cap {
#define KVM_CAP_PPC_OSI 52
#define KVM_CAP_PPC_UNSET_IRQ 53
#define KVM_CAP_ENABLE_CAP 54
+#define KVM_CAP_PV_PERF 57
#ifdef KVM_CAP_IRQ_ROUTING