diff mbox series

[v7,07/12] perf/x86: no counter allocation support

Message ID 1562548999-37095-8-git-send-email-wei.w.wang@intel.com (mailing list archive)
State New, archived
Headers show
Series Guest LBR Enabling | expand

Commit Message

Wang, Wei W July 8, 2019, 1:23 a.m. UTC
In some cases, an event may be created without needing a counter
allocation. For example, an lbr event may be created by the host
only to help save/restore the lbr stack during vCPU context switching.

This patch adds a new interface to allow users to create a perf event
without the need of counter assignment.

Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 arch/x86/events/core.c     | 12 ++++++++++++
 include/linux/perf_event.h | 13 +++++++++++++
 kernel/events/core.c       | 37 +++++++++++++++++++++++++------------
 3 files changed, 50 insertions(+), 12 deletions(-)

Comments

Peter Zijlstra July 8, 2019, 2:29 p.m. UTC | #1
On Mon, Jul 08, 2019 at 09:23:14AM +0800, Wei Wang wrote:
> In some cases, an event may be created without needing a counter
> allocation. For example, an lbr event may be created by the host
> only to help save/restore the lbr stack on the vCPU context switching.
> 
> This patch adds a new interface to allow users to create a perf event
> without the need of counter assignment.
> 
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> Cc: Andi Kleen <ak@linux.intel.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> ---

I _really_ hate this one.

>  arch/x86/events/core.c     | 12 ++++++++++++
>  include/linux/perf_event.h | 13 +++++++++++++
>  kernel/events/core.c       | 37 +++++++++++++++++++++++++------------
>  3 files changed, 50 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index f315425..eebbd65 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -410,6 +410,9 @@ int x86_setup_perfctr(struct perf_event *event)
>  	struct hw_perf_event *hwc = &event->hw;
>  	u64 config;
>  
> +	if (is_no_counter_event(event))
> +		return 0;
> +
>  	if (!is_sampling_event(event)) {
>  		hwc->sample_period = x86_pmu.max_period;
>  		hwc->last_period = hwc->sample_period;
> @@ -1248,6 +1251,12 @@ static int x86_pmu_add(struct perf_event *event, int flags)
>  	hwc = &event->hw;
>  
>  	n0 = cpuc->n_events;
> +
> +	if (is_no_counter_event(event)) {
> +		n = n0;
> +		goto done_collect;
> +	}
> +
>  	ret = n = collect_events(cpuc, event, false);
>  	if (ret < 0)
>  		goto out;
> @@ -1422,6 +1431,9 @@ static void x86_pmu_del(struct perf_event *event, int flags)
>  	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
>  		goto do_del;
>  
> +	if (is_no_counter_event(event))
> +		goto do_del;
> +
>  	/*
>  	 * Not a TXN, therefore cleanup properly.
>  	 */

That's truly an abomination.

> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 0ab99c7..19e6593 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -528,6 +528,7 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *,
>   */
>  #define PERF_EV_CAP_SOFTWARE		BIT(0)
>  #define PERF_EV_CAP_READ_ACTIVE_PKG	BIT(1)
> +#define PERF_EV_CAP_NO_COUNTER		BIT(2)
>  
>  #define SWEVENT_HLIST_BITS		8
>  #define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)
> @@ -895,6 +896,13 @@ extern int perf_event_refresh(struct perf_event *event, int refresh);
>  extern void perf_event_update_userpage(struct perf_event *event);
>  extern int perf_event_release_kernel(struct perf_event *event);
>  extern struct perf_event *
> +perf_event_create(struct perf_event_attr *attr,
> +		  int cpu,
> +		  struct task_struct *task,
> +		  perf_overflow_handler_t overflow_handler,
> +		  void *context,
> +		  bool counter_assignment);
> +extern struct perf_event *
>  perf_event_create_kernel_counter(struct perf_event_attr *attr,
>  				int cpu,
>  				struct task_struct *task,

Why the heck are you creating this wrapper nonsense?
Wang, Wei W July 9, 2019, 2:58 a.m. UTC | #2
On 07/08/2019 10:29 PM, Peter Zijlstra wrote:

Thanks for the comments.

>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index 0ab99c7..19e6593 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -528,6 +528,7 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *,
>>    */
>>   #define PERF_EV_CAP_SOFTWARE		BIT(0)
>>   #define PERF_EV_CAP_READ_ACTIVE_PKG	BIT(1)
>> +#define PERF_EV_CAP_NO_COUNTER		BIT(2)
>>   
>>   #define SWEVENT_HLIST_BITS		8
>>   #define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)
>> @@ -895,6 +896,13 @@ extern int perf_event_refresh(struct perf_event *event, int refresh);
>>   extern void perf_event_update_userpage(struct perf_event *event);
>>   extern int perf_event_release_kernel(struct perf_event *event);
>>   extern struct perf_event *
>> +perf_event_create(struct perf_event_attr *attr,
>> +		  int cpu,
>> +		  struct task_struct *task,
>> +		  perf_overflow_handler_t overflow_handler,
>> +		  void *context,
>> +		  bool counter_assignment);
>> +extern struct perf_event *
>>   perf_event_create_kernel_counter(struct perf_event_attr *attr,
>>   				int cpu,
>>   				struct task_struct *task,
> Why the heck are you creating this wrapper nonsense?

(please see the earlier discussions: https://lkml.org/lkml/2018/9/20/868)
I thought we agreed that the perf event created here shouldn't need to
consume an extra counter.

In the previous version, we added a "no_counter" bit to perf_event_attr, and
that will be exposed to user ABI, which seems not good.
(https://lkml.org/lkml/2019/2/14/791)
So we wrap a new kernel API above to support this.

Do you have a different suggestion on how to do this?
(exclude_host/exclude_guest just clears the enable bit on VM-exit/entry,
but still consumes the counter)

Best,
Wei
Peter Zijlstra July 9, 2019, 9:43 a.m. UTC | #3
On Tue, Jul 09, 2019 at 10:58:46AM +0800, Wei Wang wrote:
> On 07/08/2019 10:29 PM, Peter Zijlstra wrote:
> 
> Thanks for the comments.
> 
> > 
> > > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > > index 0ab99c7..19e6593 100644
> > > --- a/include/linux/perf_event.h
> > > +++ b/include/linux/perf_event.h
> > > @@ -528,6 +528,7 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *,
> > >    */
> > >   #define PERF_EV_CAP_SOFTWARE		BIT(0)
> > >   #define PERF_EV_CAP_READ_ACTIVE_PKG	BIT(1)
> > > +#define PERF_EV_CAP_NO_COUNTER		BIT(2)
> > >   #define SWEVENT_HLIST_BITS		8
> > >   #define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)
> > > @@ -895,6 +896,13 @@ extern int perf_event_refresh(struct perf_event *event, int refresh);
> > >   extern void perf_event_update_userpage(struct perf_event *event);
> > >   extern int perf_event_release_kernel(struct perf_event *event);
> > >   extern struct perf_event *
> > > +perf_event_create(struct perf_event_attr *attr,
> > > +		  int cpu,
> > > +		  struct task_struct *task,
> > > +		  perf_overflow_handler_t overflow_handler,
> > > +		  void *context,
> > > +		  bool counter_assignment);
> > > +extern struct perf_event *
> > >   perf_event_create_kernel_counter(struct perf_event_attr *attr,
> > >   				int cpu,
> > >   				struct task_struct *task,
> > Why the heck are you creating this wrapper nonsense?
> 
> (please see early discussions: https://lkml.org/lkml/2018/9/20/868)
> I thought we agreed that the perf event created here don't need to consume
> an extra counter.

That's almost a year ago; I really can't remember that and you didn't
put any of that in your Changelog to help me remember.

(also please use: https://lkml.kernel.org/r/$msgid style links)

> In the previous version, we added a "no_counter" bit to perf_event_attr, and
> that will be exposed to user ABI, which seems not good.
> (https://lkml.org/lkml/2019/2/14/791)
> So we wrap a new kernel API above to support this.
> 
> Do you have a different suggestion to do this?
> (exclude host/guest just clears the enable bit when on VM-exit/entry,
> still consumes the counter)

Just add an argument to perf_event_create_kernel_counter() ?
Wang, Wei W July 9, 2019, 11:36 a.m. UTC | #4
On 07/09/2019 05:43 PM, Peter Zijlstra wrote:
> That's almost a year ago; I really can't remember that and you didn't
> put any of that in your Changelog to help me remember.
>
> (also please use: https://lkml.kernel.org/r/$msgid style links)

OK, I'll put this link in the cover letter or commit log for a reminder.

>
>> In the previous version, we added a "no_counter" bit to perf_event_attr, and
>> that will be exposed to user ABI, which seems not good.
>> (https://lkml.org/lkml/2019/2/14/791)
>> So we wrap a new kernel API above to support this.
>>
>> Do you have a different suggestion to do this?
>> (exclude host/guest just clears the enable bit when on VM-exit/entry,
>> still consumes the counter)
> Just add an argument to perf_event_create_kernel_counter() ?

Yes. I didn't find a proper place to add this "no_counter" indicator, so
I added a wrapper to avoid changing the existing callers of
perf_event_create_kernel_counter.

Best,
Wei
diff mbox series

Patch

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index f315425..eebbd65 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -410,6 +410,9 @@  int x86_setup_perfctr(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	u64 config;
 
+	if (is_no_counter_event(event))
+		return 0;
+
 	if (!is_sampling_event(event)) {
 		hwc->sample_period = x86_pmu.max_period;
 		hwc->last_period = hwc->sample_period;
@@ -1248,6 +1251,12 @@  static int x86_pmu_add(struct perf_event *event, int flags)
 	hwc = &event->hw;
 
 	n0 = cpuc->n_events;
+
+	if (is_no_counter_event(event)) {
+		n = n0;
+		goto done_collect;
+	}
+
 	ret = n = collect_events(cpuc, event, false);
 	if (ret < 0)
 		goto out;
@@ -1422,6 +1431,9 @@  static void x86_pmu_del(struct perf_event *event, int flags)
 	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
 		goto do_del;
 
+	if (is_no_counter_event(event))
+		goto do_del;
+
 	/*
 	 * Not a TXN, therefore cleanup properly.
 	 */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0ab99c7..19e6593 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -528,6 +528,7 @@  typedef void (*perf_overflow_handler_t)(struct perf_event *,
  */
 #define PERF_EV_CAP_SOFTWARE		BIT(0)
 #define PERF_EV_CAP_READ_ACTIVE_PKG	BIT(1)
+#define PERF_EV_CAP_NO_COUNTER		BIT(2)
 
 #define SWEVENT_HLIST_BITS		8
 #define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)
@@ -895,6 +896,13 @@  extern int perf_event_refresh(struct perf_event *event, int refresh);
 extern void perf_event_update_userpage(struct perf_event *event);
 extern int perf_event_release_kernel(struct perf_event *event);
 extern struct perf_event *
+perf_event_create(struct perf_event_attr *attr,
+		  int cpu,
+		  struct task_struct *task,
+		  perf_overflow_handler_t overflow_handler,
+		  void *context,
+		  bool counter_assignment);
+extern struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				int cpu,
 				struct task_struct *task,
@@ -1032,6 +1040,11 @@  static inline bool is_sampling_event(struct perf_event *event)
 	return event->attr.sample_period != 0;
 }
 
+static inline bool is_no_counter_event(struct perf_event *event)
+{
+	return !!(event->event_caps & PERF_EV_CAP_NO_COUNTER);
+}
+
 /*
  * Return 1 for a software event, 0 for a hardware event
  */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index abbd4b3..70884df 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11162,18 +11162,10 @@  SYSCALL_DEFINE5(perf_event_open,
 	return err;
 }
 
-/**
- * perf_event_create_kernel_counter
- *
- * @attr: attributes of the counter to create
- * @cpu: cpu in which the counter is bound
- * @task: task to profile (NULL for percpu)
- */
-struct perf_event *
-perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
-				 struct task_struct *task,
-				 perf_overflow_handler_t overflow_handler,
-				 void *context)
+struct perf_event *perf_event_create(struct perf_event_attr *attr, int cpu,
+				     struct task_struct *task,
+				     perf_overflow_handler_t overflow_handler,
+				     void *context, bool need_counter)
 {
 	struct perf_event_context *ctx;
 	struct perf_event *event;
@@ -11193,6 +11185,9 @@  perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	/* Mark owner so we could distinguish it from user events. */
 	event->owner = TASK_TOMBSTONE;
 
+	if (!need_counter)
+		event->event_caps |= PERF_EV_CAP_NO_COUNTER;
+
 	ctx = find_get_context(event->pmu, task, event);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
@@ -11241,6 +11236,24 @@  perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 err:
 	return ERR_PTR(err);
 }
+EXPORT_SYMBOL_GPL(perf_event_create);
+
+/**
+ * perf_event_create_kernel_counter
+ *
+ * @attr: attributes of the counter to create
+ * @cpu: cpu in which the counter is bound
+ * @task: task to profile (NULL for percpu)
+ */
+struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
+				 struct task_struct *task,
+				 perf_overflow_handler_t overflow_handler,
+				 void *context)
+{
+	return perf_event_create(attr, cpu, task, overflow_handler,
+				 context, true);
+}
 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
 
 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)