
[v2,0/3] arm64: perf_event: Fix time offset prior to epoch

Message ID 20200505135544.6003-1-leo.yan@linaro.org (mailing list archive)
Series: arm64: perf_event: Fix time offset prior to epoch

Message

Leo Yan May 5, 2020, 1:55 p.m. UTC
This patch set fixes the time offset prior to epoch for the Arm arch
timer.  The series mainly follows the suggestions made on LKML [1].

To achieve an accurate time offset for a clock source prior to epoch,
patch 01 adds a new variant, sched_clock_register_epoch(), which
outputs an extra argument reporting the time offset accumulated prior
to the sched clock's registration.

Patch 02 adds handling for the time offset in the Arm arch timer
driver.  Will Deacon suggested to "disable the perf userpage if
sched_clock changes clocksource too" [2]; after thinking about this
suggestion, the conclusion is that no race condition exists between
sched_clock's registration and the perf userpage, because sched_clock's
registration finishes during the system's initialisation phase, before
the perf tool has any chance to map a userpage.  For this reason the
code is kept simple and does not acquire every perf event's seqlock
during sched_clock's registration.

Patch 03 simply passes the time offset from the arch timer driver (the
clocksource driver) to the perf event code.

[1] https://lkml.org/lkml/2020/3/20/199
[2] https://lkml.org/lkml/2020/5/1/906
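
As a rough sketch of the interface shape patch 01 describes (the exact
v2 signature is not included in this cover letter, so the name and type
of the extra output argument below are assumptions):

/* Existing interface, unchanged. */
void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate);

/*
 * Proposed variant: additionally reports, through @epoch_offset, the
 * time already accumulated by the counter before this registration
 * (the "offset prior to epoch" the cover letter refers to).
 */
void sched_clock_register_epoch(u64 (*read)(void), int bits,
				unsigned long rate, u64 *epoch_offset);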

Changes from v1:
- Added patch 01 to retrieve a more accurate offset at sched clock
  registration;
- Added patch 02 to handle the time offset in the arch timer driver.

Leo Yan (3):
  time/sched_clock: Add new variant sched_clock_register_epoch()
  clocksource/drivers/arm_arch_timer: Handle time offset prior to epoch
  arm64: perf_event: Fix time_offset for arch timer

 arch/arm64/kernel/perf_event.c       |  8 ++++++--
 drivers/clocksource/arm_arch_timer.c | 10 +++++++++-
 include/clocksource/arm_arch_timer.h |  6 ++++++
 include/linux/sched_clock.h          | 10 ++++++++++
 kernel/time/sched_clock.c            | 13 ++++++++++++-
 5 files changed, 43 insertions(+), 4 deletions(-)

Comments

Peter Zijlstra May 11, 2020, 9:22 a.m. UTC | #1
On Tue, May 05, 2020 at 09:55:41PM +0800, Leo Yan wrote:
> This patch set fixes the time offset prior to epoch for the Arm arch
> timer.  The series mainly follows the suggestions made on LKML [1].
> 
> To achieve an accurate time offset for a clock source prior to epoch,
> patch 01 adds a new variant, sched_clock_register_epoch(), which
> outputs an extra argument reporting the time offset accumulated prior
> to the sched clock's registration.
> 
> Patch 02 adds handling for the time offset in the Arm arch timer
> driver.  Will Deacon suggested to "disable the perf userpage if
> sched_clock changes clocksource too" [2]; after thinking about this
> suggestion, the conclusion is that no race condition exists between
> sched_clock's registration and the perf userpage, because sched_clock's
> registration finishes during the system's initialisation phase, before
> the perf tool has any chance to map a userpage.  For this reason the
> code is kept simple and does not acquire every perf event's seqlock
> during sched_clock's registration.
> 

AFAICT that's still completely buggered. The fact that the clock starts
late is completely irrelevant, and might just be confusing you.

How about this?

(_completely_ untested)

---
 arch/arm64/kernel/perf_event.c | 27 ++++++++++++++++++---------
 include/linux/sched_clock.h    | 28 ++++++++++++++++++++++++++++
 kernel/time/sched_clock.c      | 41 +++++++++++++----------------------------
 3 files changed, 59 insertions(+), 37 deletions(-)

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 4d7879484cec..81a49a916660 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1165,28 +1165,37 @@ device_initcall(armv8_pmu_driver_init)
 void arch_perf_update_userpage(struct perf_event *event,
 			       struct perf_event_mmap_page *userpg, u64 now)
 {
-	u32 freq;
-	u32 shift;
+	struct clock_read_data *rd;
+	unsigned int seq;
 
 	/*
 	 * Internal timekeeping for enabled/running/stopped times
 	 * is always computed with the sched_clock.
 	 */
-	freq = arch_timer_get_rate();
 	userpg->cap_user_time = 1;
+	userpg->cap_user_time_zero = 1;
+
+	do {
+		rd = sched_clock_read_begin(&seq);
+
+		userpg->time_mult = rd->mult;
+		userpg->time_shift = rd->shift;
+		userpg->time_offset = rd->epoch_ns;
+
+		userpg->time_zero -= (rd->epoch_cyc * rd->shift) >> rd->shift;
+
+	} while (sched_clock_read_retry(seq));
+
+	userpg->time_offset = userpf->time_zero - now;
 
-	clocks_calc_mult_shift(&userpg->time_mult, &shift, freq,
-			NSEC_PER_SEC, 0);
 	/*
 	 * time_shift is not expected to be greater than 31 due to
 	 * the original published conversion algorithm shifting a
 	 * 32-bit value (now specifies a 64-bit value) - refer
 	 * perf_event_mmap_page documentation in perf_event.h.
 	 */
-	if (shift == 32) {
-		shift = 31;
+	if (userpg->time_shift == 32) {
+		userpg->time_shift = 31;
 		userpg->time_mult >>= 1;
 	}
-	userpg->time_shift = (u16)shift;
-	userpg->time_offset = -now;
 }
diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h
index 0bb04a96a6d4..939dfbcd3289 100644
--- a/include/linux/sched_clock.h
+++ b/include/linux/sched_clock.h
@@ -6,6 +6,34 @@
 #define LINUX_SCHED_CLOCK
 
 #ifdef CONFIG_GENERIC_SCHED_CLOCK
+/**
+ * struct clock_read_data - data required to read from sched_clock()
+ *
+ * @epoch_ns:		sched_clock() value at last update
+ * @epoch_cyc:		Clock cycle value at last update.
+ * @sched_clock_mask:   Bitmask for two's complement subtraction of non 64bit
+ *			clocks.
+ * @read_sched_clock:	Current clock source (or dummy source when suspended).
+ * @mult:		Multipler for scaled math conversion.
+ * @shift:		Shift value for scaled math conversion.
+ *
+ * Care must be taken when updating this structure; it is read by
+ * some very hot code paths. It occupies <=40 bytes and, when combined
+ * with the seqcount used to synchronize access, comfortably fits into
+ * a 64 byte cache line.
+ */
+struct clock_read_data {
+	u64 epoch_ns;
+	u64 epoch_cyc;
+	u64 sched_clock_mask;
+	u64 (*read_sched_clock)(void);
+	u32 mult;
+	u32 shift;
+};
+
+extern struct clock_read_data *sched_clock_read_begin(unsigned int *seq);
+extern bool sched_clock_read_retry(unsigned int seq);
+
 extern void generic_sched_clock_init(void);
 
 extern void sched_clock_register(u64 (*read)(void), int bits,
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index fa3f800d7d76..c6d63b0d2999 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -19,31 +19,6 @@
 
 #include "timekeeping.h"
 
-/**
- * struct clock_read_data - data required to read from sched_clock()
- *
- * @epoch_ns:		sched_clock() value at last update
- * @epoch_cyc:		Clock cycle value at last update.
- * @sched_clock_mask:   Bitmask for two's complement subtraction of non 64bit
- *			clocks.
- * @read_sched_clock:	Current clock source (or dummy source when suspended).
- * @mult:		Multipler for scaled math conversion.
- * @shift:		Shift value for scaled math conversion.
- *
- * Care must be taken when updating this structure; it is read by
- * some very hot code paths. It occupies <=40 bytes and, when combined
- * with the seqcount used to synchronize access, comfortably fits into
- * a 64 byte cache line.
- */
-struct clock_read_data {
-	u64 epoch_ns;
-	u64 epoch_cyc;
-	u64 sched_clock_mask;
-	u64 (*read_sched_clock)(void);
-	u32 mult;
-	u32 shift;
-};
-
 /**
  * struct clock_data - all data needed for sched_clock() (including
  *                     registration of a new clock source)
@@ -93,6 +68,17 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 	return (cyc * mult) >> shift;
 }
 
+struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
+{
+	*seq = raw_read_seqcount(&cs.seq);
+	return cs.read_data + (seq & 1);
+}
+
+struct bool sched_clock_read_retry(unsigned int seq)
+{
+	return read_seqcount_retry(&cd.seq, seq);
+}
+
 unsigned long long notrace sched_clock(void)
 {
 	u64 cyc, res;
@@ -100,13 +86,12 @@ unsigned long long notrace sched_clock(void)
 	struct clock_read_data *rd;
 
 	do {
-		seq = raw_read_seqcount(&cd.seq);
-		rd = cd.read_data + (seq & 1);
+		rd = sched_clock_read_begin(&seq);
 
 		cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
 		      rd->sched_clock_mask;
 		res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
-	} while (read_seqcount_retry(&cd.seq, seq));
+	} while (sched_clock_read_retry(seq));
 
 	return res;
 }
Peter Zijlstra May 11, 2020, 9:25 a.m. UTC | #2
On Mon, May 11, 2020 at 11:22:00AM +0200, Peter Zijlstra wrote:

> (_completely_ untested)
> 
> ---
>  arch/arm64/kernel/perf_event.c | 27 ++++++++++++++++++---------
>  include/linux/sched_clock.h    | 28 ++++++++++++++++++++++++++++
>  kernel/time/sched_clock.c      | 41 +++++++++++++----------------------------
>  3 files changed, 59 insertions(+), 37 deletions(-)
> 
> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
> index 4d7879484cec..81a49a916660 100644
> --- a/arch/arm64/kernel/perf_event.c
> +++ b/arch/arm64/kernel/perf_event.c
> @@ -1165,28 +1165,37 @@ device_initcall(armv8_pmu_driver_init)
>  void arch_perf_update_userpage(struct perf_event *event,
>  			       struct perf_event_mmap_page *userpg, u64 now)
>  {
> -	u32 freq;
> -	u32 shift;
> +	struct clock_read_data *rd;
> +	unsigned int seq;
>  
>  	/*
>  	 * Internal timekeeping for enabled/running/stopped times
>  	 * is always computed with the sched_clock.
>  	 */
> -	freq = arch_timer_get_rate();
>  	userpg->cap_user_time = 1;
> +	userpg->cap_user_time_zero = 1;
> +
> +	do {
> +		rd = sched_clock_read_begin(&seq);
> +
> +		userpg->time_mult = rd->mult;
> +		userpg->time_shift = rd->shift;
> +		userpg->time_offset = rd->epoch_ns;

			^^^^^^^ wants to be time_zero

> +
> +		userpg->time_zero -= (rd->epoch_cyc * rd->shift) >> rd->shift;
> +
> +	} while (sched_clock_read_retry(seq));
> +
> +	userpg->time_offset = userpf->time_zero - now;
>  
> -	clocks_calc_mult_shift(&userpg->time_mult, &shift, freq,
> -			NSEC_PER_SEC, 0);

And that ^^^ was complete crap.

>  	/*
>  	 * time_shift is not expected to be greater than 31 due to
>  	 * the original published conversion algorithm shifting a
>  	 * 32-bit value (now specifies a 64-bit value) - refer
>  	 * perf_event_mmap_page documentation in perf_event.h.
>  	 */
> -	if (shift == 32) {
> -		shift = 31;
> +	if (userpg->time_shift == 32) {
> +		userpg->time_shift = 31;
>  		userpg->time_mult >>= 1;
>  	}
> -	userpg->time_shift = (u16)shift;
> -	userpg->time_offset = -now;
>  }
Leo Yan May 12, 2020, 6:38 a.m. UTC | #3
Hi Peter,

On Mon, May 11, 2020 at 11:25:19AM +0200, Peter Zijlstra wrote:
> On Mon, May 11, 2020 at 11:22:00AM +0200, Peter Zijlstra wrote:
> 
> > (_completely_ untested)
> > 
> > ---
> >  arch/arm64/kernel/perf_event.c | 27 ++++++++++++++++++---------
> >  include/linux/sched_clock.h    | 28 ++++++++++++++++++++++++++++
> >  kernel/time/sched_clock.c      | 41 +++++++++++++----------------------------
> >  3 files changed, 59 insertions(+), 37 deletions(-)
> > 
> > diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
> > index 4d7879484cec..81a49a916660 100644
> > --- a/arch/arm64/kernel/perf_event.c
> > +++ b/arch/arm64/kernel/perf_event.c
> > @@ -1165,28 +1165,37 @@ device_initcall(armv8_pmu_driver_init)
> >  void arch_perf_update_userpage(struct perf_event *event,
> >  			       struct perf_event_mmap_page *userpg, u64 now)
> >  {
> > -	u32 freq;
> > -	u32 shift;
> > +	struct clock_read_data *rd;
> > +	unsigned int seq;
> >  
> >  	/*
> >  	 * Internal timekeeping for enabled/running/stopped times
> >  	 * is always computed with the sched_clock.
> >  	 */
> > -	freq = arch_timer_get_rate();
> >  	userpg->cap_user_time = 1;
> > +	userpg->cap_user_time_zero = 1;
> > +
> > +	do {
> > +		rd = sched_clock_read_begin(&seq);
> > +
> > +		userpg->time_mult = rd->mult;
> > +		userpg->time_shift = rd->shift;
> > +		userpg->time_offset = rd->epoch_ns;
> 
> 			^^^^^^^ wants to be time_zero
> 
> > +
> > +		userpg->time_zero -= (rd->epoch_cyc * rd->shift) >> rd->shift;
> > +
> > +	} while (sched_clock_read_retry(seq));
> > +
> > +	userpg->time_offset = userpf->time_zero - now;
> >  
> > -	clocks_calc_mult_shift(&userpg->time_mult, &shift, freq,
> > -			NSEC_PER_SEC, 0);
> 
> And that ^^^ was complete crap.
> 
> >  	/*
> >  	 * time_shift is not expected to be greater than 31 due to
> >  	 * the original published conversion algorithm shifting a
> >  	 * 32-bit value (now specifies a 64-bit value) - refer
> >  	 * perf_event_mmap_page documentation in perf_event.h.
> >  	 */
> > -	if (shift == 32) {
> > -		shift = 31;
> > +	if (userpg->time_shift == 32) {
> > +		userpg->time_shift = 31;
> >  		userpg->time_mult >>= 1;
> >  	}
> > -	userpg->time_shift = (u16)shift;
> > -	userpg->time_offset = -now;
> >  }

I have verified this change; it works as expected on my Arm64 board.
I have also pasted below the updated code, which builds successfully
after some minor fixes.

I am not sure how to proceed: will you merge this, or do you want me
to send out formal patches (or only the Arm64 part)?

P.S. It's a shame I still missed your suggestion in the previous
thread even though you had provided enough info.  Thank you for the
help!

---8<---

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 4d7879484cec..5a34e9264c5b 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -19,6 +19,7 @@
 #include <linux/of.h>
 #include <linux/perf/arm_pmu.h>
 #include <linux/platform_device.h>
+#include <linux/sched_clock.h>
 #include <linux/smp.h>
 
 /* ARMv8 Cortex-A53 specific event types. */
@@ -1165,28 +1166,26 @@ device_initcall(armv8_pmu_driver_init)
 void arch_perf_update_userpage(struct perf_event *event,
 			       struct perf_event_mmap_page *userpg, u64 now)
 {
-	u32 freq;
-	u32 shift;
+	struct clock_read_data *rd;
+	unsigned int seq;
 
 	/*
 	 * Internal timekeeping for enabled/running/stopped times
 	 * is always computed with the sched_clock.
 	 */
-	freq = arch_timer_get_rate();
 	userpg->cap_user_time = 1;
+	userpg->cap_user_time_zero = 1;
 
-	clocks_calc_mult_shift(&userpg->time_mult, &shift, freq,
-			NSEC_PER_SEC, 0);
-	/*
-	 * time_shift is not expected to be greater than 31 due to
-	 * the original published conversion algorithm shifting a
-	 * 32-bit value (now specifies a 64-bit value) - refer
-	 * perf_event_mmap_page documentation in perf_event.h.
-	 */
-	if (shift == 32) {
-		shift = 31;
-		userpg->time_mult >>= 1;
-	}
-	userpg->time_shift = (u16)shift;
-	userpg->time_offset = -now;
+	do {
+		rd = sched_clock_read_begin(&seq);
+
+		userpg->time_mult = rd->mult;
+		userpg->time_shift = rd->shift;
+		userpg->time_zero = rd->epoch_ns;
+
+		userpg->time_zero -= (rd->epoch_cyc * rd->mult) >> rd->shift;
+
+	} while (sched_clock_read_retry(seq));
+
+	userpg->time_offset = userpg->time_zero - now;
 }
diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h
index 0bb04a96a6d4..528718e4ed52 100644
--- a/include/linux/sched_clock.h
+++ b/include/linux/sched_clock.h
@@ -6,6 +6,34 @@
 #define LINUX_SCHED_CLOCK
 
 #ifdef CONFIG_GENERIC_SCHED_CLOCK
+/**
+ * struct clock_read_data - data required to read from sched_clock()
+ *
+ * @epoch_ns:		sched_clock() value at last update
+ * @epoch_cyc:		Clock cycle value at last update.
+ * @sched_clock_mask:   Bitmask for two's complement subtraction of non 64bit
+ *			clocks.
+ * @read_sched_clock:	Current clock source (or dummy source when suspended).
+ * @mult:		Multipler for scaled math conversion.
+ * @shift:		Shift value for scaled math conversion.
+ *
+ * Care must be taken when updating this structure; it is read by
+ * some very hot code paths. It occupies <=40 bytes and, when combined
+ * with the seqcount used to synchronize access, comfortably fits into
+ * a 64 byte cache line.
+ */
+struct clock_read_data {
+	u64 epoch_ns;
+	u64 epoch_cyc;
+	u64 sched_clock_mask;
+	u64 (*read_sched_clock)(void);
+	u32 mult;
+	u32 shift;
+};
+
+extern struct clock_read_data *sched_clock_read_begin(unsigned int *seq);
+extern int sched_clock_read_retry(unsigned int seq);
+
 extern void generic_sched_clock_init(void);
 
 extern void sched_clock_register(u64 (*read)(void), int bits,
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index fa3f800d7d76..0acaadc3156c 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -19,31 +19,6 @@
 
 #include "timekeeping.h"
 
-/**
- * struct clock_read_data - data required to read from sched_clock()
- *
- * @epoch_ns:		sched_clock() value at last update
- * @epoch_cyc:		Clock cycle value at last update.
- * @sched_clock_mask:   Bitmask for two's complement subtraction of non 64bit
- *			clocks.
- * @read_sched_clock:	Current clock source (or dummy source when suspended).
- * @mult:		Multipler for scaled math conversion.
- * @shift:		Shift value for scaled math conversion.
- *
- * Care must be taken when updating this structure; it is read by
- * some very hot code paths. It occupies <=40 bytes and, when combined
- * with the seqcount used to synchronize access, comfortably fits into
- * a 64 byte cache line.
- */
-struct clock_read_data {
-	u64 epoch_ns;
-	u64 epoch_cyc;
-	u64 sched_clock_mask;
-	u64 (*read_sched_clock)(void);
-	u32 mult;
-	u32 shift;
-};
-
 /**
  * struct clock_data - all data needed for sched_clock() (including
  *                     registration of a new clock source)
@@ -93,6 +68,17 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 	return (cyc * mult) >> shift;
 }
 
+struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
+{
+	*seq = raw_read_seqcount(&cd.seq);
+	return cd.read_data + (*seq & 1);
+}
+
+int sched_clock_read_retry(unsigned int seq)
+{
+	return read_seqcount_retry(&cd.seq, seq);
+}
+
 unsigned long long notrace sched_clock(void)
 {
 	u64 cyc, res;
@@ -100,13 +86,12 @@ unsigned long long notrace sched_clock(void)
 	struct clock_read_data *rd;
 
 	do {
-		seq = raw_read_seqcount(&cd.seq);
-		rd = cd.read_data + (seq & 1);
+		rd = sched_clock_read_begin(&seq);
 
 		cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
 		      rd->sched_clock_mask;
 		res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
-	} while (read_seqcount_retry(&cd.seq, seq));
+	} while (sched_clock_read_retry(seq));
 
 	return res;
 }
Peter Zijlstra May 12, 2020, 8:57 a.m. UTC | #4
On Tue, May 12, 2020 at 02:38:12PM +0800, Leo Yan wrote:

> I have verified this change; it works as expected on my Arm64 board.
> I have also pasted below the updated code, which builds successfully
> after some minor fixes.

W00t !

> I am not sure how to proceed: will you merge this, or do you want me
> to send out formal patches (or only the Arm64 part)?

I suppose I can write a changelog for the thing; Will asked for
another change as well.

> P.S. It's a shame I still missed your suggestion in the previous
> thread even though you had provided enough info.  Thank you for the
> help!

All good.


> ---8<---

> -	/*
> -	 * time_shift is not expected to be greater than 31 due to
> -	 * the original published conversion algorithm shifting a
> -	 * 32-bit value (now specifies a 64-bit value) - refer
> -	 * perf_event_mmap_page documentation in perf_event.h.
> -	 */
> -	if (shift == 32) {
> -		shift = 31;
> -		userpg->time_mult >>= 1;
> -	}

Is there a reason you completely lost that? IIRC I preserved that.
Although I don't know if it is still relevant.

I'll keep it for now, and removal can be a separate patch with proper
justification, ok?
Peter Zijlstra May 12, 2020, 9:19 a.m. UTC | #5
On Tue, May 12, 2020 at 02:38:12PM +0800, Leo Yan wrote:
> @@ -1165,28 +1166,26 @@ device_initcall(armv8_pmu_driver_init)
>  void arch_perf_update_userpage(struct perf_event *event,
>  			       struct perf_event_mmap_page *userpg, u64 now)
>  {
> +	struct clock_read_data *rd;
> +	unsigned int seq;
>  
>  	/*
>  	 * Internal timekeeping for enabled/running/stopped times
>  	 * is always computed with the sched_clock.
>  	 */
>  	userpg->cap_user_time = 1;
> +	userpg->cap_user_time_zero = 1;
>  
> +	do {
> +		rd = sched_clock_read_begin(&seq);
> +
> +		userpg->time_mult = rd->mult;
> +		userpg->time_shift = rd->shift;
> +		userpg->time_zero = rd->epoch_ns;
> +
> +		userpg->time_zero -= (rd->epoch_cyc * rd->mult) >> rd->shift;

Damn, I think this is broken vs the counter wrapping.

So what the sched_clock code does is:

	cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift)

But because the perf interface assumes a simple linear relation, we
can't express that properly.

Now, your arm64 counter is 56 bits, so wrapping is rare, but still, we
should probably fix that. And that probably needs an ABI extension
*sigh*.

> +
> +	} while (sched_clock_read_retry(seq));
> +
> +	userpg->time_offset = userpg->time_zero - now;
>  }
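
To illustrate the wrap problem described above with concrete numbers,
here is a small stand-alone sketch (a hypothetical 16-bit counter and
1 cycle == 1 ns are assumed for brevity; this is plain demo C, not
kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t mask = (1ULL << 16) - 1;	/* pretend the counter is 16 bits   */
	const uint32_t mult = 1, shift = 0;	/* 1 cycle == 1 ns, for simplicity  */
	uint64_t epoch_cyc = 0xff00;		/* counter value at the last update */
	uint64_t epoch_ns  = 1000000;		/* sched_clock() at the last update */
	uint64_t cyc = 0x0010;			/* a reading taken after the wrap   */

	/* What sched_clock() does: masked delta, stays monotonic (1000272 ns). */
	uint64_t ns_masked = epoch_ns +
		((((cyc - epoch_cyc) & mask) * mult) >> shift);

	/*
	 * What the plain linear perf ABI does: time_zero was derived as
	 * epoch_ns - ((epoch_cyc * mult) >> shift), so the wrapped cyc
	 * makes time jump backwards (934736 ns).
	 */
	uint64_t time_zero = epoch_ns - ((epoch_cyc * mult) >> shift);
	uint64_t ns_linear = time_zero + ((cyc * mult) >> shift);

	printf("masked: %llu ns, linear: %llu ns\n",
	       (unsigned long long)ns_masked, (unsigned long long)ns_linear);
	return 0;
}
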
Mark Rutland May 12, 2020, 10:01 a.m. UTC | #6
On Tue, May 12, 2020 at 11:19:18AM +0200, Peter Zijlstra wrote:
> On Tue, May 12, 2020 at 02:38:12PM +0800, Leo Yan wrote:
> > @@ -1165,28 +1166,26 @@ device_initcall(armv8_pmu_driver_init)
> >  void arch_perf_update_userpage(struct perf_event *event,
> >  			       struct perf_event_mmap_page *userpg, u64 now)
> >  {
> > +	struct clock_read_data *rd;
> > +	unsigned int seq;
> >  
> >  	/*
> >  	 * Internal timekeeping for enabled/running/stopped times
> >  	 * is always computed with the sched_clock.
> >  	 */
> >  	userpg->cap_user_time = 1;
> > +	userpg->cap_user_time_zero = 1;
> >  
> > +	do {
> > +		rd = sched_clock_read_begin(&seq);
> > +
> > +		userpg->time_mult = rd->mult;
> > +		userpg->time_shift = rd->shift;
> > +		userpg->time_zero = rd->epoch_ns;
> > +
> > +		userpg->time_zero -= (rd->epoch_cyc * rd->mult) >> rd->shift;
> 
> Damn, I think this is broken vs the counter wrapping.
> 
> So what the sched_clock code does is:
> 
> 	cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift)
> 
> But because the perf interface assumes a simple linear relation, we
> can't express that properly.
> 
> Now, your arm64 counter is 56 bits, so wrapping is rare, but still, we
> should probably fix that. And that probably needs an ABI extention
> *sigh*.

FWIW, it's /at least/ 56 bits wide, and the ARM ARM says that it
shouldn't wrap in fewer than 40 years, so no correct implementation
should wrap before the 2050s.

If it's wider than 56 bits, the 56-bit portion could wrap more quickly
than that, so we should probably always treat it as 64-bits.

From ARMv8.6 it's always 64 bits wide @ a nominal 1GHz, and a 64-bit
wrap will take ~584.9 years (with a 56-bit wrap taking ~834 days).
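
(For reference, the arithmetic behind those figures: 2^64 cycles at
10^9 Hz is about 1.84 x 10^10 seconds, i.e. roughly 584.9 years, and
2^56 cycles at 10^9 Hz is about 7.2 x 10^7 seconds, i.e. roughly 834
days.)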

See D11.1.2 "The system counter" in the latest ARM ARM (0487F.b):

https://static.docs.arm.com/ddi0487/fb/DDI0487F_b_armv8_arm.pdf?_ga=2.83012310.1749782910.1589218924-1447552059.1588172444

https://developer.arm.com/docs/ddi0487/latest

Thanks,
Mark.
Peter Zijlstra May 12, 2020, 11:21 a.m. UTC | #7
On Tue, May 12, 2020 at 11:19:18AM +0200, Peter Zijlstra wrote:

> Now, your arm64 counter is 56 bits, so wrapping is rare, but still, we
> should probably fix that. And that probably needs an ABI extension
> *sigh*.

Something like so, on top of the other (2).

---

--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1174,6 +1174,7 @@ void arch_perf_update_userpage(struct pe
 
 	userpg->cap_user_time = 0;
 	userpg->cap_user_time_zero = 0;
+	userpg->cap_user_time_short = 0;
 
 	do {
 		rd = sched_clock_read_begin(&seq);
@@ -1184,13 +1185,16 @@ void arch_perf_update_userpage(struct pe
 		userpg->time_mult = rd->mult;
 		userpg->time_shift = rd->shift;
 		userpg->time_zero = rd->epoch_ns;
+		userpg->time_cycle = rd->epoch_cyc;
+		userpg->time_mask = rd->sched_clock_mask;
 
 		/*
 		 * This isn't strictly correct, the ARM64 counter can be
-		 * 'short' and then we get funnies when it wraps. The correct
-		 * thing would be to extend the perf ABI with a cycle and mask
-		 * value, but because wrapping on ARM64 is very rare in
-		 * practise this 'works'.
+		 * 'short' and then we get funnies when it wraps. But this
+		 * 'works' with the cap_user_time ABI.
+		 *
+		 * When userspace knows about cap_user_time_short, it
+		 * can do the correct thing.
 		 */
 		userpg->time_zero -= (rd->epoch_cyc * rd->mult) >> rd->shift;
 
@@ -1215,4 +1219,5 @@ void arch_perf_update_userpage(struct pe
 	 */
 	userpg->cap_user_time = 1;
 	userpg->cap_user_time_zero = 1;
+	userpg->cap_user_time_short = 1;
 }
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -532,9 +532,10 @@ struct perf_event_mmap_page {
 				cap_bit0_is_deprecated	: 1, /* Always 1, signals that bit 0 is zero */
 
 				cap_user_rdpmc		: 1, /* The RDPMC instruction can be used to read counts */
-				cap_user_time		: 1, /* The time_* fields are used */
+				cap_user_time		: 1, /* The time_{shift,mult,offset} fields are used */
 				cap_user_time_zero	: 1, /* The time_zero field is used */
-				cap_____res		: 59;
+				cap_user_time_short	: 1, /* the time_{cycle,mask} fields are used */
+				cap_____res		: 58;
 		};
 	};
 
@@ -593,13 +594,29 @@ struct perf_event_mmap_page {
 	 *               ((rem * time_mult) >> time_shift);
 	 */
 	__u64	time_zero;
+
 	__u32	size;			/* Header size up to __reserved[] fields. */
+	__u32	__reserved_1;
+
+	/*
+	 * If cap_usr_time_short, the hardware clock is less than 64bit wide
+	 * and we must compute the 'cyc' value, as used by cap_usr_time, as:
+	 *
+	 *   cyc = time_cycles + ((cyc - time_cycles) & time_mask)
+	 *
+	 * NOTE: this form is explicitly chosen such that cap_usr_time_short
+	 *       is an extension on top of cap_usr_time, and code that doesn't
+	 *       know about cap_usr_time_short still works under the assumption
+	 *       the counter doesn't wrap.
+	 */
+	__u64	time_cycles;
+	__u64	time_mask;
 
 		/*
 		 * Hole for extension of the self monitor capabilities
 		 */
 
-	__u8	__reserved[118*8+4];	/* align to 1k. */
+	__u8	__reserved[116*8];	/* align to 1k. */
 
 	/*
 	 * Control data for the mmap() data buffer.
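
As a rough user-space sketch of how the extended ABI above would be
consumed (field and capability names are taken from the diff, so this
assumes a uapi header that already contains them; cap_user_time_zero is
assumed to be set; the split multiply follows the conversion documented
in perf_event.h, and the seqlock check against pc->lock that a real
reader also needs is omitted for brevity):

#include <linux/perf_event.h>	/* assumes the patched uapi header */

static __u64 perf_counter_to_ns(const struct perf_event_mmap_page *pc,
				__u64 cyc)
{
	__u64 quot, rem;

	/* Re-extend a short, wrapping counter around the last epoch. */
	if (pc->cap_user_time_short)
		cyc = pc->time_cycles +
		      ((cyc - pc->time_cycles) & pc->time_mask);

	/* Overflow-safe (quotient/remainder) scaled multiply. */
	quot = cyc >> pc->time_shift;
	rem  = cyc & (((__u64)1 << pc->time_shift) - 1);

	return pc->time_zero + quot * pc->time_mult +
	       ((rem * pc->time_mult) >> pc->time_shift);
}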