diff mbox series

[2/3,v2] tools/power turbostat: Introduce functions to accumulate RAPL consumption

Message ID db96fd31afd0ff65e4041665293b96c984e675bc.1586782089.git.yu.c.chen@intel.com (mailing list archive)
State RFC, archived
Headers show
Series tools/power turbostat: Enable accumulated energy consumption for long time sampling | expand

Commit Message

Chen Yu April 14, 2020, 12:56 p.m. UTC
Since the RAPL Joule Counter is 32 bit, turbostat would
only print a *star* instead of printing the actual energy
consumed to indicate the overflow due to long duration.
This does not meet the requirement from servers as the
sampling time of turbostat is usually very long on servers.

So maintain a set of MSR buffer, and update them
periodically before the 32bit MSR register is wrapped round,
so as to avoid the overflow.

The idea is similar to the implementation of ktime_get():

Periodical MSR timer:
total_rapl_sum += (current_rapl_msr - last_rapl_msr);

Using get_msr_sum() to get the accumulated RAPL:
return (current_rapl_msr - last_rapl_msr) + total_rapl_sum;

The accumulated RAPL mechanism will be turned on in next patch.

Originally-by: Aaron Lu <aaron.lwe@gmail.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
v2: According to Len's suggestion:
    1. Enable the accumulated RAPL mechanism by default.
    2. Re-use the rapl_joule_counter_range to represent the
       the timeout of periodical timer.
    3. Remove the condition check in v1 patch when reading RAPL registers.
--
 tools/power/x86/turbostat/Makefile    |   2 +-
 tools/power/x86/turbostat/turbostat.c | 209 ++++++++++++++++++++++++++
 2 files changed, 210 insertions(+), 1 deletion(-)

Comments

Doug Smythies April 16, 2020, 4:03 a.m. UTC | #1
On 2020.04.15 05:57 Chen Yu wrote:

...

> v2: According to Len's suggestion:
>    1. Enable the accumulated RAPL mechanism by default.

I am not a fan of this, but O.K.

>    2. Re-use the rapl_joule_counter_range to represent the
>       the timeout of periodical timer.

No, please no. It is too easy to still have an overflow.

...
> +	/*
> +	 * A wraparound time is calculated early.
> +	 */
> +	its.it_interval.tv_sec = rapl_joule_counter_range;

Would this be o.K.?

+	its.it_interval.tv_sec = rapl_joule_counter_range / 2;

> +	its.it_interval.tv_nsec = 0;

The way it was sent, this patch set does not work.
It still overflows.

Example, sample time calculated to ensure overflow:

Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
100.00  3500    3592125 80      9.72    0.12
100.00  3500    3587391 79      9.77    0.12

Actual package watts was around 65.

However, if this additional patch is applied (I only fixed one of them):

doug@s18:~/temp-k-git/linux/tools/power/x86/turbostat$ git diff
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 29fc4069f467..4d72d9be5209 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -1350,7 +1350,8 @@ delta_package(struct pkg_data *new, struct pkg_data *old)

        old->gfx_mhz = new->gfx_mhz;

-       DELTA_WRAP32(new->energy_pkg, old->energy_pkg);
+/*     DELTA_WRAP32(new->energy_pkg, old->energy_pkg);  */
+       old->energy_pkg = new->energy_pkg - old->energy_pkg;
        DELTA_WRAP32(new->energy_cores, old->energy_cores);
        DELTA_WRAP32(new->energy_gfx, old->energy_gfx);
        DELTA_WRAP32(new->energy_dram, old->energy_dram);

Then it seems to work.

Example:

doug@s15:~/temp-turbostat$ sudo ./turbostat --Summary --show Busy%,Bzy_MHz,PkgTmp,PkgWatt,GFXWatt,IRQ --interval 1200
...
RAPL: 690 sec. Joule Counter Range, at 95 Watts
...
Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
100.00  3500    3592328 80      64.32   0.12
100.00  3500    3595195 79      64.37   0.12

... Doug
Chen Yu April 16, 2020, 5:06 p.m. UTC | #2
Hi Doug,
Thanks for reviewing this patch.
On Wed, Apr 15, 2020 at 09:03:34PM -0700, Doug Smythies wrote:
> On 2020.04.15 05:57 Chen Yu wrote:
> 
> ...
> 
> > v2: According to Len's suggestion:
> >    1. Enable the accumulated RAPL mechanism by default.
> 
> I am not a fan of this, but O.K.
> 
> >    2. Re-use the rapl_joule_counter_range to represent the
> >       the timeout of periodical timer.
> 
> No, please no. It is too easy to still have an overflow.
> 
> ...
> > +	/*
> > +	 * A wraparound time is calculated early.
> > +	 */
> > +	its.it_interval.tv_sec = rapl_joule_counter_range;
> 
> Would this be o.K.?
> 
> +	its.it_interval.tv_sec = rapl_joule_counter_range / 2;
> 
This should be okay. I've checked the defination of TDP, and
on a wiki page it has mentioned that[1]:
"Some sources state that the peak power for a microprocessor
is usually 1.5 times the TDP rating"
although the defination of TDP varies, using 2 * TDP should
be safe.
> > +	its.it_interval.tv_nsec = 0;
> 
> The way it was sent, this patch set does not work.
> It still overflows.
> 
> Example, sample time calculated to ensure overflow:
> 
> Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
> 100.00  3500    3592125 80      9.72    0.12
> 100.00  3500    3587391 79      9.77    0.12
> 
> Actual package watts was around 65.
> 
> However, if this additional patch is applied (I only fixed one of them):
> 
> doug@s18:~/temp-k-git/linux/tools/power/x86/turbostat$ git diff
> diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
> index 29fc4069f467..4d72d9be5209 100644
> --- a/tools/power/x86/turbostat/turbostat.c
> +++ b/tools/power/x86/turbostat/turbostat.c
> @@ -1350,7 +1350,8 @@ delta_package(struct pkg_data *new, struct pkg_data *old)
> 
>         old->gfx_mhz = new->gfx_mhz;
> 
> -       DELTA_WRAP32(new->energy_pkg, old->energy_pkg);
> +/*     DELTA_WRAP32(new->energy_pkg, old->energy_pkg);  */
> +       old->energy_pkg = new->energy_pkg - old->energy_pkg;
>         DELTA_WRAP32(new->energy_cores, old->energy_cores);
>         DELTA_WRAP32(new->energy_gfx, old->energy_gfx);
>         DELTA_WRAP32(new->energy_dram, old->energy_dram);
> 
> Then it seems to work.
> 
Nice catch, I did not realize that the energy_pkg field has
already been converted into accumuted variable which does not
need to consider the wrapping(64bit should be long enough for
normal test cases).

Thanks,
Chenyu
> Example:
> 
> doug@s15:~/temp-turbostat$ sudo ./turbostat --Summary --show Busy%,Bzy_MHz,PkgTmp,PkgWatt,GFXWatt,IRQ --interval 1200
> ...
> RAPL: 690 sec. Joule Counter Range, at 95 Watts
> ...
> Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
> 100.00  3500    3592328 80      64.32   0.12
> 100.00  3500    3595195 79      64.37   0.12
> 
> ... Doug
> 
>
Doug Smythies April 16, 2020, 8:40 p.m. UTC | #3
On 2020.04.10:06 Chen Yu wrote:
> On Wed, Apr 15, 2020 at 09:03:34PM -0700, Doug Smythies wrote:
>> On 2020.04.15 05:57 Chen Yu wrote:
> 
>>> +	/*
>>> +	 * A wraparound time is calculated early.
>>> +	 */
>>> +	its.it_interval.tv_sec = rapl_joule_counter_range;
>> 
>> Would this be o.K.?
>> 
>> +	its.it_interval.tv_sec = rapl_joule_counter_range / 2;
>> 
> This should be okay. I've checked the defination of TDP, and
> on a wiki page it has mentioned that[1]:
> "Some sources state that the peak power for a microprocessor
> is usually 1.5 times the TDP rating"
> although the defination of TDP varies, using 2 * TDP should
> be safe.

O.K. Great.
By the way, I have already tested it (in addition to the previously e-mailed patch correction):

First, with this:

	its.it_interval.tv_sec = rapl_joule_counter_range;

Result:

sudo ./turbostat --Summary --interval 3200 --show Avg_MHz,Busy%,Bzy_MHz,IRQ,PkgTmp,PkgWatt,GFXWatt
...
RAPL: 2759 sec. Joule Counter Range, at 95 Watts
...
cpu0: MSR_PKG_POWER_INFO: 0x000002f8 (95 W TDP, RAPL 0 - 0 W, 0.000000 sec.)
cpu0: MSR_PKG_POWER_LIMIT: 0x4283e800dd8320 (UNlocked)
cpu0: PKG Limit #1: ENabled (100.000000 Watts, 28.000000 sec, clamp ENabled)
cpu0: PKG Limit #2: ENabled (125.000000 Watts, 0.002441* sec, clamp DISabled)
...
Avg_MHz Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
4039    100.20  4031    7211202 64      18.29   0.00
4033    100.22  4024    7254993 66      18.00   0.00

actual (using a shorter interval, that doesn't wrap around):

Avg_MHz Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
4032    100.22  4023    676360  65      99.92   0.00
4029    100.23  4019    676629  65      99.91   0.00
4032    100.22  4023    676771  65      99.91   0.00
4037    100.22  4028    675430  65      99.91   0.00
4032    100.22  4023    675819  65      99.91   0.00
4028    100.23  4019    676541  65      99.91   0.00
4042    100.22  4033    675857  64      99.91   0.00
4029    100.23  4020    675597  65      99.91   0.00
4027    100.23  4017    676201  65      3751748943144.71        0.00
4034    100.22  4025    676402  65      99.91   0.00
4035    100.22  4026    674982  65      99.91   0.00
4032    100.22  4023    676012  64      99.91   0.00
4034    100.22  4025    723696  66      99.91   0.00
4039    100.22  4030    676342  64      99.91   0.00
4028    100.23  4018    676082  65      99.91   0.00
4028    100.23  4019    676218  65      99.91   0.00
4038    100.22  4030    675771  65      99.91   0.00
4031    100.22  4022    674282  65      3751749380702.93        0.00
4031    100.22  4022    676314  65      99.91   0.00
4039    100.22  4030    676197  65      99.91   0.00

And now with this:

	its.it_interval.tv_sec = rapl_joule_counter_range / 2;

Avg_MHz Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
4032    100.22  4023    7205931 65      99.91   0.00
4033    100.22  4023    7208003 65      99.91   0.00
4034    100.22  4024    7205563 65      99.91   0.00

and using the shorter interval:

Avg_MHz Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
4028    100.23  4019    676147  64      99.92   0.00
4027    100.23  4017    675857  65      99.91   0.00
4036    100.22  4027    675736  65      99.91   0.00
4032    100.22  4023    674758  65      99.91   0.00
4032    100.22  4022    675692  65      99.91   0.00
4032    100.22  4023    676275  65      99.91   0.00
4043    100.22  4035    676001  66      99.91   0.00
4028    100.23  4019    676277  65      99.91   0.00
4028    100.23  4019    676420  65      99.91   0.00
4028    100.23  4019    675884  65      99.91   0.00
4037    100.22  4028    675293  65      99.91   0.00
4030    100.23  4021    674025  66      99.91   0.00
4031    100.22  4022    676462  65      99.91   0.00
4032    100.22  4023    676007  66      99.91   0.00
4047    100.21  4038    676424  65      99.91   0.00
4030    100.22  4021    676853  65      99.91   0.00
4028    100.23  4019    676553  65      99.91   0.00
4034    100.22  4025    675880  65      99.91   0.00
4036    100.22  4027    674824  65      99.91   0.00
4033    100.22  4024    674577  65      99.91   0.00
4031    100.22  4022    676599  65      99.91   0.00
4041    100.22  4032    676675  66      99.91   0.00

Note that it is very much on purpose that I have set
TDP to 100 watts on this processor, whereas the
default is 95 watts. Notice the package temperature,
after several hours of running power throttled to
a CPU frequency of 4.03 GHz.

... Doug
Chen Yu April 17, 2020, 4:23 a.m. UTC | #4
On Thu, Apr 16, 2020 at 01:40:09PM -0700, Doug Smythies wrote:
> On 2020.04.10:06 Chen Yu wrote:
> > On Wed, Apr 15, 2020 at 09:03:34PM -0700, Doug Smythies wrote:
> >> On 2020.04.15 05:57 Chen Yu wrote:
> > 
> >>> +	/*
> >>> +	 * A wraparound time is calculated early.
> >>> +	 */
> >>> +	its.it_interval.tv_sec = rapl_joule_counter_range;
> >> 
> >> Would this be o.K.?
> >> 
> >> +	its.it_interval.tv_sec = rapl_joule_counter_range / 2;
> >> 
> > This should be okay. I've checked the defination of TDP, and
> > on a wiki page it has mentioned that[1]:
> > "Some sources state that the peak power for a microprocessor
> > is usually 1.5 times the TDP rating"
> > although the defination of TDP varies, using 2 * TDP should
> > be safe.
> 
> O.K. Great.
> By the way, I have already tested it (in addition to the previously e-mailed patch correction):
> 
> First, with this:
> 
> 	its.it_interval.tv_sec = rapl_joule_counter_range;
> 
> Result:
> 
> sudo ./turbostat --Summary --interval 3200 --show Avg_MHz,Busy%,Bzy_MHz,IRQ,PkgTmp,PkgWatt,GFXWatt
> ...
> RAPL: 2759 sec. Joule Counter Range, at 95 Watts
> ...
> cpu0: MSR_PKG_POWER_INFO: 0x000002f8 (95 W TDP, RAPL 0 - 0 W, 0.000000 sec.)
> cpu0: MSR_PKG_POWER_LIMIT: 0x4283e800dd8320 (UNlocked)
> cpu0: PKG Limit #1: ENabled (100.000000 Watts, 28.000000 sec, clamp ENabled)
> cpu0: PKG Limit #2: ENabled (125.000000 Watts, 0.002441* sec, clamp DISabled)
> ...
> Avg_MHz Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
> 4039    100.20  4031    7211202 64      18.29   0.00
> 4033    100.22  4024    7254993 66      18.00   0.00
> 
> actual (using a shorter interval, that doesn't wrap around):
> 
> Avg_MHz Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
> 4032    100.22  4023    676360  65      99.92   0.00
> 4029    100.23  4019    676629  65      99.91   0.00
> 4032    100.22  4023    676771  65      99.91   0.00
> 4037    100.22  4028    675430  65      99.91   0.00
> 4032    100.22  4023    675819  65      99.91   0.00
> 4028    100.23  4019    676541  65      99.91   0.00
> 4042    100.22  4033    675857  64      99.91   0.00
> 4029    100.23  4020    675597  65      99.91   0.00
> 4027    100.23  4017    676201  65      3751748943144.71        0.00
> 4034    100.22  4025    676402  65      99.91   0.00
> 4035    100.22  4026    674982  65      99.91   0.00
> 4032    100.22  4023    676012  64      99.91   0.00
> 4034    100.22  4025    723696  66      99.91   0.00
> 4039    100.22  4030    676342  64      99.91   0.00
> 4028    100.23  4018    676082  65      99.91   0.00
> 4028    100.23  4019    676218  65      99.91   0.00
> 4038    100.22  4030    675771  65      99.91   0.00
> 4031    100.22  4022    674282  65      3751749380702.93        0.00
> 4031    100.22  4022    676314  65      99.91   0.00
> 4039    100.22  4030    676197  65      99.91   0.00
> 
> And now with this:
> 
> 	its.it_interval.tv_sec = rapl_joule_counter_range / 2;
> 
> Avg_MHz Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
> 4032    100.22  4023    7205931 65      99.91   0.00
> 4033    100.22  4023    7208003 65      99.91   0.00
> 4034    100.22  4024    7205563 65      99.91   0.00
> 
> and using the shorter interval:
> 
> Avg_MHz Busy%   Bzy_MHz IRQ     PkgTmp  PkgWatt GFXWatt
> 4028    100.23  4019    676147  64      99.92   0.00
> 4027    100.23  4017    675857  65      99.91   0.00
> 4036    100.22  4027    675736  65      99.91   0.00
> 4032    100.22  4023    674758  65      99.91   0.00
> 4032    100.22  4022    675692  65      99.91   0.00
> 4032    100.22  4023    676275  65      99.91   0.00
> 4043    100.22  4035    676001  66      99.91   0.00
> 4028    100.23  4019    676277  65      99.91   0.00
> 4028    100.23  4019    676420  65      99.91   0.00
> 4028    100.23  4019    675884  65      99.91   0.00
> 4037    100.22  4028    675293  65      99.91   0.00
> 4030    100.23  4021    674025  66      99.91   0.00
> 4031    100.22  4022    676462  65      99.91   0.00
> 4032    100.22  4023    676007  66      99.91   0.00
> 4047    100.21  4038    676424  65      99.91   0.00
> 4030    100.22  4021    676853  65      99.91   0.00
> 4028    100.23  4019    676553  65      99.91   0.00
> 4034    100.22  4025    675880  65      99.91   0.00
> 4036    100.22  4027    674824  65      99.91   0.00
> 4033    100.22  4024    674577  65      99.91   0.00
> 4031    100.22  4022    676599  65      99.91   0.00
> 4041    100.22  4032    676675  66      99.91   0.00
> 
> Note that it is very much on purpose that I have set
> TDP to 100 watts on this processor, whereas the
> default is 95 watts. Notice the package temperature,
> after several hours of running power throttled to
> a CPU frequency of 4.03 GHz.
>
Do you mind if I add Reviewed-by and Tested-by tag from you
in next version?

Thanks,
Chenyu
> ... Doug
> 
>
diff mbox series

Patch

diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile
index 2b6551269e43..d08765531bcb 100644
--- a/tools/power/x86/turbostat/Makefile
+++ b/tools/power/x86/turbostat/Makefile
@@ -16,7 +16,7 @@  override CFLAGS +=	-D_FORTIFY_SOURCE=2
 
 %: %.c
 	@mkdir -p $(BUILD_OUTPUT)
-	$(CC) $(CFLAGS) $< -o $(BUILD_OUTPUT)/$@ $(LDFLAGS) -lcap
+	$(CC) $(CFLAGS) $< -o $(BUILD_OUTPUT)/$@ $(LDFLAGS) -lcap -lrt
 
 .PHONY : clean
 clean :
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 95f3047e94ae..ef380db38cba 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -259,6 +259,113 @@  struct msr_counter {
 #define	SYSFS_PERCPU	(1 << 1)
 };
 
+/*
+ * The accumulated sum of MSR is defined as a monotonic
+ * increasing MSR, it will be accumulated periodically,
+ * despite its register's bit width.
+ */
+enum {
+	IDX_PKG_ENERGY,
+	IDX_DRAM_ENERGY,
+	IDX_PP0_ENERGY,
+	IDX_PP1_ENERGY,
+	IDX_PKG_PERF,
+	IDX_DRAM_PERF,
+	IDX_COUNT,
+};
+
+int get_msr_sum(int cpu, off_t offset, unsigned long long *msr);
+
+struct msr_sum_array {
+	/* get_msr_sum() = sum + (get_msr() - last) */
+	struct {
+		/*The accumulated MSR value is updated by the timer*/
+		unsigned long long sum;
+		/*The MSR footprint recorded in last timer*/
+		unsigned long long last;
+	} entries[IDX_COUNT];
+};
+
+/* The percpu MSR sum array.*/
+struct msr_sum_array *per_cpu_msr_sum;
+
+int idx_to_offset(int idx)
+{
+	int offset;
+
+	switch (idx) {
+	case IDX_PKG_ENERGY:
+		offset = MSR_PKG_ENERGY_STATUS;
+		break;
+	case IDX_DRAM_ENERGY:
+		offset = MSR_DRAM_ENERGY_STATUS;
+		break;
+	case IDX_PP0_ENERGY:
+		offset = MSR_PP0_ENERGY_STATUS;
+		break;
+	case IDX_PP1_ENERGY:
+		offset = MSR_PP1_ENERGY_STATUS;
+		break;
+	case IDX_PKG_PERF:
+		offset = MSR_PKG_PERF_STATUS;
+		break;
+	case IDX_DRAM_PERF:
+		offset = MSR_DRAM_PERF_STATUS;
+		break;
+	default:
+		offset = -1;
+	}
+	return offset;
+}
+
+int offset_to_idx(int offset)
+{
+	int idx;
+
+	switch (offset) {
+	case MSR_PKG_ENERGY_STATUS:
+		idx = IDX_PKG_ENERGY;
+		break;
+	case MSR_DRAM_ENERGY_STATUS:
+		idx = IDX_DRAM_ENERGY;
+		break;
+	case MSR_PP0_ENERGY_STATUS:
+		idx = IDX_PP0_ENERGY;
+		break;
+	case MSR_PP1_ENERGY_STATUS:
+		idx = IDX_PP1_ENERGY;
+		break;
+	case MSR_PKG_PERF_STATUS:
+		idx = IDX_PKG_PERF;
+		break;
+	case MSR_DRAM_PERF_STATUS:
+		idx = IDX_DRAM_PERF;
+		break;
+	default:
+		idx = -1;
+	}
+	return idx;
+}
+
+int idx_valid(int idx)
+{
+	switch (idx) {
+	case IDX_PKG_ENERGY:
+		return do_rapl & RAPL_PKG;
+	case IDX_DRAM_ENERGY:
+		return do_rapl & RAPL_DRAM;
+	case IDX_PP0_ENERGY:
+		return do_rapl & RAPL_CORES_ENERGY_STATUS;
+	case IDX_PP1_ENERGY:
+		return do_rapl & RAPL_GFX;
+	case IDX_PKG_PERF:
+		return do_rapl & RAPL_PKG_PERF_STATUS;
+	case IDX_DRAM_PERF:
+		return do_rapl & RAPL_DRAM_PERF_STATUS;
+	default:
+		return 0;
+	}
+}
 struct sys_counters {
 	unsigned int added_thread_counters;
 	unsigned int added_core_counters;
@@ -3053,6 +3160,108 @@  void do_sleep(void)
 	}
 }
 
+int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
+{
+	int ret, idx;
+	unsigned long long msr_cur, msr_last;
+
+	if (!per_cpu_msr_sum)
+		return 1;
+
+	idx = offset_to_idx(offset);
+	if (idx < 0)
+		return idx;
+	/* get_msr_sum() = sum + (get_msr() - last) */
+	ret = get_msr(cpu, offset, &msr_cur);
+	if (ret)
+		return ret;
+	msr_last = per_cpu_msr_sum[cpu].entries[idx].last;
+	DELTA_WRAP32(msr_cur, msr_last);
+	*msr = msr_last + per_cpu_msr_sum[cpu].entries[idx].sum;
+
+	return 0;
+}
+
+timer_t timerid;
+
+/* Timer callback, update the sum of MSRs periodically. */
+static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+{
+	int i, ret;
+	int cpu = t->cpu_id;
+
+	for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
+		unsigned long long msr_cur, msr_last;
+		int offset;
+
+		if (!idx_valid(i))
+			continue;
+		offset = idx_to_offset(i);
+		if (offset < 0)
+			continue;
+		ret = get_msr(cpu, offset, &msr_cur);
+		if (ret) {
+			fprintf(outf, "Can not update msr(0x%x)\n", offset);
+			continue;
+		}
+
+		msr_last = per_cpu_msr_sum[cpu].entries[i].last;
+		per_cpu_msr_sum[cpu].entries[i].last = msr_cur & 0xffffffff;
+
+		DELTA_WRAP32(msr_cur, msr_last);
+		per_cpu_msr_sum[cpu].entries[i].sum += msr_last;
+	}
+	return 0;
+}
+
+static void
+msr_record_handler(union sigval v)
+{
+	for_all_cpus(update_msr_sum, EVEN_COUNTERS);
+}
+
+void msr_sum_record(void)
+{
+	struct itimerspec its;
+	struct sigevent sev;
+
+	per_cpu_msr_sum = calloc(topo.max_cpu_num + 1, sizeof(struct msr_sum_array));
+	if (!per_cpu_msr_sum) {
+		fprintf(outf, "Can not allocate memory for long time MSR.\n");
+		return;
+	}
+	/*
+	 * Signal handler might be restricted, so use thread notifier instead.
+	 */
+	memset(&sev, 0, sizeof(struct sigevent));
+	sev.sigev_notify = SIGEV_THREAD;
+	sev.sigev_notify_function = msr_record_handler;
+
+	sev.sigev_value.sival_ptr = &timerid;
+	if (timer_create(CLOCK_REALTIME, &sev, &timerid) == -1) {
+		fprintf(outf, "Can not create timer.\n");
+		goto release_msr;
+	}
+
+	its.it_value.tv_sec = 0;
+	its.it_value.tv_nsec = 1;
+	/*
+	 * A wraparound time is calculated early.
+	 */
+	its.it_interval.tv_sec = rapl_joule_counter_range;
+	its.it_interval.tv_nsec = 0;
+
+	if (timer_settime(timerid, 0, &its, NULL) == -1) {
+		fprintf(outf, "Can not set timer.\n");
+		goto release_timer;
+	}
+	return;
+
+ release_timer:
+	timer_delete(timerid);
+ release_msr:
+	free(per_cpu_msr_sum);
+}
 
 void turbostat_loop()
 {