diff mbox series

perf: RISC-V: Check standard event availability

Message ID 20240103165438.633054-1-samuel.holland@sifive.com (mailing list archive)
State New, archived
Headers show
Series perf: RISC-V: Check standard event availability | expand

Commit Message

Samuel Holland Jan. 3, 2024, 4:53 p.m. UTC
The RISC-V SBI PMU specification defines several standard hardware and
cache events. Currently, all of these events appear in the `perf list`
output, even if they are not actually implemented. Add logic to check
which events are supported by the hardware (i.e. can be mapped to some
counter), so only usable events are reported to userspace.

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
---
Before this patch:
$ perf list hw

List of pre-defined events (to be used in -e or -M):

  branch-instructions OR branches                    [Hardware event]
  branch-misses                                      [Hardware event]
  bus-cycles                                         [Hardware event]
  cache-misses                                       [Hardware event]
  cache-references                                   [Hardware event]
  cpu-cycles OR cycles                               [Hardware event]
  instructions                                       [Hardware event]
  ref-cycles                                         [Hardware event]
  stalled-cycles-backend OR idle-cycles-backend      [Hardware event]
  stalled-cycles-frontend OR idle-cycles-frontend    [Hardware event]

$ perf stat -ddd true

 Performance counter stats for 'true':

              4.36 msec task-clock                       #    0.744 CPUs utilized
                 1      context-switches                 #  229.325 /sec
                 0      cpu-migrations                   #    0.000 /sec
                38      page-faults                      #    8.714 K/sec
         4,375,694      cycles                           #    1.003 GHz                         (60.64%)
           728,945      instructions                     #    0.17  insn per cycle
            79,199      branches                         #   18.162 M/sec
            17,709      branch-misses                    #   22.36% of all branches
           181,734      L1-dcache-loads                  #   41.676 M/sec
             5,547      L1-dcache-load-misses            #    3.05% of all L1-dcache accesses
     <not counted>      LLC-loads                                                               (0.00%)
     <not counted>      LLC-load-misses                                                         (0.00%)
     <not counted>      L1-icache-loads                                                         (0.00%)
     <not counted>      L1-icache-load-misses                                                   (0.00%)
     <not counted>      dTLB-loads                                                              (0.00%)
     <not counted>      dTLB-load-misses                                                        (0.00%)
     <not counted>      iTLB-loads                                                              (0.00%)
     <not counted>      iTLB-load-misses                                                        (0.00%)
     <not counted>      L1-dcache-prefetches                                                    (0.00%)
     <not counted>      L1-dcache-prefetch-misses                                               (0.00%)

       0.005860375 seconds time elapsed

       0.000000000 seconds user
       0.010383000 seconds sys

After this patch:
$ perf list hw

List of pre-defined events (to be used in -e or -M):

  branch-instructions OR branches                    [Hardware event]
  branch-misses                                      [Hardware event]
  cache-misses                                       [Hardware event]
  cache-references                                   [Hardware event]
  cpu-cycles OR cycles                               [Hardware event]
  instructions                                       [Hardware event]

$ perf stat -ddd true

 Performance counter stats for 'true':

              5.16 msec task-clock                       #    0.848 CPUs utilized
                 1      context-switches                 #  193.817 /sec
                 0      cpu-migrations                   #    0.000 /sec
                37      page-faults                      #    7.171 K/sec
         5,183,625      cycles                           #    1.005 GHz
           961,696      instructions                     #    0.19  insn per cycle
            85,853      branches                         #   16.640 M/sec
            20,462      branch-misses                    #   23.83% of all branches
           243,545      L1-dcache-loads                  #   47.203 M/sec
             5,974      L1-dcache-load-misses            #    2.45% of all L1-dcache accesses
   <not supported>      LLC-loads
   <not supported>      LLC-load-misses
   <not supported>      L1-icache-loads
   <not supported>      L1-icache-load-misses
   <not supported>      dTLB-loads
            19,619      dTLB-load-misses
   <not supported>      iTLB-loads
             6,831      iTLB-load-misses
   <not supported>      L1-dcache-prefetches
   <not supported>      L1-dcache-prefetch-misses

       0.006085625 seconds time elapsed

       0.000000000 seconds user
       0.013022000 seconds sys


 drivers/perf/riscv_pmu_sbi.c | 37 ++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

Comments

Atish Patra March 18, 2024, 7:44 p.m. UTC | #1
On Wed, Jan 3, 2024 at 8:54 AM Samuel Holland <samuel.holland@sifive.com> wrote:
>
> The RISC-V SBI PMU specification defines several standard hardware and
> cache events. Currently, all of these events appear in the `perf list`
> output, even if they are not actually implemented. Add logic to check
> which events are supported by the hardware (i.e. can be mapped to some
> counter), so only usable events are reported to userspace.
>

Thanks for the patch.
This adds tons of SBI calls at every boot for a use case which is at
best confusing for a subset of users who actually wants to run perf.
This probing can be done at runtime by invoking the
pmu_sbi_check_event from pmu_sbi_event_map.
We can update the event map after that so that it doesn't need to
invoke pmu_sbi_check_event next time.

> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
> ---
> Before this patch:
> $ perf list hw
>
> List of pre-defined events (to be used in -e or -M):
>
>   branch-instructions OR branches                    [Hardware event]
>   branch-misses                                      [Hardware event]
>   bus-cycles                                         [Hardware event]
>   cache-misses                                       [Hardware event]
>   cache-references                                   [Hardware event]
>   cpu-cycles OR cycles                               [Hardware event]
>   instructions                                       [Hardware event]
>   ref-cycles                                         [Hardware event]
>   stalled-cycles-backend OR idle-cycles-backend      [Hardware event]
>   stalled-cycles-frontend OR idle-cycles-frontend    [Hardware event]
>
> $ perf stat -ddd true
>
>  Performance counter stats for 'true':
>
>               4.36 msec task-clock                       #    0.744 CPUs utilized
>                  1      context-switches                 #  229.325 /sec
>                  0      cpu-migrations                   #    0.000 /sec
>                 38      page-faults                      #    8.714 K/sec
>          4,375,694      cycles                           #    1.003 GHz                         (60.64%)
>            728,945      instructions                     #    0.17  insn per cycle
>             79,199      branches                         #   18.162 M/sec
>             17,709      branch-misses                    #   22.36% of all branches
>            181,734      L1-dcache-loads                  #   41.676 M/sec
>              5,547      L1-dcache-load-misses            #    3.05% of all L1-dcache accesses
>      <not counted>      LLC-loads                                                               (0.00%)
>      <not counted>      LLC-load-misses                                                         (0.00%)
>      <not counted>      L1-icache-loads                                                         (0.00%)
>      <not counted>      L1-icache-load-misses                                                   (0.00%)
>      <not counted>      dTLB-loads                                                              (0.00%)
>      <not counted>      dTLB-load-misses                                                        (0.00%)
>      <not counted>      iTLB-loads                                                              (0.00%)
>      <not counted>      iTLB-load-misses                                                        (0.00%)
>      <not counted>      L1-dcache-prefetches                                                    (0.00%)
>      <not counted>      L1-dcache-prefetch-misses                                               (0.00%)
>
>        0.005860375 seconds time elapsed
>
>        0.000000000 seconds user
>        0.010383000 seconds sys
>
> After this patch:
> $ perf list hw
>
> List of pre-defined events (to be used in -e or -M):
>
>   branch-instructions OR branches                    [Hardware event]
>   branch-misses                                      [Hardware event]
>   cache-misses                                       [Hardware event]
>   cache-references                                   [Hardware event]
>   cpu-cycles OR cycles                               [Hardware event]
>   instructions                                       [Hardware event]
>
> $ perf stat -ddd true
>
>  Performance counter stats for 'true':
>
>               5.16 msec task-clock                       #    0.848 CPUs utilized
>                  1      context-switches                 #  193.817 /sec
>                  0      cpu-migrations                   #    0.000 /sec
>                 37      page-faults                      #    7.171 K/sec
>          5,183,625      cycles                           #    1.005 GHz
>            961,696      instructions                     #    0.19  insn per cycle
>             85,853      branches                         #   16.640 M/sec
>             20,462      branch-misses                    #   23.83% of all branches
>            243,545      L1-dcache-loads                  #   47.203 M/sec
>              5,974      L1-dcache-load-misses            #    2.45% of all L1-dcache accesses
>    <not supported>      LLC-loads
>    <not supported>      LLC-load-misses
>    <not supported>      L1-icache-loads
>    <not supported>      L1-icache-load-misses
>    <not supported>      dTLB-loads
>             19,619      dTLB-load-misses
>    <not supported>      iTLB-loads
>              6,831      iTLB-load-misses
>    <not supported>      L1-dcache-prefetches
>    <not supported>      L1-dcache-prefetch-misses
>
>        0.006085625 seconds time elapsed
>
>        0.000000000 seconds user
>        0.013022000 seconds sys
>
>
>  drivers/perf/riscv_pmu_sbi.c | 37 ++++++++++++++++++++++++++++++++++--
>  1 file changed, 35 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
> index 16acd4dcdb96..b58a70ee8317 100644
> --- a/drivers/perf/riscv_pmu_sbi.c
> +++ b/drivers/perf/riscv_pmu_sbi.c
> @@ -86,7 +86,7 @@ struct sbi_pmu_event_data {
>         };
>  };
>
> -static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
> +static struct sbi_pmu_event_data pmu_hw_event_map[] = {
>         [PERF_COUNT_HW_CPU_CYCLES]              = {.hw_gen_event = {
>                                                         SBI_PMU_HW_CPU_CYCLES,
>                                                         SBI_PMU_EVENT_TYPE_HW, 0}},
> @@ -120,7 +120,7 @@ static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
>  };
>
>  #define C(x) PERF_COUNT_HW_CACHE_##x
> -static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
> +static struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
>  [PERF_COUNT_HW_CACHE_OP_MAX]
>  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
>         [C(L1D)] = {
> @@ -265,6 +265,36 @@ static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_M
>         },
>  };
>
> +static void pmu_sbi_check_event(struct sbi_pmu_event_data *edata)
> +{
> +       struct sbiret ret;
> +
> +       ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH,
> +                       0, cmask, 0, edata->event_idx, 0, 0);
> +       if (!ret.error) {
> +               sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
> +                         ret.value, 0x1, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
> +       } else if (ret.error == SBI_ERR_NOT_SUPPORTED) {
> +               /* This event cannot be monitored by any counter */
> +               edata->event_idx = -EINVAL;
> +       }
> +}
> +
> +static void pmu_sbi_update_events(void)
> +{
> +       /* Ensure events are not already mapped to a counter */
> +       sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
> +                 0, cmask, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
> +
> +       for (int i = 0; i < ARRAY_SIZE(pmu_hw_event_map); i++)
> +               pmu_sbi_check_event(&pmu_hw_event_map[i]);
> +
> +       for (int i = 0; i < ARRAY_SIZE(pmu_cache_event_map); i++)
> +               for (int j = 0; j < ARRAY_SIZE(pmu_cache_event_map[i]); j++)
> +                       for (int k = 0; k < ARRAY_SIZE(pmu_cache_event_map[i][j]); k++)
> +                               pmu_sbi_check_event(&pmu_cache_event_map[i][j][k]);
> +}
> +
>  static int pmu_sbi_ctr_get_width(int idx)
>  {
>         return pmu_ctr_list[idx].width;
> @@ -1046,6 +1076,9 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
>         if (pmu_sbi_get_ctrinfo(num_counters, &cmask))
>                 goto out_free;
>
> +       /* Check which standard events are available */
> +       pmu_sbi_update_events();
> +
>         ret = pmu_sbi_setup_irqs(pmu, pdev);
>         if (ret < 0) {
>                 pr_info("Perf sampling/filtering is not supported as sscof extension is not available\n");
> --
> 2.42.0
>


--
Regards,
Atish
Samuel Holland April 11, 2024, 1:40 a.m. UTC | #2
Hi Atish,

On 2024-03-18 2:44 PM, Atish Patra wrote:
> On Wed, Jan 3, 2024 at 8:54 AM Samuel Holland <samuel.holland@sifive.com> wrote:
>>
>> The RISC-V SBI PMU specification defines several standard hardware and
>> cache events. Currently, all of these events appear in the `perf list`
>> output, even if they are not actually implemented. Add logic to check
>> which events are supported by the hardware (i.e. can be mapped to some
>> counter), so only usable events are reported to userspace.
>>
> 
> Thanks for the patch.
> This adds tons of SBI calls at every boot for a use case which is at
> best confusing for a subset of users who actually wants to run perf.

I should have been clearer in the patch description. This is not just a cosmetic
change; because perf sees these as valid events, it tries to use them in
commands like `perf stat`. When the error from SBI_EXT_PMU_COUNTER_CFG_MATCH
causes the ->add() callback to fail, this prevents any other events from being
scheduled on that same CPU (search can_add_hw in kernel/events/core.c). That is
why the dTLB/iTLB miss counts are missing in the "before" example below.

> This probing can be done at runtime by invoking the
> pmu_sbi_check_event from pmu_sbi_event_map.
> We can update the event map after that so that it doesn't need to
> invoke pmu_sbi_check_event next time.

I tried to implement this suggestion, but it did not work. The SBI interface
does not distinguish between "none of the counters can monitor the specified
event [because the event is unsupported]" and "none of the counters can monitor
the specified event [because the counters are busy]". It is not sufficient for
the kernel to verify that at least one counter is available before performing
the check, because certain events may only be usable on a subset of counters
(per riscv,event-to-mhpmcounters), and the kernel does not know that mapping.

As a result, checking for event support is only reliable when none of the
counters are in use. So the check can be asynchronous/deferred to later in the
boot process, but I believe it still needs to be performed for all events before
userspace starts using the counters.

Regards,
Samuel

>> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
>> ---
>> Before this patch:
>> $ perf list hw
>>
>> List of pre-defined events (to be used in -e or -M):
>>
>>   branch-instructions OR branches                    [Hardware event]
>>   branch-misses                                      [Hardware event]
>>   bus-cycles                                         [Hardware event]
>>   cache-misses                                       [Hardware event]
>>   cache-references                                   [Hardware event]
>>   cpu-cycles OR cycles                               [Hardware event]
>>   instructions                                       [Hardware event]
>>   ref-cycles                                         [Hardware event]
>>   stalled-cycles-backend OR idle-cycles-backend      [Hardware event]
>>   stalled-cycles-frontend OR idle-cycles-frontend    [Hardware event]
>>
>> $ perf stat -ddd true
>>
>>  Performance counter stats for 'true':
>>
>>               4.36 msec task-clock                       #    0.744 CPUs utilized
>>                  1      context-switches                 #  229.325 /sec
>>                  0      cpu-migrations                   #    0.000 /sec
>>                 38      page-faults                      #    8.714 K/sec
>>          4,375,694      cycles                           #    1.003 GHz                         (60.64%)
>>            728,945      instructions                     #    0.17  insn per cycle
>>             79,199      branches                         #   18.162 M/sec
>>             17,709      branch-misses                    #   22.36% of all branches
>>            181,734      L1-dcache-loads                  #   41.676 M/sec
>>              5,547      L1-dcache-load-misses            #    3.05% of all L1-dcache accesses
>>      <not counted>      LLC-loads                                                               (0.00%)
>>      <not counted>      LLC-load-misses                                                         (0.00%)
>>      <not counted>      L1-icache-loads                                                         (0.00%)
>>      <not counted>      L1-icache-load-misses                                                   (0.00%)
>>      <not counted>      dTLB-loads                                                              (0.00%)
>>      <not counted>      dTLB-load-misses                                                        (0.00%)
>>      <not counted>      iTLB-loads                                                              (0.00%)
>>      <not counted>      iTLB-load-misses                                                        (0.00%)
>>      <not counted>      L1-dcache-prefetches                                                    (0.00%)
>>      <not counted>      L1-dcache-prefetch-misses                                               (0.00%)
>>
>>        0.005860375 seconds time elapsed
>>
>>        0.000000000 seconds user
>>        0.010383000 seconds sys
>>
>> After this patch:
>> $ perf list hw
>>
>> List of pre-defined events (to be used in -e or -M):
>>
>>   branch-instructions OR branches                    [Hardware event]
>>   branch-misses                                      [Hardware event]
>>   cache-misses                                       [Hardware event]
>>   cache-references                                   [Hardware event]
>>   cpu-cycles OR cycles                               [Hardware event]
>>   instructions                                       [Hardware event]
>>
>> $ perf stat -ddd true
>>
>>  Performance counter stats for 'true':
>>
>>               5.16 msec task-clock                       #    0.848 CPUs utilized
>>                  1      context-switches                 #  193.817 /sec
>>                  0      cpu-migrations                   #    0.000 /sec
>>                 37      page-faults                      #    7.171 K/sec
>>          5,183,625      cycles                           #    1.005 GHz
>>            961,696      instructions                     #    0.19  insn per cycle
>>             85,853      branches                         #   16.640 M/sec
>>             20,462      branch-misses                    #   23.83% of all branches
>>            243,545      L1-dcache-loads                  #   47.203 M/sec
>>              5,974      L1-dcache-load-misses            #    2.45% of all L1-dcache accesses
>>    <not supported>      LLC-loads
>>    <not supported>      LLC-load-misses
>>    <not supported>      L1-icache-loads
>>    <not supported>      L1-icache-load-misses
>>    <not supported>      dTLB-loads
>>             19,619      dTLB-load-misses
>>    <not supported>      iTLB-loads
>>              6,831      iTLB-load-misses
>>    <not supported>      L1-dcache-prefetches
>>    <not supported>      L1-dcache-prefetch-misses
>>
>>        0.006085625 seconds time elapsed
>>
>>        0.000000000 seconds user
>>        0.013022000 seconds sys
>>
>>
>>  drivers/perf/riscv_pmu_sbi.c | 37 ++++++++++++++++++++++++++++++++++--
>>  1 file changed, 35 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
>> index 16acd4dcdb96..b58a70ee8317 100644
>> --- a/drivers/perf/riscv_pmu_sbi.c
>> +++ b/drivers/perf/riscv_pmu_sbi.c
>> @@ -86,7 +86,7 @@ struct sbi_pmu_event_data {
>>         };
>>  };
>>
>> -static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
>> +static struct sbi_pmu_event_data pmu_hw_event_map[] = {
>>         [PERF_COUNT_HW_CPU_CYCLES]              = {.hw_gen_event = {
>>                                                         SBI_PMU_HW_CPU_CYCLES,
>>                                                         SBI_PMU_EVENT_TYPE_HW, 0}},
>> @@ -120,7 +120,7 @@ static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
>>  };
>>
>>  #define C(x) PERF_COUNT_HW_CACHE_##x
>> -static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
>> +static struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
>>  [PERF_COUNT_HW_CACHE_OP_MAX]
>>  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
>>         [C(L1D)] = {
>> @@ -265,6 +265,36 @@ static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_M
>>         },
>>  };
>>
>> +static void pmu_sbi_check_event(struct sbi_pmu_event_data *edata)
>> +{
>> +       struct sbiret ret;
>> +
>> +       ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH,
>> +                       0, cmask, 0, edata->event_idx, 0, 0);
>> +       if (!ret.error) {
>> +               sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
>> +                         ret.value, 0x1, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
>> +       } else if (ret.error == SBI_ERR_NOT_SUPPORTED) {
>> +               /* This event cannot be monitored by any counter */
>> +               edata->event_idx = -EINVAL;
>> +       }
>> +}
>> +
>> +static void pmu_sbi_update_events(void)
>> +{
>> +       /* Ensure events are not already mapped to a counter */
>> +       sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
>> +                 0, cmask, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
>> +
>> +       for (int i = 0; i < ARRAY_SIZE(pmu_hw_event_map); i++)
>> +               pmu_sbi_check_event(&pmu_hw_event_map[i]);
>> +
>> +       for (int i = 0; i < ARRAY_SIZE(pmu_cache_event_map); i++)
>> +               for (int j = 0; j < ARRAY_SIZE(pmu_cache_event_map[i]); j++)
>> +                       for (int k = 0; k < ARRAY_SIZE(pmu_cache_event_map[i][j]); k++)
>> +                               pmu_sbi_check_event(&pmu_cache_event_map[i][j][k]);
>> +}
>> +
>>  static int pmu_sbi_ctr_get_width(int idx)
>>  {
>>         return pmu_ctr_list[idx].width;
>> @@ -1046,6 +1076,9 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
>>         if (pmu_sbi_get_ctrinfo(num_counters, &cmask))
>>                 goto out_free;
>>
>> +       /* Check which standard events are available */
>> +       pmu_sbi_update_events();
>> +
>>         ret = pmu_sbi_setup_irqs(pmu, pdev);
>>         if (ret < 0) {
>>                 pr_info("Perf sampling/filtering is not supported as sscof extension is not available\n");
>> --
>> 2.42.0
Atish Kumar Patra April 16, 2024, 12:01 a.m. UTC | #3
On 4/10/24 18:40, Samuel Holland wrote:
> Hi Atish,
> 
> On 2024-03-18 2:44 PM, Atish Patra wrote:
>> On Wed, Jan 3, 2024 at 8:54 AM Samuel Holland <samuel.holland@sifive.com> wrote:
>>>
>>> The RISC-V SBI PMU specification defines several standard hardware and
>>> cache events. Currently, all of these events appear in the `perf list`
>>> output, even if they are not actually implemented. Add logic to check
>>> which events are supported by the hardware (i.e. can be mapped to some
>>> counter), so only usable events are reported to userspace.
>>>
>>
>> Thanks for the patch.
>> This adds tons of SBI calls at every boot for a use case which is at
>> best confusing for a subset of users who actually wants to run perf.
> 
> I should have been clearer in the patch description. This is not just a cosmetic
> change; because perf sees these as valid events, it tries to use them in
> commands like `perf stat`. When the error from SBI_EXT_PMU_COUNTER_CFG_MATCH
> causes the ->add() callback to fail, this prevents any other events from being
> scheduled on that same CPU (search can_add_hw in kernel/events/core.c). That is
> why the dTLB/iTLB miss counts are missing in the "before" example below.
> 

Thanks for explaining the problem. I can reproduce it in qemu as well if 
enough number of invalid events given on the command line and the 
workload is short enough.

>> This probing can be done at runtime by invoking the
>> pmu_sbi_check_event from pmu_sbi_event_map.
>> We can update the event map after that so that it doesn't need to
>> invoke pmu_sbi_check_event next time.
> 
> I tried to implement this suggestion, but it did not work. The SBI interface
> does not distinguish between "none of the counters can monitor the specified
> event [because the event is unsupported]" and "none of the counters can monitor
> the specified event [because the counters are busy]". It is not sufficient for
> the kernel to verify that at least one counter is available before performing
> the check, because certain events may only be usable on a subset of counters
> (per riscv,event-to-mhpmcounters), and the kernel does not know that mapping.
> 

Yeah. My suggestion was to fix the perf list issue which is different 
than the issue reported now.

> As a result, checking for event support is only reliable when none of the
> counters are in use. So the check can be asynchronous/deferred to later in the
> boot process, but I believe it still needs to be performed for all events before
> userspace starts using the counters.
> 

We should defer it a work queue for sure. We can also look at improving 
SBI PMU extension to support bulk matching behavior as well.

However, I think a better solution would be to just rely on the json 
file mappings instead of making SBI calls. We are going to have the 
event encoding and mappings in the json in the future.

I had added it only for platforms with counter delegation[1] but I think 
this can be generalized for platforms with SBI PMU as well.

I had some hacks to specify the legacy event encodings but Ian rogers 
improved with a generic support by preferring sysfs/json event encodings 
over fixed ones. I am yet to rebase and try Ian's series on top of the 
counter delegation though. Thoughts ?

[1] 
https://lore.kernel.org/lkml/20240217005738.3744121-1-atishp@rivosinc.com/
[2] 
https://lore.kernel.org/bpf/20240415063626.453987-2-irogers@google.com/T/


> Regards,
> Samuel
> 
>>> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
>>> ---
>>> Before this patch:
>>> $ perf list hw
>>>
>>> List of pre-defined events (to be used in -e or -M):
>>>
>>>    branch-instructions OR branches                    [Hardware event]
>>>    branch-misses                                      [Hardware event]
>>>    bus-cycles                                         [Hardware event]
>>>    cache-misses                                       [Hardware event]
>>>    cache-references                                   [Hardware event]
>>>    cpu-cycles OR cycles                               [Hardware event]
>>>    instructions                                       [Hardware event]
>>>    ref-cycles                                         [Hardware event]
>>>    stalled-cycles-backend OR idle-cycles-backend      [Hardware event]
>>>    stalled-cycles-frontend OR idle-cycles-frontend    [Hardware event]
>>>
>>> $ perf stat -ddd true
>>>
>>>   Performance counter stats for 'true':
>>>
>>>                4.36 msec task-clock                       #    0.744 CPUs utilized
>>>                   1      context-switches                 #  229.325 /sec
>>>                   0      cpu-migrations                   #    0.000 /sec
>>>                  38      page-faults                      #    8.714 K/sec
>>>           4,375,694      cycles                           #    1.003 GHz                         (60.64%)
>>>             728,945      instructions                     #    0.17  insn per cycle
>>>              79,199      branches                         #   18.162 M/sec
>>>              17,709      branch-misses                    #   22.36% of all branches
>>>             181,734      L1-dcache-loads                  #   41.676 M/sec
>>>               5,547      L1-dcache-load-misses            #    3.05% of all L1-dcache accesses
>>>       <not counted>      LLC-loads                                                               (0.00%)
>>>       <not counted>      LLC-load-misses                                                         (0.00%)
>>>       <not counted>      L1-icache-loads                                                         (0.00%)
>>>       <not counted>      L1-icache-load-misses                                                   (0.00%)
>>>       <not counted>      dTLB-loads                                                              (0.00%)
>>>       <not counted>      dTLB-load-misses                                                        (0.00%)
>>>       <not counted>      iTLB-loads                                                              (0.00%)
>>>       <not counted>      iTLB-load-misses                                                        (0.00%)
>>>       <not counted>      L1-dcache-prefetches                                                    (0.00%)
>>>       <not counted>      L1-dcache-prefetch-misses                                               (0.00%)
>>>
>>>         0.005860375 seconds time elapsed
>>>
>>>         0.000000000 seconds user
>>>         0.010383000 seconds sys
>>>
>>> After this patch:
>>> $ perf list hw
>>>
>>> List of pre-defined events (to be used in -e or -M):
>>>
>>>    branch-instructions OR branches                    [Hardware event]
>>>    branch-misses                                      [Hardware event]
>>>    cache-misses                                       [Hardware event]
>>>    cache-references                                   [Hardware event]
>>>    cpu-cycles OR cycles                               [Hardware event]
>>>    instructions                                       [Hardware event]
>>>
>>> $ perf stat -ddd true
>>>
>>>   Performance counter stats for 'true':
>>>
>>>                5.16 msec task-clock                       #    0.848 CPUs utilized
>>>                   1      context-switches                 #  193.817 /sec
>>>                   0      cpu-migrations                   #    0.000 /sec
>>>                  37      page-faults                      #    7.171 K/sec
>>>           5,183,625      cycles                           #    1.005 GHz
>>>             961,696      instructions                     #    0.19  insn per cycle
>>>              85,853      branches                         #   16.640 M/sec
>>>              20,462      branch-misses                    #   23.83% of all branches
>>>             243,545      L1-dcache-loads                  #   47.203 M/sec
>>>               5,974      L1-dcache-load-misses            #    2.45% of all L1-dcache accesses
>>>     <not supported>      LLC-loads
>>>     <not supported>      LLC-load-misses
>>>     <not supported>      L1-icache-loads
>>>     <not supported>      L1-icache-load-misses
>>>     <not supported>      dTLB-loads
>>>              19,619      dTLB-load-misses
>>>     <not supported>      iTLB-loads
>>>               6,831      iTLB-load-misses
>>>     <not supported>      L1-dcache-prefetches
>>>     <not supported>      L1-dcache-prefetch-misses
>>>
>>>         0.006085625 seconds time elapsed
>>>
>>>         0.000000000 seconds user
>>>         0.013022000 seconds sys
>>>
>>>
>>>   drivers/perf/riscv_pmu_sbi.c | 37 ++++++++++++++++++++++++++++++++++--
>>>   1 file changed, 35 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
>>> index 16acd4dcdb96..b58a70ee8317 100644
>>> --- a/drivers/perf/riscv_pmu_sbi.c
>>> +++ b/drivers/perf/riscv_pmu_sbi.c
>>> @@ -86,7 +86,7 @@ struct sbi_pmu_event_data {
>>>          };
>>>   };
>>>
>>> -static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
>>> +static struct sbi_pmu_event_data pmu_hw_event_map[] = {
>>>          [PERF_COUNT_HW_CPU_CYCLES]              = {.hw_gen_event = {
>>>                                                          SBI_PMU_HW_CPU_CYCLES,
>>>                                                          SBI_PMU_EVENT_TYPE_HW, 0}},
>>> @@ -120,7 +120,7 @@ static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
>>>   };
>>>
>>>   #define C(x) PERF_COUNT_HW_CACHE_##x
>>> -static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
>>> +static struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
>>>   [PERF_COUNT_HW_CACHE_OP_MAX]
>>>   [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
>>>          [C(L1D)] = {
>>> @@ -265,6 +265,36 @@ static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_M
>>>          },
>>>   };
>>>
>>> +static void pmu_sbi_check_event(struct sbi_pmu_event_data *edata)
>>> +{
>>> +       struct sbiret ret;
>>> +
>>> +       ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH,
>>> +                       0, cmask, 0, edata->event_idx, 0, 0);
>>> +       if (!ret.error) {
>>> +               sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
>>> +                         ret.value, 0x1, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
>>> +       } else if (ret.error == SBI_ERR_NOT_SUPPORTED) {
>>> +               /* This event cannot be monitored by any counter */
>>> +               edata->event_idx = -EINVAL;
>>> +       }
>>> +}
>>> +
>>> +static void pmu_sbi_update_events(void)
>>> +{
>>> +       /* Ensure events are not already mapped to a counter */
>>> +       sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
>>> +                 0, cmask, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
>>> +
>>> +       for (int i = 0; i < ARRAY_SIZE(pmu_hw_event_map); i++)
>>> +               pmu_sbi_check_event(&pmu_hw_event_map[i]);
>>> +
>>> +       for (int i = 0; i < ARRAY_SIZE(pmu_cache_event_map); i++)
>>> +               for (int j = 0; j < ARRAY_SIZE(pmu_cache_event_map[i]); j++)
>>> +                       for (int k = 0; k < ARRAY_SIZE(pmu_cache_event_map[i][j]); k++)
>>> +                               pmu_sbi_check_event(&pmu_cache_event_map[i][j][k]);
>>> +}
>>> +
>>>   static int pmu_sbi_ctr_get_width(int idx)
>>>   {
>>>          return pmu_ctr_list[idx].width;
>>> @@ -1046,6 +1076,9 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
>>>          if (pmu_sbi_get_ctrinfo(num_counters, &cmask))
>>>                  goto out_free;
>>>
>>> +       /* Check which standard events are available */
>>> +       pmu_sbi_update_events();
>>> +
>>>          ret = pmu_sbi_setup_irqs(pmu, pdev);
>>>          if (ret < 0) {
>>>                  pr_info("Perf sampling/filtering is not supported as sscof extension is not available\n");
>>> --
>>> 2.42.0
> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Anup Patel April 16, 2024, 4:07 a.m. UTC | #4
On Tue, Apr 16, 2024 at 5:31 AM Atish Patra <atishp@rivosinc.com> wrote:
>
> On 4/10/24 18:40, Samuel Holland wrote:
> > Hi Atish,
> >
> > On 2024-03-18 2:44 PM, Atish Patra wrote:
> >> On Wed, Jan 3, 2024 at 8:54 AM Samuel Holland <samuel.holland@sifive.com> wrote:
> >>>
> >>> The RISC-V SBI PMU specification defines several standard hardware and
> >>> cache events. Currently, all of these events appear in the `perf list`
> >>> output, even if they are not actually implemented. Add logic to check
> >>> which events are supported by the hardware (i.e. can be mapped to some
> >>> counter), so only usable events are reported to userspace.
> >>>
> >>
> >> Thanks for the patch.
> >> This adds tons of SBI calls at every boot for a use case which is at
> >> best confusing for a subset of users who actually wants to run perf.
> >
> > I should have been clearer in the patch description. This is not just a cosmetic
> > change; because perf sees these as valid events, it tries to use them in
> > commands like `perf stat`. When the error from SBI_EXT_PMU_COUNTER_CFG_MATCH
> > causes the ->add() callback to fail, this prevents any other events from being
> > scheduled on that same CPU (search can_add_hw in kernel/events/core.c). That is
> > why the dTLB/iTLB miss counts are missing in the "before" example below.
> >
>
> Thanks for explaining the problem. I can reproduce it in qemu as well if
> enough number of invalid events given on the command line and the
> workload is short enough.
>
> >> This probing can be done at runtime by invoking the
> >> pmu_sbi_check_event from pmu_sbi_event_map.
> >> We can update the event map after that so that it doesn't need to
> >> invoke pmu_sbi_check_event next time.
> >
> > I tried to implement this suggestion, but it did not work. The SBI interface
> > does not distinguish between "none of the counters can monitor the specified
> > event [because the event is unsupported]" and "none of the counters can monitor
> > the specified event [because the counters are busy]". It is not sufficient for
> > the kernel to verify that at least one counter is available before performing
> > the check, because certain events may only be usable on a subset of counters
> > (per riscv,event-to-mhpmcounters), and the kernel does not know that mapping.
> >
>
> Yeah. My suggestion was to fix the perf list issue which is different
> than the issue reported now.
>
> > As a result, checking for event support is only reliable when none of the
> > counters are in use. So the check can be asynchronous/deferred to later in the
> > boot process, but I believe it still needs to be performed for all events before
> > userspace starts using the counters.
> >
>
> We should defer it a work queue for sure. We can also look at improving
> SBI PMU extension to support bulk matching behavior as well.
>
> However, I think a better solution would be to just rely on the json
> file mappings instead of making SBI calls. We are going to have the
> event encoding and mappings in the json in the future.

The problem with JSON based event encoding is how to deal in-case
we are running inside Guest/VM because Host could be anything.

IMO, the JSON based approach is not suitable for SBI PMU. For now,
we either defer the work using the work queue or keep the approach
of this patch as-is.

The good part about SBI PMU extension is that we do have a standard
set of events and we only need a way to discover supported standard
events with a minimum number of SBI calls. It is better to add a new
SBI PMU call to assist supported event discovery which will also
help us virtualize it.

Regards,
Anup

>
> I had added it only for platforms with counter delegation[1] but I think
> this can be generalized for platforms with SBI PMU as well.
>
> I had some hacks to specify the legacy event encodings but Ian rogers
> improved with a generic support by preferring sysfs/json event encodings
> over fixed ones. I am yet to rebase and try Ian's series on top of the
> counter delegation though. Thoughts ?
>
> [1]
> https://lore.kernel.org/lkml/20240217005738.3744121-1-atishp@rivosinc.com/
> [2]
> https://lore.kernel.org/bpf/20240415063626.453987-2-irogers@google.com/T/
>
>
> > Regards,
> > Samuel
> >
> >>> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
> >>> ---
> >>> Before this patch:
> >>> $ perf list hw
> >>>
> >>> List of pre-defined events (to be used in -e or -M):
> >>>
> >>>    branch-instructions OR branches                    [Hardware event]
> >>>    branch-misses                                      [Hardware event]
> >>>    bus-cycles                                         [Hardware event]
> >>>    cache-misses                                       [Hardware event]
> >>>    cache-references                                   [Hardware event]
> >>>    cpu-cycles OR cycles                               [Hardware event]
> >>>    instructions                                       [Hardware event]
> >>>    ref-cycles                                         [Hardware event]
> >>>    stalled-cycles-backend OR idle-cycles-backend      [Hardware event]
> >>>    stalled-cycles-frontend OR idle-cycles-frontend    [Hardware event]
> >>>
> >>> $ perf stat -ddd true
> >>>
> >>>   Performance counter stats for 'true':
> >>>
> >>>                4.36 msec task-clock                       #    0.744 CPUs utilized
> >>>                   1      context-switches                 #  229.325 /sec
> >>>                   0      cpu-migrations                   #    0.000 /sec
> >>>                  38      page-faults                      #    8.714 K/sec
> >>>           4,375,694      cycles                           #    1.003 GHz                         (60.64%)
> >>>             728,945      instructions                     #    0.17  insn per cycle
> >>>              79,199      branches                         #   18.162 M/sec
> >>>              17,709      branch-misses                    #   22.36% of all branches
> >>>             181,734      L1-dcache-loads                  #   41.676 M/sec
> >>>               5,547      L1-dcache-load-misses            #    3.05% of all L1-dcache accesses
> >>>       <not counted>      LLC-loads                                                               (0.00%)
> >>>       <not counted>      LLC-load-misses                                                         (0.00%)
> >>>       <not counted>      L1-icache-loads                                                         (0.00%)
> >>>       <not counted>      L1-icache-load-misses                                                   (0.00%)
> >>>       <not counted>      dTLB-loads                                                              (0.00%)
> >>>       <not counted>      dTLB-load-misses                                                        (0.00%)
> >>>       <not counted>      iTLB-loads                                                              (0.00%)
> >>>       <not counted>      iTLB-load-misses                                                        (0.00%)
> >>>       <not counted>      L1-dcache-prefetches                                                    (0.00%)
> >>>       <not counted>      L1-dcache-prefetch-misses                                               (0.00%)
> >>>
> >>>         0.005860375 seconds time elapsed
> >>>
> >>>         0.000000000 seconds user
> >>>         0.010383000 seconds sys
> >>>
> >>> After this patch:
> >>> $ perf list hw
> >>>
> >>> List of pre-defined events (to be used in -e or -M):
> >>>
> >>>    branch-instructions OR branches                    [Hardware event]
> >>>    branch-misses                                      [Hardware event]
> >>>    cache-misses                                       [Hardware event]
> >>>    cache-references                                   [Hardware event]
> >>>    cpu-cycles OR cycles                               [Hardware event]
> >>>    instructions                                       [Hardware event]
> >>>
> >>> $ perf stat -ddd true
> >>>
> >>>   Performance counter stats for 'true':
> >>>
> >>>                5.16 msec task-clock                       #    0.848 CPUs utilized
> >>>                   1      context-switches                 #  193.817 /sec
> >>>                   0      cpu-migrations                   #    0.000 /sec
> >>>                  37      page-faults                      #    7.171 K/sec
> >>>           5,183,625      cycles                           #    1.005 GHz
> >>>             961,696      instructions                     #    0.19  insn per cycle
> >>>              85,853      branches                         #   16.640 M/sec
> >>>              20,462      branch-misses                    #   23.83% of all branches
> >>>             243,545      L1-dcache-loads                  #   47.203 M/sec
> >>>               5,974      L1-dcache-load-misses            #    2.45% of all L1-dcache accesses
> >>>     <not supported>      LLC-loads
> >>>     <not supported>      LLC-load-misses
> >>>     <not supported>      L1-icache-loads
> >>>     <not supported>      L1-icache-load-misses
> >>>     <not supported>      dTLB-loads
> >>>              19,619      dTLB-load-misses
> >>>     <not supported>      iTLB-loads
> >>>               6,831      iTLB-load-misses
> >>>     <not supported>      L1-dcache-prefetches
> >>>     <not supported>      L1-dcache-prefetch-misses
> >>>
> >>>         0.006085625 seconds time elapsed
> >>>
> >>>         0.000000000 seconds user
> >>>         0.013022000 seconds sys
> >>>
> >>>
> >>>   drivers/perf/riscv_pmu_sbi.c | 37 ++++++++++++++++++++++++++++++++++--
> >>>   1 file changed, 35 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
> >>> index 16acd4dcdb96..b58a70ee8317 100644
> >>> --- a/drivers/perf/riscv_pmu_sbi.c
> >>> +++ b/drivers/perf/riscv_pmu_sbi.c
> >>> @@ -86,7 +86,7 @@ struct sbi_pmu_event_data {
> >>>          };
> >>>   };
> >>>
> >>> -static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
> >>> +static struct sbi_pmu_event_data pmu_hw_event_map[] = {
> >>>          [PERF_COUNT_HW_CPU_CYCLES]              = {.hw_gen_event = {
> >>>                                                          SBI_PMU_HW_CPU_CYCLES,
> >>>                                                          SBI_PMU_EVENT_TYPE_HW, 0}},
> >>> @@ -120,7 +120,7 @@ static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
> >>>   };
> >>>
> >>>   #define C(x) PERF_COUNT_HW_CACHE_##x
> >>> -static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
> >>> +static struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
> >>>   [PERF_COUNT_HW_CACHE_OP_MAX]
> >>>   [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> >>>          [C(L1D)] = {
> >>> @@ -265,6 +265,36 @@ static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_M
> >>>          },
> >>>   };
> >>>
> >>> +static void pmu_sbi_check_event(struct sbi_pmu_event_data *edata)
> >>> +{
> >>> +       struct sbiret ret;
> >>> +
> >>> +       ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH,
> >>> +                       0, cmask, 0, edata->event_idx, 0, 0);
> >>> +       if (!ret.error) {
> >>> +               sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
> >>> +                         ret.value, 0x1, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
> >>> +       } else if (ret.error == SBI_ERR_NOT_SUPPORTED) {
> >>> +               /* This event cannot be monitored by any counter */
> >>> +               edata->event_idx = -EINVAL;
> >>> +       }
> >>> +}
> >>> +
> >>> +static void pmu_sbi_update_events(void)
> >>> +{
> >>> +       /* Ensure events are not already mapped to a counter */
> >>> +       sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
> >>> +                 0, cmask, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
> >>> +
> >>> +       for (int i = 0; i < ARRAY_SIZE(pmu_hw_event_map); i++)
> >>> +               pmu_sbi_check_event(&pmu_hw_event_map[i]);
> >>> +
> >>> +       for (int i = 0; i < ARRAY_SIZE(pmu_cache_event_map); i++)
> >>> +               for (int j = 0; j < ARRAY_SIZE(pmu_cache_event_map[i]); j++)
> >>> +                       for (int k = 0; k < ARRAY_SIZE(pmu_cache_event_map[i][j]); k++)
> >>> +                               pmu_sbi_check_event(&pmu_cache_event_map[i][j][k]);
> >>> +}
> >>> +
> >>>   static int pmu_sbi_ctr_get_width(int idx)
> >>>   {
> >>>          return pmu_ctr_list[idx].width;
> >>> @@ -1046,6 +1076,9 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
> >>>          if (pmu_sbi_get_ctrinfo(num_counters, &cmask))
> >>>                  goto out_free;
> >>>
> >>> +       /* Check which standard events are available */
> >>> +       pmu_sbi_update_events();
> >>> +
> >>>          ret = pmu_sbi_setup_irqs(pmu, pdev);
> >>>          if (ret < 0) {
> >>>                  pr_info("Perf sampling/filtering is not supported as sscof extension is not available\n");
> >>> --
> >>> 2.42.0
> >
> >
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-riscv
>
Ian Rogers April 23, 2024, 3:44 a.m. UTC | #5
On Mon, Apr 15, 2024 at 9:07 PM Anup Patel <anup@brainfault.org> wrote:
>
> On Tue, Apr 16, 2024 at 5:31 AM Atish Patra <atishp@rivosinc.com> wrote:
> >
> > On 4/10/24 18:40, Samuel Holland wrote:
> > > Hi Atish,
> > >
> > > On 2024-03-18 2:44 PM, Atish Patra wrote:
> > >> On Wed, Jan 3, 2024 at 8:54 AM Samuel Holland <samuel.holland@sifive.com> wrote:
> > >>>
> > >>> The RISC-V SBI PMU specification defines several standard hardware and
> > >>> cache events. Currently, all of these events appear in the `perf list`
> > >>> output, even if they are not actually implemented. Add logic to check
> > >>> which events are supported by the hardware (i.e. can be mapped to some
> > >>> counter), so only usable events are reported to userspace.
> > >>>
> > >>
> > >> Thanks for the patch.
> > >> This adds tons of SBI calls at every boot for a use case which is at
> > >> best confusing for a subset of users who actually wants to run perf.
> > >
> > > I should have been clearer in the patch description. This is not just a cosmetic
> > > change; because perf sees these as valid events, it tries to use them in
> > > commands like `perf stat`. When the error from SBI_EXT_PMU_COUNTER_CFG_MATCH
> > > causes the ->add() callback to fail, this prevents any other events from being
> > > scheduled on that same CPU (search can_add_hw in kernel/events/core.c). That is
> > > why the dTLB/iTLB miss counts are missing in the "before" example below.
> > >
> >
> > Thanks for explaining the problem. I can reproduce it in qemu as well if
> > enough number of invalid events given on the command line and the
> > workload is short enough.
> >
> > >> This probing can be done at runtime by invoking the
> > >> pmu_sbi_check_event from pmu_sbi_event_map.
> > >> We can update the event map after that so that it doesn't need to
> > >> invoke pmu_sbi_check_event next time.
> > >
> > > I tried to implement this suggestion, but it did not work. The SBI interface
> > > does not distinguish between "none of the counters can monitor the specified
> > > event [because the event is unsupported]" and "none of the counters can monitor
> > > the specified event [because the counters are busy]". It is not sufficient for
> > > the kernel to verify that at least one counter is available before performing
> > > the check, because certain events may only be usable on a subset of counters
> > > (per riscv,event-to-mhpmcounters), and the kernel does not know that mapping.
> > >
> >
> > Yeah. My suggestion was to fix the perf list issue which is different
> > than the issue reported now.
> >
> > > As a result, checking for event support is only reliable when none of the
> > > counters are in use. So the check can be asynchronous/deferred to later in the
> > > boot process, but I believe it still needs to be performed for all events before
> > > userspace starts using the counters.
> > >
> >
> > We should defer it a work queue for sure. We can also look at improving
> > SBI PMU extension to support bulk matching behavior as well.
> >
> > However, I think a better solution would be to just rely on the json
> > file mappings instead of making SBI calls. We are going to have the
> > event encoding and mappings in the json in the future.
>
> The problem with JSON based event encoding is how to deal in-case
> we are running inside Guest/VM because Host could be anything.
>
> IMO, the JSON based approach is not suitable for SBI PMU. For now,
> we either defer the work using the work queue or keep the approach
> of this patch as-is.
>
> The good part about SBI PMU extension is that we do have a standard
> set of events and we only need a way to discover supported standard
> events with a minimum number of SBI calls. It is better to add a new
> SBI PMU call to assist supported event discovery which will also
> help us virtualize it.
>
> Regards,
> Anup

+Ian Rogers

`perf list` will already filter some events depending on whether the
PMU supports them, for example, legacy cache events. I think we can
extend this to json events.

Thanks,
Ian

> >
> > I had added it only for platforms with counter delegation[1] but I think
> > this can be generalized for platforms with SBI PMU as well.
> >
> > I had some hacks to specify the legacy event encodings but Ian rogers
> > improved with a generic support by preferring sysfs/json event encodings
> > over fixed ones. I am yet to rebase and try Ian's series on top of the
> > counter delegation though. Thoughts ?
> >
> > [1]
> > https://lore.kernel.org/lkml/20240217005738.3744121-1-atishp@rivosinc.com/
> > [2]
> > https://lore.kernel.org/bpf/20240415063626.453987-2-irogers@google.com/T/
> >
> >
> > > Regards,
> > > Samuel
> > >
> > >>> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
> > >>> ---
> > >>> Before this patch:
> > >>> $ perf list hw
> > >>>
> > >>> List of pre-defined events (to be used in -e or -M):
> > >>>
> > >>>    branch-instructions OR branches                    [Hardware event]
> > >>>    branch-misses                                      [Hardware event]
> > >>>    bus-cycles                                         [Hardware event]
> > >>>    cache-misses                                       [Hardware event]
> > >>>    cache-references                                   [Hardware event]
> > >>>    cpu-cycles OR cycles                               [Hardware event]
> > >>>    instructions                                       [Hardware event]
> > >>>    ref-cycles                                         [Hardware event]
> > >>>    stalled-cycles-backend OR idle-cycles-backend      [Hardware event]
> > >>>    stalled-cycles-frontend OR idle-cycles-frontend    [Hardware event]
> > >>>
> > >>> $ perf stat -ddd true
> > >>>
> > >>>   Performance counter stats for 'true':
> > >>>
> > >>>                4.36 msec task-clock                       #    0.744 CPUs utilized
> > >>>                   1      context-switches                 #  229.325 /sec
> > >>>                   0      cpu-migrations                   #    0.000 /sec
> > >>>                  38      page-faults                      #    8.714 K/sec
> > >>>           4,375,694      cycles                           #    1.003 GHz                         (60.64%)
> > >>>             728,945      instructions                     #    0.17  insn per cycle
> > >>>              79,199      branches                         #   18.162 M/sec
> > >>>              17,709      branch-misses                    #   22.36% of all branches
> > >>>             181,734      L1-dcache-loads                  #   41.676 M/sec
> > >>>               5,547      L1-dcache-load-misses            #    3.05% of all L1-dcache accesses
> > >>>       <not counted>      LLC-loads                                                               (0.00%)
> > >>>       <not counted>      LLC-load-misses                                                         (0.00%)
> > >>>       <not counted>      L1-icache-loads                                                         (0.00%)
> > >>>       <not counted>      L1-icache-load-misses                                                   (0.00%)
> > >>>       <not counted>      dTLB-loads                                                              (0.00%)
> > >>>       <not counted>      dTLB-load-misses                                                        (0.00%)
> > >>>       <not counted>      iTLB-loads                                                              (0.00%)
> > >>>       <not counted>      iTLB-load-misses                                                        (0.00%)
> > >>>       <not counted>      L1-dcache-prefetches                                                    (0.00%)
> > >>>       <not counted>      L1-dcache-prefetch-misses                                               (0.00%)
> > >>>
> > >>>         0.005860375 seconds time elapsed
> > >>>
> > >>>         0.000000000 seconds user
> > >>>         0.010383000 seconds sys
> > >>>
> > >>> After this patch:
> > >>> $ perf list hw
> > >>>
> > >>> List of pre-defined events (to be used in -e or -M):
> > >>>
> > >>>    branch-instructions OR branches                    [Hardware event]
> > >>>    branch-misses                                      [Hardware event]
> > >>>    cache-misses                                       [Hardware event]
> > >>>    cache-references                                   [Hardware event]
> > >>>    cpu-cycles OR cycles                               [Hardware event]
> > >>>    instructions                                       [Hardware event]
> > >>>
> > >>> $ perf stat -ddd true
> > >>>
> > >>>   Performance counter stats for 'true':
> > >>>
> > >>>                5.16 msec task-clock                       #    0.848 CPUs utilized
> > >>>                   1      context-switches                 #  193.817 /sec
> > >>>                   0      cpu-migrations                   #    0.000 /sec
> > >>>                  37      page-faults                      #    7.171 K/sec
> > >>>           5,183,625      cycles                           #    1.005 GHz
> > >>>             961,696      instructions                     #    0.19  insn per cycle
> > >>>              85,853      branches                         #   16.640 M/sec
> > >>>              20,462      branch-misses                    #   23.83% of all branches
> > >>>             243,545      L1-dcache-loads                  #   47.203 M/sec
> > >>>               5,974      L1-dcache-load-misses            #    2.45% of all L1-dcache accesses
> > >>>     <not supported>      LLC-loads
> > >>>     <not supported>      LLC-load-misses
> > >>>     <not supported>      L1-icache-loads
> > >>>     <not supported>      L1-icache-load-misses
> > >>>     <not supported>      dTLB-loads
> > >>>              19,619      dTLB-load-misses
> > >>>     <not supported>      iTLB-loads
> > >>>               6,831      iTLB-load-misses
> > >>>     <not supported>      L1-dcache-prefetches
> > >>>     <not supported>      L1-dcache-prefetch-misses
> > >>>
> > >>>         0.006085625 seconds time elapsed
> > >>>
> > >>>         0.000000000 seconds user
> > >>>         0.013022000 seconds sys
> > >>>
> > >>>
> > >>>   drivers/perf/riscv_pmu_sbi.c | 37 ++++++++++++++++++++++++++++++++++--
> > >>>   1 file changed, 35 insertions(+), 2 deletions(-)
> > >>>
> > >>> diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
> > >>> index 16acd4dcdb96..b58a70ee8317 100644
> > >>> --- a/drivers/perf/riscv_pmu_sbi.c
> > >>> +++ b/drivers/perf/riscv_pmu_sbi.c
> > >>> @@ -86,7 +86,7 @@ struct sbi_pmu_event_data {
> > >>>          };
> > >>>   };
> > >>>
> > >>> -static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
> > >>> +static struct sbi_pmu_event_data pmu_hw_event_map[] = {
> > >>>          [PERF_COUNT_HW_CPU_CYCLES]              = {.hw_gen_event = {
> > >>>                                                          SBI_PMU_HW_CPU_CYCLES,
> > >>>                                                          SBI_PMU_EVENT_TYPE_HW, 0}},
> > >>> @@ -120,7 +120,7 @@ static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
> > >>>   };
> > >>>
> > >>>   #define C(x) PERF_COUNT_HW_CACHE_##x
> > >>> -static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
> > >>> +static struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
> > >>>   [PERF_COUNT_HW_CACHE_OP_MAX]
> > >>>   [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> > >>>          [C(L1D)] = {
> > >>> @@ -265,6 +265,36 @@ static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_M
> > >>>          },
> > >>>   };
> > >>>
> > >>> +static void pmu_sbi_check_event(struct sbi_pmu_event_data *edata)
> > >>> +{
> > >>> +       struct sbiret ret;
> > >>> +
> > >>> +       ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH,
> > >>> +                       0, cmask, 0, edata->event_idx, 0, 0);
> > >>> +       if (!ret.error) {
> > >>> +               sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
> > >>> +                         ret.value, 0x1, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
> > >>> +       } else if (ret.error == SBI_ERR_NOT_SUPPORTED) {
> > >>> +               /* This event cannot be monitored by any counter */
> > >>> +               edata->event_idx = -EINVAL;
> > >>> +       }
> > >>> +}
> > >>> +
> > >>> +static void pmu_sbi_update_events(void)
> > >>> +{
> > >>> +       /* Ensure events are not already mapped to a counter */
> > >>> +       sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
> > >>> +                 0, cmask, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
> > >>> +
> > >>> +       for (int i = 0; i < ARRAY_SIZE(pmu_hw_event_map); i++)
> > >>> +               pmu_sbi_check_event(&pmu_hw_event_map[i]);
> > >>> +
> > >>> +       for (int i = 0; i < ARRAY_SIZE(pmu_cache_event_map); i++)
> > >>> +               for (int j = 0; j < ARRAY_SIZE(pmu_cache_event_map[i]); j++)
> > >>> +                       for (int k = 0; k < ARRAY_SIZE(pmu_cache_event_map[i][j]); k++)
> > >>> +                               pmu_sbi_check_event(&pmu_cache_event_map[i][j][k]);
> > >>> +}
> > >>> +
> > >>>   static int pmu_sbi_ctr_get_width(int idx)
> > >>>   {
> > >>>          return pmu_ctr_list[idx].width;
> > >>> @@ -1046,6 +1076,9 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
> > >>>          if (pmu_sbi_get_ctrinfo(num_counters, &cmask))
> > >>>                  goto out_free;
> > >>>
> > >>> +       /* Check which standard events are available */
> > >>> +       pmu_sbi_update_events();
> > >>> +
> > >>>          ret = pmu_sbi_setup_irqs(pmu, pdev);
> > >>>          if (ret < 0) {
> > >>>                  pr_info("Perf sampling/filtering is not supported as sscof extension is not available\n");
> > >>> --
> > >>> 2.42.0
> > >
> > >
> > > _______________________________________________
> > > linux-riscv mailing list
> > > linux-riscv@lists.infradead.org
> > > http://lists.infradead.org/mailman/listinfo/linux-riscv
> >
>
Atish Kumar Patra April 24, 2024, 12:36 a.m. UTC | #6
On Mon, Apr 22, 2024 at 8:44 PM Ian Rogers <irogers@google.com> wrote:
>
> On Mon, Apr 15, 2024 at 9:07 PM Anup Patel <anup@brainfault.org> wrote:
> >
> > On Tue, Apr 16, 2024 at 5:31 AM Atish Patra <atishp@rivosinc.com> wrote:
> > >
> > > On 4/10/24 18:40, Samuel Holland wrote:
> > > > Hi Atish,
> > > >
> > > > On 2024-03-18 2:44 PM, Atish Patra wrote:
> > > >> On Wed, Jan 3, 2024 at 8:54 AM Samuel Holland <samuel.holland@sifive.com> wrote:
> > > >>>
> > > >>> The RISC-V SBI PMU specification defines several standard hardware and
> > > >>> cache events. Currently, all of these events appear in the `perf list`
> > > >>> output, even if they are not actually implemented. Add logic to check
> > > >>> which events are supported by the hardware (i.e. can be mapped to some
> > > >>> counter), so only usable events are reported to userspace.
> > > >>>
> > > >>
> > > >> Thanks for the patch.
> > > >> This adds tons of SBI calls at every boot for a use case which is at
> > > >> best confusing for a subset of users who actually wants to run perf.
> > > >
> > > > I should have been clearer in the patch description. This is not just a cosmetic
> > > > change; because perf sees these as valid events, it tries to use them in
> > > > commands like `perf stat`. When the error from SBI_EXT_PMU_COUNTER_CFG_MATCH
> > > > causes the ->add() callback to fail, this prevents any other events from being
> > > > scheduled on that same CPU (search can_add_hw in kernel/events/core.c). That is
> > > > why the dTLB/iTLB miss counts are missing in the "before" example below.
> > > >
> > >
> > > Thanks for explaining the problem. I can reproduce it in qemu as well if
> > > enough number of invalid events given on the command line and the
> > > workload is short enough.
> > >
> > > >> This probing can be done at runtime by invoking the
> > > >> pmu_sbi_check_event from pmu_sbi_event_map.
> > > >> We can update the event map after that so that it doesn't need to
> > > >> invoke pmu_sbi_check_event next time.
> > > >
> > > > I tried to implement this suggestion, but it did not work. The SBI interface
> > > > does not distinguish between "none of the counters can monitor the specified
> > > > event [because the event is unsupported]" and "none of the counters can monitor
> > > > the specified event [because the counters are busy]". It is not sufficient for
> > > > the kernel to verify that at least one counter is available before performing
> > > > the check, because certain events may only be usable on a subset of counters
> > > > (per riscv,event-to-mhpmcounters), and the kernel does not know that mapping.
> > > >
> > >
> > > Yeah. My suggestion was to fix the perf list issue which is different
> > > than the issue reported now.
> > >
> > > > As a result, checking for event support is only reliable when none of the
> > > > counters are in use. So the check can be asynchronous/deferred to later in the
> > > > boot process, but I believe it still needs to be performed for all events before
> > > > userspace starts using the counters.
> > > >
> > >
> > > We should defer it a work queue for sure. We can also look at improving
> > > SBI PMU extension to support bulk matching behavior as well.
> > >
> > > However, I think a better solution would be to just rely on the json
> > > file mappings instead of making SBI calls. We are going to have the
> > > event encoding and mappings in the json in the future.
> >
> > The problem with JSON based event encoding is how to deal in-case
> > we are running inside Guest/VM because Host could be anything.
> >
> > IMO, the JSON based approach is not suitable for SBI PMU. For now,
> > we either defer the work using the work queue or keep the approach
> > of this patch as-is.
> >
> > The good part about SBI PMU extension is that we do have a standard
> > set of events and we only need a way to discover supported standard
> > events with a minimum number of SBI calls. It is better to add a new
> > SBI PMU call to assist supported event discovery which will also
> > help us virtualize it.
> >
> > Regards,
> > Anup
>
> +Ian Rogers
>
> `perf list` will already filter some events depending on whether the
> PMU supports them, for example, legacy cache events. I think we can
> extend this to json events.
>

Yes. That's what I was thinking as well. However, that may be a
problem in virtualization
as Anup pointed out.

As per my understanding, cloud providers provide json files for VMs
based on the host
architecture and allow migration only between hosts with the same
family of cpu. In RISC-V, the mapfile.csv works based on 3 registers
indicating marchid, mimpid, and mvendorid. Thus, the json file has to
be tied with the host machine it is going to be run.

We will end up doing the same if we only rely on the json file to
filter events in the future. Please let me know if the assumption is
incorrect.

If we allow a SBI call route to discover which events are supported,
the guest can always support legacy events on any host even though it
doesn't have a json file.

+Andrew Jones :
Any thoughts ?

> Thanks,
> Ian
>
> > >
> > > I had added it only for platforms with counter delegation[1] but I think
> > > this can be generalized for platforms with SBI PMU as well.
> > >
> > > I had some hacks to specify the legacy event encodings but Ian rogers
> > > improved with a generic support by preferring sysfs/json event encodings
> > > over fixed ones. I am yet to rebase and try Ian's series on top of the
> > > counter delegation though. Thoughts ?
> > >
> > > [1]
> > > https://lore.kernel.org/lkml/20240217005738.3744121-1-atishp@rivosinc.com/
> > > [2]
> > > https://lore.kernel.org/bpf/20240415063626.453987-2-irogers@google.com/T/
> > >
> > >
> > > > Regards,
> > > > Samuel
> > > >
> > > >>> Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
> > > >>> ---
> > > >>> Before this patch:
> > > >>> $ perf list hw
> > > >>>
> > > >>> List of pre-defined events (to be used in -e or -M):
> > > >>>
> > > >>>    branch-instructions OR branches                    [Hardware event]
> > > >>>    branch-misses                                      [Hardware event]
> > > >>>    bus-cycles                                         [Hardware event]
> > > >>>    cache-misses                                       [Hardware event]
> > > >>>    cache-references                                   [Hardware event]
> > > >>>    cpu-cycles OR cycles                               [Hardware event]
> > > >>>    instructions                                       [Hardware event]
> > > >>>    ref-cycles                                         [Hardware event]
> > > >>>    stalled-cycles-backend OR idle-cycles-backend      [Hardware event]
> > > >>>    stalled-cycles-frontend OR idle-cycles-frontend    [Hardware event]
> > > >>>
> > > >>> $ perf stat -ddd true
> > > >>>
> > > >>>   Performance counter stats for 'true':
> > > >>>
> > > >>>                4.36 msec task-clock                       #    0.744 CPUs utilized
> > > >>>                   1      context-switches                 #  229.325 /sec
> > > >>>                   0      cpu-migrations                   #    0.000 /sec
> > > >>>                  38      page-faults                      #    8.714 K/sec
> > > >>>           4,375,694      cycles                           #    1.003 GHz                         (60.64%)
> > > >>>             728,945      instructions                     #    0.17  insn per cycle
> > > >>>              79,199      branches                         #   18.162 M/sec
> > > >>>              17,709      branch-misses                    #   22.36% of all branches
> > > >>>             181,734      L1-dcache-loads                  #   41.676 M/sec
> > > >>>               5,547      L1-dcache-load-misses            #    3.05% of all L1-dcache accesses
> > > >>>       <not counted>      LLC-loads                                                               (0.00%)
> > > >>>       <not counted>      LLC-load-misses                                                         (0.00%)
> > > >>>       <not counted>      L1-icache-loads                                                         (0.00%)
> > > >>>       <not counted>      L1-icache-load-misses                                                   (0.00%)
> > > >>>       <not counted>      dTLB-loads                                                              (0.00%)
> > > >>>       <not counted>      dTLB-load-misses                                                        (0.00%)
> > > >>>       <not counted>      iTLB-loads                                                              (0.00%)
> > > >>>       <not counted>      iTLB-load-misses                                                        (0.00%)
> > > >>>       <not counted>      L1-dcache-prefetches                                                    (0.00%)
> > > >>>       <not counted>      L1-dcache-prefetch-misses                                               (0.00%)
> > > >>>
> > > >>>         0.005860375 seconds time elapsed
> > > >>>
> > > >>>         0.000000000 seconds user
> > > >>>         0.010383000 seconds sys
> > > >>>
> > > >>> After this patch:
> > > >>> $ perf list hw
> > > >>>
> > > >>> List of pre-defined events (to be used in -e or -M):
> > > >>>
> > > >>>    branch-instructions OR branches                    [Hardware event]
> > > >>>    branch-misses                                      [Hardware event]
> > > >>>    cache-misses                                       [Hardware event]
> > > >>>    cache-references                                   [Hardware event]
> > > >>>    cpu-cycles OR cycles                               [Hardware event]
> > > >>>    instructions                                       [Hardware event]
> > > >>>
> > > >>> $ perf stat -ddd true
> > > >>>
> > > >>>   Performance counter stats for 'true':
> > > >>>
> > > >>>                5.16 msec task-clock                       #    0.848 CPUs utilized
> > > >>>                   1      context-switches                 #  193.817 /sec
> > > >>>                   0      cpu-migrations                   #    0.000 /sec
> > > >>>                  37      page-faults                      #    7.171 K/sec
> > > >>>           5,183,625      cycles                           #    1.005 GHz
> > > >>>             961,696      instructions                     #    0.19  insn per cycle
> > > >>>              85,853      branches                         #   16.640 M/sec
> > > >>>              20,462      branch-misses                    #   23.83% of all branches
> > > >>>             243,545      L1-dcache-loads                  #   47.203 M/sec
> > > >>>               5,974      L1-dcache-load-misses            #    2.45% of all L1-dcache accesses
> > > >>>     <not supported>      LLC-loads
> > > >>>     <not supported>      LLC-load-misses
> > > >>>     <not supported>      L1-icache-loads
> > > >>>     <not supported>      L1-icache-load-misses
> > > >>>     <not supported>      dTLB-loads
> > > >>>              19,619      dTLB-load-misses
> > > >>>     <not supported>      iTLB-loads
> > > >>>               6,831      iTLB-load-misses
> > > >>>     <not supported>      L1-dcache-prefetches
> > > >>>     <not supported>      L1-dcache-prefetch-misses
> > > >>>
> > > >>>         0.006085625 seconds time elapsed
> > > >>>
> > > >>>         0.000000000 seconds user
> > > >>>         0.013022000 seconds sys
> > > >>>
> > > >>>
> > > >>>   drivers/perf/riscv_pmu_sbi.c | 37 ++++++++++++++++++++++++++++++++++--
> > > >>>   1 file changed, 35 insertions(+), 2 deletions(-)
> > > >>>
> > > >>> diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
> > > >>> index 16acd4dcdb96..b58a70ee8317 100644
> > > >>> --- a/drivers/perf/riscv_pmu_sbi.c
> > > >>> +++ b/drivers/perf/riscv_pmu_sbi.c
> > > >>> @@ -86,7 +86,7 @@ struct sbi_pmu_event_data {
> > > >>>          };
> > > >>>   };
> > > >>>
> > > >>> -static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
> > > >>> +static struct sbi_pmu_event_data pmu_hw_event_map[] = {
> > > >>>          [PERF_COUNT_HW_CPU_CYCLES]              = {.hw_gen_event = {
> > > >>>                                                          SBI_PMU_HW_CPU_CYCLES,
> > > >>>                                                          SBI_PMU_EVENT_TYPE_HW, 0}},
> > > >>> @@ -120,7 +120,7 @@ static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
> > > >>>   };
> > > >>>
> > > >>>   #define C(x) PERF_COUNT_HW_CACHE_##x
> > > >>> -static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
> > > >>> +static struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
> > > >>>   [PERF_COUNT_HW_CACHE_OP_MAX]
> > > >>>   [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> > > >>>          [C(L1D)] = {
> > > >>> @@ -265,6 +265,36 @@ static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_M
> > > >>>          },
> > > >>>   };
> > > >>>
> > > >>> +static void pmu_sbi_check_event(struct sbi_pmu_event_data *edata)
> > > >>> +{
> > > >>> +       struct sbiret ret;
> > > >>> +
> > > >>> +       ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH,
> > > >>> +                       0, cmask, 0, edata->event_idx, 0, 0);
> > > >>> +       if (!ret.error) {
> > > >>> +               sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
> > > >>> +                         ret.value, 0x1, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
> > > >>> +       } else if (ret.error == SBI_ERR_NOT_SUPPORTED) {
> > > >>> +               /* This event cannot be monitored by any counter */
> > > >>> +               edata->event_idx = -EINVAL;
> > > >>> +       }
> > > >>> +}
> > > >>> +
> > > >>> +static void pmu_sbi_update_events(void)
> > > >>> +{
> > > >>> +       /* Ensure events are not already mapped to a counter */
> > > >>> +       sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
> > > >>> +                 0, cmask, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
> > > >>> +
> > > >>> +       for (int i = 0; i < ARRAY_SIZE(pmu_hw_event_map); i++)
> > > >>> +               pmu_sbi_check_event(&pmu_hw_event_map[i]);
> > > >>> +
> > > >>> +       for (int i = 0; i < ARRAY_SIZE(pmu_cache_event_map); i++)
> > > >>> +               for (int j = 0; j < ARRAY_SIZE(pmu_cache_event_map[i]); j++)
> > > >>> +                       for (int k = 0; k < ARRAY_SIZE(pmu_cache_event_map[i][j]); k++)
> > > >>> +                               pmu_sbi_check_event(&pmu_cache_event_map[i][j][k]);
> > > >>> +}
> > > >>> +
> > > >>>   static int pmu_sbi_ctr_get_width(int idx)
> > > >>>   {
> > > >>>          return pmu_ctr_list[idx].width;
> > > >>> @@ -1046,6 +1076,9 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
> > > >>>          if (pmu_sbi_get_ctrinfo(num_counters, &cmask))
> > > >>>                  goto out_free;
> > > >>>
> > > >>> +       /* Check which standard events are available */
> > > >>> +       pmu_sbi_update_events();
> > > >>> +
> > > >>>          ret = pmu_sbi_setup_irqs(pmu, pdev);
> > > >>>          if (ret < 0) {
> > > >>>                  pr_info("Perf sampling/filtering is not supported as sscof extension is not available\n");
> > > >>> --
> > > >>> 2.42.0
> > > >
> > > >
> > > > _______________________________________________
> > > > linux-riscv mailing list
> > > > linux-riscv@lists.infradead.org
> > > > http://lists.infradead.org/mailman/listinfo/linux-riscv
> > >
> >
Andrew Jones April 24, 2024, 1:31 p.m. UTC | #7
On Tue, Apr 23, 2024 at 05:36:43PM -0700, Atish Kumar Patra wrote:
> On Mon, Apr 22, 2024 at 8:44 PM Ian Rogers <irogers@google.com> wrote:
> >
> > On Mon, Apr 15, 2024 at 9:07 PM Anup Patel <anup@brainfault.org> wrote:
> > >
> > > On Tue, Apr 16, 2024 at 5:31 AM Atish Patra <atishp@rivosinc.com> wrote:
> > > >
> > > > On 4/10/24 18:40, Samuel Holland wrote:
> > > > > Hi Atish,
> > > > >
> > > > > On 2024-03-18 2:44 PM, Atish Patra wrote:
> > > > >> On Wed, Jan 3, 2024 at 8:54 AM Samuel Holland <samuel.holland@sifive.com> wrote:
> > > > >>>
> > > > >>> The RISC-V SBI PMU specification defines several standard hardware and
> > > > >>> cache events. Currently, all of these events appear in the `perf list`
> > > > >>> output, even if they are not actually implemented. Add logic to check
> > > > >>> which events are supported by the hardware (i.e. can be mapped to some
> > > > >>> counter), so only usable events are reported to userspace.
> > > > >>>
> > > > >>
> > > > >> Thanks for the patch.
> > > > >> This adds tons of SBI calls at every boot for a use case which is at
> > > > >> best confusing for a subset of users who actually wants to run perf.
> > > > >
> > > > > I should have been clearer in the patch description. This is not just a cosmetic
> > > > > change; because perf sees these as valid events, it tries to use them in
> > > > > commands like `perf stat`. When the error from SBI_EXT_PMU_COUNTER_CFG_MATCH
> > > > > causes the ->add() callback to fail, this prevents any other events from being
> > > > > scheduled on that same CPU (search can_add_hw in kernel/events/core.c). That is
> > > > > why the dTLB/iTLB miss counts are missing in the "before" example below.
> > > > >
> > > >
> > > > Thanks for explaining the problem. I can reproduce it in qemu as well if
> > > > enough number of invalid events given on the command line and the
> > > > workload is short enough.
> > > >
> > > > >> This probing can be done at runtime by invoking the
> > > > >> pmu_sbi_check_event from pmu_sbi_event_map.
> > > > >> We can update the event map after that so that it doesn't need to
> > > > >> invoke pmu_sbi_check_event next time.
> > > > >
> > > > > I tried to implement this suggestion, but it did not work. The SBI interface
> > > > > does not distinguish between "none of the counters can monitor the specified
> > > > > event [because the event is unsupported]" and "none of the counters can monitor
> > > > > the specified event [because the counters are busy]". It is not sufficient for
> > > > > the kernel to verify that at least one counter is available before performing
> > > > > the check, because certain events may only be usable on a subset of counters
> > > > > (per riscv,event-to-mhpmcounters), and the kernel does not know that mapping.
> > > > >
> > > >
> > > > Yeah. My suggestion was to fix the perf list issue which is different
> > > > than the issue reported now.
> > > >
> > > > > As a result, checking for event support is only reliable when none of the
> > > > > counters are in use. So the check can be asynchronous/deferred to later in the
> > > > > boot process, but I believe it still needs to be performed for all events before
> > > > > userspace starts using the counters.
> > > > >
> > > >
> > > > We should defer it a work queue for sure. We can also look at improving
> > > > SBI PMU extension to support bulk matching behavior as well.
> > > >
> > > > However, I think a better solution would be to just rely on the json
> > > > file mappings instead of making SBI calls. We are going to have the
> > > > event encoding and mappings in the json in the future.
> > >
> > > The problem with JSON based event encoding is how to deal in-case
> > > we are running inside Guest/VM because Host could be anything.
> > >
> > > IMO, the JSON based approach is not suitable for SBI PMU. For now,
> > > we either defer the work using the work queue or keep the approach
> > > of this patch as-is.
> > >
> > > The good part about SBI PMU extension is that we do have a standard
> > > set of events and we only need a way to discover supported standard
> > > events with a minimum number of SBI calls. It is better to add a new
> > > SBI PMU call to assist supported event discovery which will also
> > > help us virtualize it.
> > >
> > > Regards,
> > > Anup
> >
> > +Ian Rogers
> >
> > `perf list` will already filter some events depending on whether the
> > PMU supports them, for example, legacy cache events. I think we can
> > extend this to json events.
> >
> 
> Yes. That's what I was thinking as well. However, that may be a
> problem in virtualization
> as Anup pointed out.
> 
> As per my understanding, cloud providers provide json files for VMs
> based on the host
> architecture and allow migration only between hosts with the same
> family of cpu. In RISC-V, the mapfile.csv works based on 3 registers
> indicating marchid, mimpid, and mvendorid. Thus, the json file has to
> be tied with the host machine it is going to be run.

This is also my understanding. That is, that cloud instances typically
dedicate CPUs to VMs and don't try to present CPU models to VMs which
don't exactly match the host's CPUs. The remaining concern would be if
the hypervisor doesn't emulate/passthrough everything the json describes
for the host CPU type.

However, this is just "typical" clouds. All bets are off for general
virtualization, as Anup points out.

> 
> We will end up doing the same if we only rely on the json file to
> filter events in the future. Please let me know if the assumption is
> incorrect.
> 
> If we allow a SBI call route to discover which events are supported,
> the guest can always support legacy events on any host even though it
> doesn't have a json file.

Yes, I think we need a solution which works even without a json file,
since a VM may use a special mvendorid,marchid,mimpid triplet to
describe a more generic CPU type. Unless we also create json files
for these VCPUs, or provide other event discovery mechanisms, then
the VMs will not have anything.

Thanks,
drew
Ian Rogers April 24, 2024, 6:27 p.m. UTC | #8
On Wed, Apr 24, 2024 at 6:31 AM Andrew Jones <ajones@ventanamicro.com> wrote:
>
> On Tue, Apr 23, 2024 at 05:36:43PM -0700, Atish Kumar Patra wrote:
> > On Mon, Apr 22, 2024 at 8:44 PM Ian Rogers <irogers@google.com> wrote:
> > >
> > > On Mon, Apr 15, 2024 at 9:07 PM Anup Patel <anup@brainfault.org> wrote:
> > > >
> > > > On Tue, Apr 16, 2024 at 5:31 AM Atish Patra <atishp@rivosinc.com> wrote:
> > > > >
> > > > > On 4/10/24 18:40, Samuel Holland wrote:
> > > > > > Hi Atish,
> > > > > >
> > > > > > On 2024-03-18 2:44 PM, Atish Patra wrote:
> > > > > >> On Wed, Jan 3, 2024 at 8:54 AM Samuel Holland <samuel.holland@sifive.com> wrote:
> > > > > >>>
> > > > > >>> The RISC-V SBI PMU specification defines several standard hardware and
> > > > > >>> cache events. Currently, all of these events appear in the `perf list`
> > > > > >>> output, even if they are not actually implemented. Add logic to check
> > > > > >>> which events are supported by the hardware (i.e. can be mapped to some
> > > > > >>> counter), so only usable events are reported to userspace.
> > > > > >>>
> > > > > >>
> > > > > >> Thanks for the patch.
> > > > > >> This adds tons of SBI calls at every boot for a use case which is at
> > > > > >> best confusing for a subset of users who actually wants to run perf.
> > > > > >
> > > > > > I should have been clearer in the patch description. This is not just a cosmetic
> > > > > > change; because perf sees these as valid events, it tries to use them in
> > > > > > commands like `perf stat`. When the error from SBI_EXT_PMU_COUNTER_CFG_MATCH
> > > > > > causes the ->add() callback to fail, this prevents any other events from being
> > > > > > scheduled on that same CPU (search can_add_hw in kernel/events/core.c). That is
> > > > > > why the dTLB/iTLB miss counts are missing in the "before" example below.
> > > > > >
> > > > >
> > > > > Thanks for explaining the problem. I can reproduce it in qemu as well if
> > > > > enough number of invalid events given on the command line and the
> > > > > workload is short enough.
> > > > >
> > > > > >> This probing can be done at runtime by invoking the
> > > > > >> pmu_sbi_check_event from pmu_sbi_event_map.
> > > > > >> We can update the event map after that so that it doesn't need to
> > > > > >> invoke pmu_sbi_check_event next time.
> > > > > >
> > > > > > I tried to implement this suggestion, but it did not work. The SBI interface
> > > > > > does not distinguish between "none of the counters can monitor the specified
> > > > > > event [because the event is unsupported]" and "none of the counters can monitor
> > > > > > the specified event [because the counters are busy]". It is not sufficient for
> > > > > > the kernel to verify that at least one counter is available before performing
> > > > > > the check, because certain events may only be usable on a subset of counters
> > > > > > (per riscv,event-to-mhpmcounters), and the kernel does not know that mapping.
> > > > > >
> > > > >
> > > > > Yeah. My suggestion was to fix the perf list issue which is different
> > > > > than the issue reported now.
> > > > >
> > > > > > As a result, checking for event support is only reliable when none of the
> > > > > > counters are in use. So the check can be asynchronous/deferred to later in the
> > > > > > boot process, but I believe it still needs to be performed for all events before
> > > > > > userspace starts using the counters.
> > > > > >
> > > > >
> > > > > We should defer it a work queue for sure. We can also look at improving
> > > > > SBI PMU extension to support bulk matching behavior as well.
> > > > >
> > > > > However, I think a better solution would be to just rely on the json
> > > > > file mappings instead of making SBI calls. We are going to have the
> > > > > event encoding and mappings in the json in the future.
> > > >
> > > > The problem with JSON based event encoding is how to deal in-case
> > > > we are running inside Guest/VM because Host could be anything.
> > > >
> > > > IMO, the JSON based approach is not suitable for SBI PMU. For now,
> > > > we either defer the work using the work queue or keep the approach
> > > > of this patch as-is.
> > > >
> > > > The good part about SBI PMU extension is that we do have a standard
> > > > set of events and we only need a way to discover supported standard
> > > > events with a minimum number of SBI calls. It is better to add a new
> > > > SBI PMU call to assist supported event discovery which will also
> > > > help us virtualize it.
> > > >
> > > > Regards,
> > > > Anup
> > >
> > > +Ian Rogers
> > >
> > > `perf list` will already filter some events depending on whether the
> > > PMU supports them, for example, legacy cache events. I think we can
> > > extend this to json events.
> > >
> >
> > Yes. That's what I was thinking as well. However, that may be a
> > problem in virtualization
> > as Anup pointed out.
> >
> > As per my understanding, cloud providers provide json files for VMs
> > based on the host
> > architecture and allow migration only between hosts with the same
> > family of cpu. In RISC-V, the mapfile.csv works based on 3 registers
> > indicating marchid, mimpid, and mvendorid. Thus, the json file has to
> > be tied with the host machine it is going to be run.
>
> This is also my understanding. That is, that cloud instances typically
> dedicate CPUs to VMs and don't try to present CPU models to VMs which
> don't exactly match the host's CPUs. The remaining concern would be if
> the hypervisor doesn't emulate/passthrough everything the json describes
> for the host CPU type.

So this isn't accurate. For x86 perf uses the CPUID instruction. A
host operating system can change the CPUID for a guest, say pretending
a newer CPU model is actually an older one. This can be done when
migrating VMs as having the CPUID change dynamically in a guest would
be a problem. VM migration like this can have issues and it is fair to
say that it is avoided.

Fwiw, a particular problem we have with x86 guests is the host hiding
CPUID leaves that describe things like the frequency of the CPU. It is
difficult to compute metrics in units of time when you don't know what
frequency cycles relates to.

> However, this is just "typical" clouds. All bets are off for general
> virtualization, as Anup points out.
>
> >
> > We will end up doing the same if we only rely on the json file to
> > filter events in the future. Please let me know if the assumption is
> > incorrect.
> >
> > If we allow a SBI call route to discover which events are supported,
> > the guest can always support legacy events on any host even though it
> > doesn't have a json file.
>
> Yes, I think we need a solution which works even without a json file,
> since a VM may use a special mvendorid,marchid,mimpid triplet to
> describe a more generic CPU type. Unless we also create json files
> for these VCPUs, or provide other event discovery mechanisms, then
> the VMs will not have anything.

I think a set of generic events is a good thing, then the PMU driver
can map perf's legacy events to the generic events in a clean way. I'm
not sure what the issue is with RISC-V guest operating systems. To
lower overhead on x86 pass-through PMUs are being explored, that is
the guest operating system directly programming the CPU's performance
counters to avoid hypervisor traps. For this to work the triplet
mvendorid,marchid,mimpid should match that of the host.

The perf tool supports the notion of a standard set of legacy events
like instructions, cycles and certain cache events. More recently
we've moved to prefer sysfs and json events over these, primarily
because the Apple M1 ARM based Macs nobody was establishing the
mapping from legacy to an actual event. Users were complaining that
'cycles' via a sysfs event worked, but when ARM was made like Intel
and prioritized legacy first, we broke it. Now we prioritize sysfs and
json way for all architectures and hopefully everyone is happy. The
last clean up for this is in:
https://lore.kernel.org/lkml/20240416061533.921723-1-irogers@google.com/

Thanks,
Ian

> Thanks,
> drew
Atish Kumar Patra April 24, 2024, 9:06 p.m. UTC | #9
On Wed, Apr 24, 2024 at 11:27 AM Ian Rogers <irogers@google.com> wrote:
>
> On Wed, Apr 24, 2024 at 6:31 AM Andrew Jones <ajones@ventanamicro.com> wrote:
> >
> > On Tue, Apr 23, 2024 at 05:36:43PM -0700, Atish Kumar Patra wrote:
> > > On Mon, Apr 22, 2024 at 8:44 PM Ian Rogers <irogers@google.com> wrote:
> > > >
> > > > On Mon, Apr 15, 2024 at 9:07 PM Anup Patel <anup@brainfault.org> wrote:
> > > > >
> > > > > On Tue, Apr 16, 2024 at 5:31 AM Atish Patra <atishp@rivosinc.com> wrote:
> > > > > >
> > > > > > On 4/10/24 18:40, Samuel Holland wrote:
> > > > > > > Hi Atish,
> > > > > > >
> > > > > > > On 2024-03-18 2:44 PM, Atish Patra wrote:
> > > > > > >> On Wed, Jan 3, 2024 at 8:54 AM Samuel Holland <samuel.holland@sifive.com> wrote:
> > > > > > >>>
> > > > > > >>> The RISC-V SBI PMU specification defines several standard hardware and
> > > > > > >>> cache events. Currently, all of these events appear in the `perf list`
> > > > > > >>> output, even if they are not actually implemented. Add logic to check
> > > > > > >>> which events are supported by the hardware (i.e. can be mapped to some
> > > > > > >>> counter), so only usable events are reported to userspace.
> > > > > > >>>
> > > > > > >>
> > > > > > >> Thanks for the patch.
> > > > > > >> This adds tons of SBI calls at every boot for a use case which is at
> > > > > > >> best confusing for a subset of users who actually wants to run perf.
> > > > > > >
> > > > > > > I should have been clearer in the patch description. This is not just a cosmetic
> > > > > > > change; because perf sees these as valid events, it tries to use them in
> > > > > > > commands like `perf stat`. When the error from SBI_EXT_PMU_COUNTER_CFG_MATCH
> > > > > > > causes the ->add() callback to fail, this prevents any other events from being
> > > > > > > scheduled on that same CPU (search can_add_hw in kernel/events/core.c). That is
> > > > > > > why the dTLB/iTLB miss counts are missing in the "before" example below.
> > > > > > >
> > > > > >
> > > > > > Thanks for explaining the problem. I can reproduce it in qemu as well if
> > > > > > enough number of invalid events given on the command line and the
> > > > > > workload is short enough.
> > > > > >
> > > > > > >> This probing can be done at runtime by invoking the
> > > > > > >> pmu_sbi_check_event from pmu_sbi_event_map.
> > > > > > >> We can update the event map after that so that it doesn't need to
> > > > > > >> invoke pmu_sbi_check_event next time.
> > > > > > >
> > > > > > > I tried to implement this suggestion, but it did not work. The SBI interface
> > > > > > > does not distinguish between "none of the counters can monitor the specified
> > > > > > > event [because the event is unsupported]" and "none of the counters can monitor
> > > > > > > the specified event [because the counters are busy]". It is not sufficient for
> > > > > > > the kernel to verify that at least one counter is available before performing
> > > > > > > the check, because certain events may only be usable on a subset of counters
> > > > > > > (per riscv,event-to-mhpmcounters), and the kernel does not know that mapping.
> > > > > > >
> > > > > >
> > > > > > Yeah. My suggestion was to fix the perf list issue which is different
> > > > > > than the issue reported now.
> > > > > >
> > > > > > > As a result, checking for event support is only reliable when none of the
> > > > > > > counters are in use. So the check can be asynchronous/deferred to later in the
> > > > > > > boot process, but I believe it still needs to be performed for all events before
> > > > > > > userspace starts using the counters.
> > > > > > >
> > > > > >
> > > > > > We should defer it a work queue for sure. We can also look at improving
> > > > > > SBI PMU extension to support bulk matching behavior as well.
> > > > > >
> > > > > > However, I think a better solution would be to just rely on the json
> > > > > > file mappings instead of making SBI calls. We are going to have the
> > > > > > event encoding and mappings in the json in the future.
> > > > >
> > > > > The problem with JSON based event encoding is how to deal in-case
> > > > > we are running inside Guest/VM because Host could be anything.
> > > > >
> > > > > IMO, the JSON based approach is not suitable for SBI PMU. For now,
> > > > > we either defer the work using the work queue or keep the approach
> > > > > of this patch as-is.
> > > > >
> > > > > The good part about SBI PMU extension is that we do have a standard
> > > > > set of events and we only need a way to discover supported standard
> > > > > events with a minimum number of SBI calls. It is better to add a new
> > > > > SBI PMU call to assist supported event discovery which will also
> > > > > help us virtualize it.
> > > > >
> > > > > Regards,
> > > > > Anup
> > > >
> > > > +Ian Rogers
> > > >
> > > > `perf list` will already filter some events depending on whether the
> > > > PMU supports them, for example, legacy cache events. I think we can
> > > > extend this to json events.
> > > >
> > >
> > > Yes. That's what I was thinking as well. However, that may be a
> > > problem in virtualization
> > > as Anup pointed out.
> > >
> > > As per my understanding, cloud providers provide json files for VMs
> > > based on the host
> > > architecture and allow migration only between hosts with the same
> > > family of cpu. In RISC-V, the mapfile.csv works based on 3 registers
> > > indicating marchid, mimpid, and mvendorid. Thus, the json file has to
> > > be tied with the host machine it is going to be run.
> >
> > This is also my understanding. That is, that cloud instances typically
> > dedicate CPUs to VMs and don't try to present CPU models to VMs which
> > don't exactly match the host's CPUs. The remaining concern would be if
> > the hypervisor doesn't emulate/passthrough everything the json describes
> > for the host CPU type.
>

x86/ARM64 kvm also can filter any event for the guest they want.

> So this isn't accurate. For x86 perf uses the CPUID instruction. A
> host operating system can change the CPUID for a guest, say pretending
> a newer CPU model is actually an older one. This can be done when
> migrating VMs as having the CPUID change dynamically in a guest would
> be a problem. VM migration like this can have issues and it is fair to
> say that it is avoided.
>

I was specifically asking if the json file is updated for a guest when migrated
if the events supported on the destination host are different than the
source host ?

Or The VM migration across different CPUIDs (e.g different family of CPUs)
are avoided completely. It seems the latter from your statement ?


> Fwiw, a particular problem we have with x86 guests is the host hiding
> CPUID leaves that describe things like the frequency of the CPU. It is
> difficult to compute metrics in units of time when you don't know what
> frequency cycles relates to.
>
> > However, this is just "typical" clouds. All bets are off for general
> > virtualization, as Anup points out.
> >
> > >
> > > We will end up doing the same if we only rely on the json file to
> > > filter events in the future. Please let me know if the assumption is
> > > incorrect.
> > >
> > > If we allow a SBI call route to discover which events are supported,
> > > the guest can always support legacy events on any host even though it
> > > doesn't have a json file.
> >
> > Yes, I think we need a solution which works even without a json file,
> > since a VM may use a special mvendorid,marchid,mimpid triplet to
> > describe a more generic CPU type. Unless we also create json files
> > for these VCPUs, or provide other event discovery mechanisms, then
> > the VMs will not have anything.
>

We have to create a json file for VMs anyways for raw events. Having
the discovery through
SBI call allows minimum guarantee for the perf legacy events.

> I think a set of generic events is a good thing, then the PMU driver
> can map perf's legacy events to the generic events in a clean way. I'm
> not sure what the issue is with RISC-V guest operating systems. To

The pertinent question here is how does the guest know the list of
supported generic or perf legacy
events as RISC-V doesn't have any standard event format/encoding
support. There are two approaches

1. Define a new SBI interface which allows the host to let the guest
know which events are supported at one shot.
The perf legacy events mappings are updated at guest boot time via
this interface. Currently, this patch achieves this
by iterating through all the possible legacy events and making an SBI
call one at a time during the boot.

2. Rely on the json file present (if) in the guest. In this case, all
the supported perf legacy events must be present in
the json. In absence of that, the driver assumes it is not supported.


> lower overhead on x86 pass-through PMUs are being explored, that is
> the guest operating system directly programming the CPU's performance
> counters to avoid hypervisor traps. For this to work the triplet
> mvendorid,marchid,mimpid should match that of the host.
>

Yes. I am tracking the pass through series for vPMU in x86.
RISC-V also doesn't have pass through support and implements counter
virtualization similar
to other architectures as well but we have less number of traps
because of the bulk access features.

> The perf tool supports the notion of a standard set of legacy events
> like instructions, cycles and certain cache events. More recently
> we've moved to prefer sysfs and json events over these, primarily
> because the Apple M1 ARM based Macs nobody was establishing the
> mapping from legacy to an actual event. Users were complaining that
> 'cycles' via a sysfs event worked, but when ARM was made like Intel
> and prioritized legacy first, we broke it. Now we prioritize sysfs and
> json way for all architectures and hopefully everyone is happy. The
> last clean up for this is in:
> https://lore.kernel.org/lkml/20240416061533.921723-1-irogers@google.com/
>
> Thanks,
> Ian
>
> > Thanks,
> > drew
Anup Patel April 25, 2024, 5:46 a.m. UTC | #10
On Thu, Apr 25, 2024 at 2:36 AM Atish Kumar Patra <atishp@rivosinc.com> wrote:
>
> On Wed, Apr 24, 2024 at 11:27 AM Ian Rogers <irogers@google.com> wrote:
> >
> > On Wed, Apr 24, 2024 at 6:31 AM Andrew Jones <ajones@ventanamicro.com> wrote:
> > >
> > > On Tue, Apr 23, 2024 at 05:36:43PM -0700, Atish Kumar Patra wrote:
> > > > On Mon, Apr 22, 2024 at 8:44 PM Ian Rogers <irogers@google.com> wrote:
> > > > >
> > > > > On Mon, Apr 15, 2024 at 9:07 PM Anup Patel <anup@brainfault.org> wrote:
> > > > > >
> > > > > > On Tue, Apr 16, 2024 at 5:31 AM Atish Patra <atishp@rivosinc.com> wrote:
> > > > > > >
> > > > > > > On 4/10/24 18:40, Samuel Holland wrote:
> > > > > > > > Hi Atish,
> > > > > > > >
> > > > > > > > On 2024-03-18 2:44 PM, Atish Patra wrote:
> > > > > > > >> On Wed, Jan 3, 2024 at 8:54 AM Samuel Holland <samuel.holland@sifive.com> wrote:
> > > > > > > >>>
> > > > > > > >>> The RISC-V SBI PMU specification defines several standard hardware and
> > > > > > > >>> cache events. Currently, all of these events appear in the `perf list`
> > > > > > > >>> output, even if they are not actually implemented. Add logic to check
> > > > > > > >>> which events are supported by the hardware (i.e. can be mapped to some
> > > > > > > >>> counter), so only usable events are reported to userspace.
> > > > > > > >>>
> > > > > > > >>
> > > > > > > >> Thanks for the patch.
> > > > > > > >> This adds tons of SBI calls at every boot for a use case which is at
> > > > > > > >> best confusing for a subset of users who actually wants to run perf.
> > > > > > > >
> > > > > > > > I should have been clearer in the patch description. This is not just a cosmetic
> > > > > > > > change; because perf sees these as valid events, it tries to use them in
> > > > > > > > commands like `perf stat`. When the error from SBI_EXT_PMU_COUNTER_CFG_MATCH
> > > > > > > > causes the ->add() callback to fail, this prevents any other events from being
> > > > > > > > scheduled on that same CPU (search can_add_hw in kernel/events/core.c). That is
> > > > > > > > why the dTLB/iTLB miss counts are missing in the "before" example below.
> > > > > > > >
> > > > > > >
> > > > > > > Thanks for explaining the problem. I can reproduce it in qemu as well if
> > > > > > > enough number of invalid events given on the command line and the
> > > > > > > workload is short enough.
> > > > > > >
> > > > > > > >> This probing can be done at runtime by invoking the
> > > > > > > >> pmu_sbi_check_event from pmu_sbi_event_map.
> > > > > > > >> We can update the event map after that so that it doesn't need to
> > > > > > > >> invoke pmu_sbi_check_event next time.
> > > > > > > >
> > > > > > > > I tried to implement this suggestion, but it did not work. The SBI interface
> > > > > > > > does not distinguish between "none of the counters can monitor the specified
> > > > > > > > event [because the event is unsupported]" and "none of the counters can monitor
> > > > > > > > the specified event [because the counters are busy]". It is not sufficient for
> > > > > > > > the kernel to verify that at least one counter is available before performing
> > > > > > > > the check, because certain events may only be usable on a subset of counters
> > > > > > > > (per riscv,event-to-mhpmcounters), and the kernel does not know that mapping.
> > > > > > > >
> > > > > > >
> > > > > > > Yeah. My suggestion was to fix the perf list issue which is different
> > > > > > > than the issue reported now.
> > > > > > >
> > > > > > > > As a result, checking for event support is only reliable when none of the
> > > > > > > > counters are in use. So the check can be asynchronous/deferred to later in the
> > > > > > > > boot process, but I believe it still needs to be performed for all events before
> > > > > > > > userspace starts using the counters.
> > > > > > > >
> > > > > > >
> > > > > > > We should defer it a work queue for sure. We can also look at improving
> > > > > > > SBI PMU extension to support bulk matching behavior as well.
> > > > > > >
> > > > > > > However, I think a better solution would be to just rely on the json
> > > > > > > file mappings instead of making SBI calls. We are going to have the
> > > > > > > event encoding and mappings in the json in the future.
> > > > > >
> > > > > > The problem with JSON based event encoding is how to deal in-case
> > > > > > we are running inside Guest/VM because Host could be anything.
> > > > > >
> > > > > > IMO, the JSON based approach is not suitable for SBI PMU. For now,
> > > > > > we either defer the work using the work queue or keep the approach
> > > > > > of this patch as-is.
> > > > > >
> > > > > > The good part about SBI PMU extension is that we do have a standard
> > > > > > set of events and we only need a way to discover supported standard
> > > > > > events with a minimum number of SBI calls. It is better to add a new
> > > > > > SBI PMU call to assist supported event discovery which will also
> > > > > > help us virtualize it.
> > > > > >
> > > > > > Regards,
> > > > > > Anup
> > > > >
> > > > > +Ian Rogers
> > > > >
> > > > > `perf list` will already filter some events depending on whether the
> > > > > PMU supports them, for example, legacy cache events. I think we can
> > > > > extend this to json events.
> > > > >
> > > >
> > > > Yes. That's what I was thinking as well. However, that may be a
> > > > problem in virtualization
> > > > as Anup pointed out.
> > > >
> > > > As per my understanding, cloud providers provide json files for VMs
> > > > based on the host
> > > > architecture and allow migration only between hosts with the same
> > > > family of cpu. In RISC-V, the mapfile.csv works based on 3 registers
> > > > indicating marchid, mimpid, and mvendorid. Thus, the json file has to
> > > > be tied with the host machine it is going to be run.
> > >
> > > This is also my understanding. That is, that cloud instances typically
> > > dedicate CPUs to VMs and don't try to present CPU models to VMs which
> > > don't exactly match the host's CPUs. The remaining concern would be if
> > > the hypervisor doesn't emulate/passthrough everything the json describes
> > > for the host CPU type.
> >
>
> x86/ARM64 kvm also can filter any event for the guest they want.
>
> > So this isn't accurate. For x86 perf uses the CPUID instruction. A
> > host operating system can change the CPUID for a guest, say pretending
> > a newer CPU model is actually an older one. This can be done when
> > migrating VMs as having the CPUID change dynamically in a guest would
> > be a problem. VM migration like this can have issues and it is fair to
> > say that it is avoided.
> >
>
> I was specifically asking if the json file is updated for a guest when migrated
> if the events supported on the destination host are different than the
> source host ?
>
> Or The VM migration across different CPUIDs (e.g different family of CPUs)
> are avoided completely. It seems the latter from your statement ?
>
>
> > Fwiw, a particular problem we have with x86 guests is the host hiding
> > CPUID leaves that describe things like the frequency of the CPU. It is
> > difficult to compute metrics in units of time when you don't know what
> > frequency cycles relates to.
> >
> > > However, this is just "typical" clouds. All bets are off for general
> > > virtualization, as Anup points out.
> > >
> > > >
> > > > We will end up doing the same if we only rely on the json file to
> > > > filter events in the future. Please let me know if the assumption is
> > > > incorrect.
> > > >
> > > > If we allow a SBI call route to discover which events are supported,
> > > > the guest can always support legacy events on any host even though it
> > > > doesn't have a json file.
> > >
> > > Yes, I think we need a solution which works even without a json file,
> > > since a VM may use a special mvendorid,marchid,mimpid triplet to
> > > describe a more generic CPU type. Unless we also create json files
> > > for these VCPUs, or provide other event discovery mechanisms, then
> > > the VMs will not have anything.
> >
>
> We have to create a json file for VMs anyways for raw events. Having
> the discovery through
> SBI call allows minimum guarantee for the perf legacy events.
>
> > I think a set of generic events is a good thing, then the PMU driver
> > can map perf's legacy events to the generic events in a clean way. I'm
> > not sure what the issue is with RISC-V guest operating systems. To
>
> The pertinent question here is how does the guest know the list of
> supported generic or perf legacy
> events as RISC-V doesn't have any standard event format/encoding
> support. There are two approaches
>
> 1. Define a new SBI interface which allows the host to let the guest
> know which events are supported at one shot.
> The perf legacy events mappings are updated at guest boot time via
> this interface. Currently, this patch achieves this
> by iterating through all the possible legacy events and making an SBI
> call one at a time during the boot.
>
> 2. Rely on the json file present (if) in the guest. In this case, all
> the supported perf legacy events must be present in
> the json. In absence of that, the driver assumes it is not supported.

I think we should define the SBI interface anyway and let hypervisors
choose their own approach. In other words, if the json file is not present
for a platform (guest/host) then kernel perf driver can rely on other
means (such as SBI interface).

>
>
> > lower overhead on x86 pass-through PMUs are being explored, that is
> > the guest operating system directly programming the CPU's performance
> > counters to avoid hypervisor traps. For this to work the triplet
> > mvendorid,marchid,mimpid should match that of the host.
> >
>
> Yes. I am tracking the pass through series for vPMU in x86.
> RISC-V also doesn't have pass through support and implements counter
> virtualization similar
> to other architectures as well but we have less number of traps
> because of the bulk access features.
>
> > The perf tool supports the notion of a standard set of legacy events
> > like instructions, cycles and certain cache events. More recently
> > we've moved to prefer sysfs and json events over these, primarily
> > because the Apple M1 ARM based Macs nobody was establishing the
> > mapping from legacy to an actual event. Users were complaining that
> > 'cycles' via a sysfs event worked, but when ARM was made like Intel
> > and prioritized legacy first, we broke it. Now we prioritize sysfs and
> > json way for all architectures and hopefully everyone is happy. The
> > last clean up for this is in:
> > https://lore.kernel.org/lkml/20240416061533.921723-1-irogers@google.com/
> >
> > Thanks,
> > Ian
> >
> > > Thanks,
> > > drew

Regards,
Anup
diff mbox series

Patch

diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index 16acd4dcdb96..b58a70ee8317 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -86,7 +86,7 @@  struct sbi_pmu_event_data {
 	};
 };
 
-static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
+static struct sbi_pmu_event_data pmu_hw_event_map[] = {
 	[PERF_COUNT_HW_CPU_CYCLES]		= {.hw_gen_event = {
 							SBI_PMU_HW_CPU_CYCLES,
 							SBI_PMU_EVENT_TYPE_HW, 0}},
@@ -120,7 +120,7 @@  static const struct sbi_pmu_event_data pmu_hw_event_map[] = {
 };
 
 #define C(x) PERF_COUNT_HW_CACHE_##x
-static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
+static struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
 [PERF_COUNT_HW_CACHE_OP_MAX]
 [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
 	[C(L1D)] = {
@@ -265,6 +265,36 @@  static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_M
 	},
 };
 
+static void pmu_sbi_check_event(struct sbi_pmu_event_data *edata)
+{
+	struct sbiret ret;
+
+	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH,
+			0, cmask, 0, edata->event_idx, 0, 0);
+	if (!ret.error) {
+		sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
+			  ret.value, 0x1, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
+	} else if (ret.error == SBI_ERR_NOT_SUPPORTED) {
+		/* This event cannot be monitored by any counter */
+		edata->event_idx = -EINVAL;
+	}
+}
+
+static void pmu_sbi_update_events(void)
+{
+	/* Ensure events are not already mapped to a counter */
+	sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP,
+		  0, cmask, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0);
+
+	for (int i = 0; i < ARRAY_SIZE(pmu_hw_event_map); i++)
+		pmu_sbi_check_event(&pmu_hw_event_map[i]);
+
+	for (int i = 0; i < ARRAY_SIZE(pmu_cache_event_map); i++)
+		for (int j = 0; j < ARRAY_SIZE(pmu_cache_event_map[i]); j++)
+			for (int k = 0; k < ARRAY_SIZE(pmu_cache_event_map[i][j]); k++)
+				pmu_sbi_check_event(&pmu_cache_event_map[i][j][k]);
+}
+
 static int pmu_sbi_ctr_get_width(int idx)
 {
 	return pmu_ctr_list[idx].width;
@@ -1046,6 +1076,9 @@  static int pmu_sbi_device_probe(struct platform_device *pdev)
 	if (pmu_sbi_get_ctrinfo(num_counters, &cmask))
 		goto out_free;
 
+	/* Check which standard events are available */
+	pmu_sbi_update_events();
+
 	ret = pmu_sbi_setup_irqs(pmu, pdev);
 	if (ret < 0) {
 		pr_info("Perf sampling/filtering is not supported as sscof extension is not available\n");