diff mbox series

[1/2] iommu/riscv: add RISC-V IOMMU PMU support

Message ID 20250115030306.29735-2-zong.li@sifive.com (mailing list archive)
State New
Headers show
Series RISC-V IOMMU HPM support | expand

Checks

Context Check Description
conchuod/vmtest-for-next-PR fail PR summary
conchuod/patch-1-test-1 success .github/scripts/patches/tests/build_rv32_defconfig.sh took 103.53s
conchuod/patch-1-test-2 fail .github/scripts/patches/tests/build_rv64_clang_allmodconfig.sh took 1037.17s
conchuod/patch-1-test-3 success .github/scripts/patches/tests/build_rv64_gcc_allmodconfig.sh took 1225.89s
conchuod/patch-1-test-4 success .github/scripts/patches/tests/build_rv64_nommu_k210_defconfig.sh took 16.62s
conchuod/patch-1-test-5 success .github/scripts/patches/tests/build_rv64_nommu_virt_defconfig.sh took 18.25s
conchuod/patch-1-test-6 warning .github/scripts/patches/tests/checkpatch.sh took 1.42s
conchuod/patch-1-test-7 success .github/scripts/patches/tests/dtb_warn_rv64.sh took 36.61s
conchuod/patch-1-test-8 success .github/scripts/patches/tests/header_inline.sh took 0.00s
conchuod/patch-1-test-9 success .github/scripts/patches/tests/kdoc.sh took 0.51s
conchuod/patch-1-test-10 success .github/scripts/patches/tests/module_param.sh took 0.01s
conchuod/patch-1-test-11 success .github/scripts/patches/tests/verify_fixes.sh took 0.00s
conchuod/patch-1-test-12 success .github/scripts/patches/tests/verify_signedoff.sh took 0.02s

Commit Message

Zong Li Jan. 15, 2025, 3:03 a.m. UTC
Support for the RISC-V IOMMU hardware performance monitor includes
both counting and sampling modes.

The specification does not define an event ID for counting the
number of clock cycles, meaning there is no `iohpmevt0` associated
with `iohpmcycles`. However, we still need an event ID to select cycle
counting, so we reserve the maximum event ID for this purpose.

Signed-off-by: Zong Li <zong.li@sifive.com>
Tested-by: Xu Lu <luxu.kernel@bytedance.com>
---
 drivers/iommu/riscv/Makefile     |   2 +-
 drivers/iommu/riscv/iommu-bits.h |  16 +
 drivers/iommu/riscv/iommu-pmu.c  | 486 +++++++++++++++++++++++++++++++
 drivers/iommu/riscv/iommu.h      |   8 +
 4 files changed, 511 insertions(+), 1 deletion(-)
 create mode 100644 drivers/iommu/riscv/iommu-pmu.c

Comments

Xu Lu Jan. 15, 2025, 3:45 a.m. UTC | #1
Hi Zong,

On Wed, Jan 15, 2025 at 11:03 AM Zong Li <zong.li@sifive.com> wrote:
>
> Support for the RISC-V IOMMU hardware performance monitor includes
> both counting and sampling modes.
>
> The specification does not define an event ID for counting the
> number of clock cycles, meaning there is no associated `iohpmevt0`.
> However, we need an event for counting cycle, so we reserve the
> maximum event ID for this purpose.
>
> Signed-off-by: Zong Li <zong.li@sifive.com>
> Tested-by: Xu Lu <luxu.kernel@bytedance.com>
> ---
>  drivers/iommu/riscv/Makefile     |   2 +-
>  drivers/iommu/riscv/iommu-bits.h |  16 +
>  drivers/iommu/riscv/iommu-pmu.c  | 486 +++++++++++++++++++++++++++++++
>  drivers/iommu/riscv/iommu.h      |   8 +
>  4 files changed, 511 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/iommu/riscv/iommu-pmu.c
>
> diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
> index f54c9ed17d41..d36625a1fd08 100644
> --- a/drivers/iommu/riscv/Makefile
> +++ b/drivers/iommu/riscv/Makefile
> @@ -1,3 +1,3 @@
>  # SPDX-License-Identifier: GPL-2.0-only
> -obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o
> +obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o iommu-pmu.o
>  obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o
> diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
> index 98daf0e1a306..60523449f016 100644
> --- a/drivers/iommu/riscv/iommu-bits.h
> +++ b/drivers/iommu/riscv/iommu-bits.h
> @@ -17,6 +17,7 @@
>  #include <linux/types.h>
>  #include <linux/bitfield.h>
>  #include <linux/bits.h>
> +#include <linux/perf_event.h>
>
>  /*
>   * Chapter 5: Memory Mapped register interface
> @@ -207,6 +208,7 @@ enum riscv_iommu_ddtp_modes {
>  /* 5.22 Performance monitoring event counters (31 * 64bits) */
>  #define RISCV_IOMMU_REG_IOHPMCTR_BASE  0x0068
>  #define RISCV_IOMMU_REG_IOHPMCTR(_n)   (RISCV_IOMMU_REG_IOHPMCTR_BASE + ((_n) * 0x8))
> +#define RISCV_IOMMU_IOHPMCTR_COUNTER   GENMASK_ULL(63, 0)
>
>  /* 5.23 Performance monitoring event selectors (31 * 64bits) */
>  #define RISCV_IOMMU_REG_IOHPMEVT_BASE  0x0160
> @@ -250,6 +252,20 @@ enum riscv_iommu_hpmevent_id {
>         RISCV_IOMMU_HPMEVENT_MAX        = 9
>  };
>
> +/* Use maximum event ID for cycle event */
> +#define RISCV_IOMMU_HPMEVENT_CYCLE     GENMASK_ULL(14, 0)
> +
> +#define RISCV_IOMMU_HPM_COUNTER_NUM    32
> +
> +struct riscv_iommu_pmu {
> +       struct pmu pmu;
> +       void __iomem *reg;
> +       int num_counters;
> +       u64 mask_counter;
> +       struct perf_event *events[RISCV_IOMMU_IOHPMEVT_CNT + 1];
> +       DECLARE_BITMAP(used_counters, RISCV_IOMMU_IOHPMEVT_CNT + 1);
> +};
> +
>  /* 5.24 Translation request IOVA (64bits) */
>  #define RISCV_IOMMU_REG_TR_REQ_IOVA     0x0258
>  #define RISCV_IOMMU_TR_REQ_IOVA_VPN    GENMASK_ULL(63, 12)
> diff --git a/drivers/iommu/riscv/iommu-pmu.c b/drivers/iommu/riscv/iommu-pmu.c
> new file mode 100644
> index 000000000000..74eb1525cd32
> --- /dev/null
> +++ b/drivers/iommu/riscv/iommu-pmu.c
> @@ -0,0 +1,486 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2024 SiFive
> + *
> + * Authors
> + *     Zong Li <zong.li@sifive.com>
> + */
> +
> +#include <linux/io-64-nonatomic-hi-lo.h>
> +
> +#include "iommu.h"
> +#include "iommu-bits.h"
> +
> +#define to_riscv_iommu_pmu(p) (container_of(p, struct riscv_iommu_pmu, pmu))
> +
> +#define RISCV_IOMMU_PMU_ATTR_EXTRACTOR(_name, _mask)                   \
> +       static inline u32 get_##_name(struct perf_event *event)         \
> +       {                                                               \
> +               return FIELD_GET(_mask, event->attr.config);            \
> +       }                                                               \
> +
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(event, RISCV_IOMMU_IOHPMEVT_EVENTID);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(partial_matching, RISCV_IOMMU_IOHPMEVT_DMASK);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(pid_pscid, RISCV_IOMMU_IOHPMEVT_PID_PSCID);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(did_gscid, RISCV_IOMMU_IOHPMEVT_DID_GSCID);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_pid_pscid, RISCV_IOMMU_IOHPMEVT_PV_PSCV);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_did_gscid, RISCV_IOMMU_IOHPMEVT_DV_GSCV);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_id_type, RISCV_IOMMU_IOHPMEVT_IDT);
> +
> +/* Formats */
> +PMU_FORMAT_ATTR(event, "config:0-14");
> +PMU_FORMAT_ATTR(partial_matching, "config:15");
> +PMU_FORMAT_ATTR(pid_pscid, "config:16-35");
> +PMU_FORMAT_ATTR(did_gscid, "config:36-59");
> +PMU_FORMAT_ATTR(filter_pid_pscid, "config:60");
> +PMU_FORMAT_ATTR(filter_did_gscid, "config:61");
> +PMU_FORMAT_ATTR(filter_id_type, "config:62");
> +
> +static struct attribute *riscv_iommu_pmu_formats[] = {
> +       &format_attr_event.attr,
> +       &format_attr_partial_matching.attr,
> +       &format_attr_pid_pscid.attr,
> +       &format_attr_did_gscid.attr,
> +       &format_attr_filter_pid_pscid.attr,
> +       &format_attr_filter_did_gscid.attr,
> +       &format_attr_filter_id_type.attr,
> +       NULL,
> +};
> +
> +static const struct attribute_group riscv_iommu_pmu_format_group = {
> +       .name = "format",
> +       .attrs = riscv_iommu_pmu_formats,
> +};
> +
> +/* Events */
> +static ssize_t riscv_iommu_pmu_event_show(struct device *dev,
> +                                         struct device_attribute *attr,
> +                                         char *page)
> +{
> +       struct perf_pmu_events_attr *pmu_attr;
> +
> +       pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
> +
> +       return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
> +}
> +
> +PMU_EVENT_ATTR(cycle, event_attr_cycle,
> +              RISCV_IOMMU_HPMEVENT_CYCLE, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(dont_count, event_attr_dont_count,
> +              RISCV_IOMMU_HPMEVENT_INVALID, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(untranslated_req, event_attr_untranslated_req,
> +              RISCV_IOMMU_HPMEVENT_URQ, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(translated_req, event_attr_translated_req,
> +              RISCV_IOMMU_HPMEVENT_TRQ, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(ats_trans_req, event_attr_ats_trans_req,
> +              RISCV_IOMMU_HPMEVENT_ATS_RQ, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(tlb_miss, event_attr_tlb_miss,
> +              RISCV_IOMMU_HPMEVENT_TLB_MISS, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(ddt_walks, event_attr_ddt_walks,
> +              RISCV_IOMMU_HPMEVENT_DD_WALK, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(pdt_walks, event_attr_pdt_walks,
> +              RISCV_IOMMU_HPMEVENT_PD_WALK, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(s_vs_pt_walks, event_attr_s_vs_pt_walks,
> +              RISCV_IOMMU_HPMEVENT_S_VS_WALKS, riscv_iommu_pmu_event_show);
> +PMU_EVENT_ATTR(g_pt_walks, event_attr_g_pt_walks,
> +              RISCV_IOMMU_HPMEVENT_G_WALKS, riscv_iommu_pmu_event_show);
> +
> +static struct attribute *riscv_iommu_pmu_events[] = {
> +       &event_attr_cycle.attr.attr,
> +       &event_attr_dont_count.attr.attr,
> +       &event_attr_untranslated_req.attr.attr,
> +       &event_attr_translated_req.attr.attr,
> +       &event_attr_ats_trans_req.attr.attr,
> +       &event_attr_tlb_miss.attr.attr,
> +       &event_attr_ddt_walks.attr.attr,
> +       &event_attr_pdt_walks.attr.attr,
> +       &event_attr_s_vs_pt_walks.attr.attr,
> +       &event_attr_g_pt_walks.attr.attr,
> +       NULL,
> +};
> +
> +static const struct attribute_group riscv_iommu_pmu_events_group = {
> +       .name = "events",
> +       .attrs = riscv_iommu_pmu_events,
> +};
> +
> +static const struct attribute_group *riscv_iommu_pmu_attr_grps[] = {
> +       &riscv_iommu_pmu_format_group,
> +       &riscv_iommu_pmu_events_group,
> +       NULL,
> +};
> +
> +/* PMU Operations */
> +static void riscv_iommu_pmu_set_counter(struct riscv_iommu_pmu *pmu, u32 idx,
> +                                       u64 value)
> +{
> +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
> +
> +       if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> +               return;
> +
> +       if (idx == 0)
> +               value = (value & ~RISCV_IOMMU_IOHPMCYCLES_OF) |
> +                        (readq(addr) & RISCV_IOMMU_IOHPMCYCLES_OF);
> +
> +       writeq(FIELD_PREP(RISCV_IOMMU_IOHPMCTR_COUNTER, value), addr + idx * 8);
> +}
> +
> +static u64 riscv_iommu_pmu_get_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
> +       u64 value;
> +
> +       if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> +               return -EINVAL;
> +
> +       value = readq(addr + idx * 8);
> +
> +       if (idx == 0)
> +               return FIELD_GET(RISCV_IOMMU_IOHPMCYCLES_COUNTER, value);
> +
> +       return FIELD_GET(RISCV_IOMMU_IOHPMCTR_COUNTER, value);
> +}
> +
> +static u64 riscv_iommu_pmu_get_event(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE;
> +
> +       if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> +               return 0;
> +
> +       /* There is no associtated IOHPMEVT0 for IOHPMCYCLES */
> +       if (idx == 0)
> +               return 0;
> +
> +       return readq(addr + (idx - 1) * 8);
> +}
> +
> +static void riscv_iommu_pmu_set_event(struct riscv_iommu_pmu *pmu, u32 idx,
> +                                     u64 value)
> +{
> +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE;
> +
> +       if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> +               return;
> +
> +       /* There is no associtated IOHPMEVT0 for IOHPMCYCLES */
> +       if (idx == 0)
> +               return;
> +
> +       writeq(value, addr + (idx - 1) * 8);
> +}
> +
> +static void riscv_iommu_pmu_enable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> +       u32 value = readl(addr);
> +
> +       writel(value & ~BIT(idx), addr);
> +}
> +
> +static void riscv_iommu_pmu_disable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> +       u32 value = readl(addr);
> +
> +       writel(value | BIT(idx), addr);
> +}
> +
> +static void riscv_iommu_pmu_enable_ovf_intr(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +       u64 value;
> +
> +       if (get_event(pmu->events[idx]) == RISCV_IOMMU_HPMEVENT_CYCLE) {
> +               value = riscv_iommu_pmu_get_counter(pmu, idx) & ~RISCV_IOMMU_IOHPMCYCLES_OF;
> +               writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES);
> +       } else {
> +               value = riscv_iommu_pmu_get_event(pmu, idx) & ~RISCV_IOMMU_IOHPMEVT_OF;
> +               writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE + (idx - 1) * 8);
> +       }
> +}
> +
> +static void riscv_iommu_pmu_disable_ovf_intr(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +       u64 value;
> +
> +       if (get_event(pmu->events[idx]) == RISCV_IOMMU_HPMEVENT_CYCLE) {
> +               value = riscv_iommu_pmu_get_counter(pmu, idx) | RISCV_IOMMU_IOHPMCYCLES_OF;
> +               writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES);
> +       } else {
> +               value = riscv_iommu_pmu_get_event(pmu, idx) | RISCV_IOMMU_IOHPMEVT_OF;
> +               writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE + (idx - 1) * 8);
> +       }
> +}
> +
> +static void riscv_iommu_pmu_start_all(struct riscv_iommu_pmu *pmu)
> +{
> +       int idx;
> +
> +       for_each_set_bit(idx, pmu->used_counters, pmu->num_counters) {
> +               riscv_iommu_pmu_enable_ovf_intr(pmu, idx);
> +               riscv_iommu_pmu_enable_counter(pmu, idx);
> +       }
> +}
> +
> +static void riscv_iommu_pmu_stop_all(struct riscv_iommu_pmu *pmu)
> +{
> +       writel(GENMASK_ULL(pmu->num_counters - 1, 0),
> +              pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH);
> +}
> +
> +/* PMU APIs */
> +static int riscv_iommu_pmu_set_period(struct perf_event *event)
> +{
> +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +       struct hw_perf_event *hwc = &event->hw;
> +       s64 left = local64_read(&hwc->period_left);
> +       s64 period = hwc->sample_period;
> +       u64 max_period = pmu->mask_counter;
> +       int ret = 0;
> +
> +       if (unlikely(left <= -period)) {
> +               left = period;
> +               local64_set(&hwc->period_left, left);
> +               hwc->last_period = period;
> +               ret = 1;
> +       }
> +
> +       if (unlikely(left <= 0)) {
> +               left += period;
> +               local64_set(&hwc->period_left, left);
> +               hwc->last_period = period;
> +               ret = 1;
> +       }
> +
> +       /*
> +        * Limit the maximum period to prevent the counter value
> +        * from overtaking the one we are about to program. In
> +        * effect we are reducing max_period to account for
> +        * interrupt latency (and we are being very conservative).
> +        */
> +       if (left > (max_period >> 1))
> +               left = (max_period >> 1);
> +
> +       local64_set(&hwc->prev_count, (u64)-left);
> +       riscv_iommu_pmu_set_counter(pmu, hwc->idx, (u64)(-left) & max_period);
> +       perf_event_update_userpage(event);
> +
> +       return ret;
> +}
> +
> +static int riscv_iommu_pmu_event_init(struct perf_event *event)
> +{
> +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +       struct hw_perf_event *hwc = &event->hw;
> +
> +       hwc->idx = -1;
> +       hwc->config = event->attr.config;
> +
> +       if (!is_sampling_event(event)) {
> +               /*
> +                * For non-sampling runs, limit the sample_period to half
> +                * of the counter width. That way, the new counter value
> +                * is far less likely to overtake the previous one unless
> +                * you have some serious IRQ latency issues.
> +                */
> +               hwc->sample_period = pmu->mask_counter >> 1;
> +               hwc->last_period = hwc->sample_period;
> +               local64_set(&hwc->period_left, hwc->sample_period);
> +       }
> +
> +       return 0;
> +}
> +
> +static void riscv_iommu_pmu_update(struct perf_event *event)
> +{
> +       struct hw_perf_event *hwc = &event->hw;
> +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +       u64 delta, prev, now;
> +       u32 idx = hwc->idx;
> +
> +       do {
> +               prev = local64_read(&hwc->prev_count);
> +               now = riscv_iommu_pmu_get_counter(pmu, idx);
> +       } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
> +
> +       delta = FIELD_GET(RISCV_IOMMU_IOHPMCTR_COUNTER, now - prev) & pmu->mask_counter;
> +       local64_add(delta, &event->count);
> +       local64_sub(delta, &hwc->period_left);
> +}
> +
> +static void riscv_iommu_pmu_start(struct perf_event *event, int flags)
> +{
> +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +       struct hw_perf_event *hwc = &event->hw;
> +
> +       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> +               return;
> +
> +       if (flags & PERF_EF_RELOAD)
> +               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
> +
> +       hwc->state = 0;
> +       riscv_iommu_pmu_set_period(event);
> +       riscv_iommu_pmu_set_event(pmu, hwc->idx, hwc->config);
> +       riscv_iommu_pmu_enable_ovf_intr(pmu, hwc->idx);
> +       riscv_iommu_pmu_enable_counter(pmu, hwc->idx);
> +
> +       perf_event_update_userpage(event);
> +}
> +
> +static void riscv_iommu_pmu_stop(struct perf_event *event, int flags)
> +{
> +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +       struct hw_perf_event *hwc = &event->hw;
> +
> +       if (hwc->state & PERF_HES_STOPPED)
> +               return;
> +
> +       riscv_iommu_pmu_set_event(pmu, hwc->idx, RISCV_IOMMU_HPMEVENT_INVALID);
> +       riscv_iommu_pmu_disable_counter(pmu, hwc->idx);
> +
> +       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE))
> +               riscv_iommu_pmu_update(event);
> +
> +       hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +}
> +
> +static int riscv_iommu_pmu_add(struct perf_event *event, int flags)
> +{
> +       struct hw_perf_event *hwc = &event->hw;
> +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +       unsigned int num_counters = pmu->num_counters;
> +       int idx;
> +
> +       /* Reserve index zero for iohpmcycles */
> +       if (get_event(event) == RISCV_IOMMU_HPMEVENT_CYCLE)
> +               idx = 0;
> +       else
> +               idx = find_next_zero_bit(pmu->used_counters, num_counters, 1);
> +
> +       if (idx == num_counters)
> +               return -EAGAIN;
> +
> +       set_bit(idx, pmu->used_counters);
> +
> +       pmu->events[idx] = event;
> +       hwc->idx = idx;
> +       hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +
> +       if (flags & PERF_EF_START)
> +               riscv_iommu_pmu_start(event, flags);
> +
> +       /* Propagate changes to the userspace mapping. */
> +       perf_event_update_userpage(event);
> +
> +       return 0;
> +}
> +
> +static void riscv_iommu_pmu_read(struct perf_event *event)
> +{
> +       riscv_iommu_pmu_update(event);
> +}
> +
> +static void riscv_iommu_pmu_del(struct perf_event *event, int flags)
> +{
> +       struct hw_perf_event *hwc = &event->hw;
> +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +       int idx = hwc->idx;
> +
> +       riscv_iommu_pmu_stop(event, PERF_EF_UPDATE);
> +       pmu->events[idx] = NULL;
> +       clear_bit(idx, pmu->used_counters);
> +       perf_event_update_userpage(event);
> +}
> +
> +irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu)
> +{
> +       struct perf_sample_data data;
> +       struct pt_regs *regs;
> +       u32 ovf = readl(pmu->reg + RISCV_IOMMU_REG_IOCOUNTOVF);
> +       int idx;
> +
> +       if (!ovf)
> +               return IRQ_NONE;
> +
> +       riscv_iommu_pmu_stop_all(pmu);
> +
> +       regs = get_irq_regs();
> +
> +       for_each_set_bit(idx, (unsigned long *)&ovf, pmu->num_counters) {
> +               struct perf_event *event = pmu->events[idx];
> +               struct hw_perf_event *hwc;
> +
> +               if (WARN_ON_ONCE(!event) || !is_sampling_event(event))
> +                       continue;
> +
> +               hwc = &event->hw;
> +
> +               riscv_iommu_pmu_update(event);
> +               perf_sample_data_init(&data, 0, hwc->last_period);
> +               if (!riscv_iommu_pmu_set_period(event))
> +                       continue;
> +
> +               if (perf_event_overflow(event, &data, regs))
> +                       riscv_iommu_pmu_stop(event, 0);
> +       }
> +
> +       riscv_iommu_pmu_start_all(pmu);
> +
> +       return IRQ_HANDLED;
> +}
> +
> +int riscv_iommu_pmu_init(struct riscv_iommu_pmu *pmu, void __iomem *reg,
> +                        const char *dev_name)
> +{
> +       char *name;
> +       int ret;
> +
> +       pmu->reg = reg;
> +       pmu->num_counters = RISCV_IOMMU_HPM_COUNTER_NUM;
> +       pmu->mask_counter = RISCV_IOMMU_IOHPMCTR_COUNTER;
> +
> +       pmu->pmu = (struct pmu) {
> +               .task_ctx_nr    = perf_invalid_context,
> +               .event_init     = riscv_iommu_pmu_event_init,
> +               .add            = riscv_iommu_pmu_add,
> +               .del            = riscv_iommu_pmu_del,
> +               .start          = riscv_iommu_pmu_start,
> +               .stop           = riscv_iommu_pmu_stop,
> +               .read           = riscv_iommu_pmu_read,
> +               .attr_groups    = riscv_iommu_pmu_attr_grps,
> +               .capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
> +               .module         = THIS_MODULE,
> +       };
> +
> +       name = kasprintf(GFP_KERNEL, "riscv_iommu_pmu_%s", dev_name);

The dev_name of the RISC-V IOMMU is usually 'riscv,iommu'. If we compose
the IOMMU PMU name from the IOMMU device name, the perf subsystem may
not handle the PMU event name correctly because of the ',' in it.

Best Regards,

Xu Lu

> +
> +       ret = perf_pmu_register(&pmu->pmu, name, -1);
> +       if (ret) {
> +               pr_err("Failed to register riscv_iommu_pmu_%s: %d\n",
> +                      dev_name, ret);
> +               return ret;
> +       }
> +
> +       /* Stop all counters and later start the counter with perf */
> +       riscv_iommu_pmu_stop_all(pmu);
> +
> +       pr_info("riscv_iommu_pmu_%s: Registered with %d counters\n",
> +               dev_name, pmu->num_counters);
> +
> +       return 0;
> +}
> +
> +void riscv_iommu_pmu_uninit(struct riscv_iommu_pmu *pmu)
> +{
> +       int idx;
> +
> +       /* Disable interrupt and functions */
> +       for_each_set_bit(idx, pmu->used_counters, pmu->num_counters) {
> +               riscv_iommu_pmu_disable_counter(pmu, idx);
> +               riscv_iommu_pmu_disable_ovf_intr(pmu, idx);
> +       }
> +
> +       perf_pmu_unregister(&pmu->pmu);
> +}
> diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> index b1c4664542b4..92659a8a75ae 100644
> --- a/drivers/iommu/riscv/iommu.h
> +++ b/drivers/iommu/riscv/iommu.h
> @@ -60,11 +60,19 @@ struct riscv_iommu_device {
>         unsigned int ddt_mode;
>         dma_addr_t ddt_phys;
>         u64 *ddt_root;
> +
> +       /* hardware performance monitor */
> +       struct riscv_iommu_pmu pmu;
>  };
>
>  int riscv_iommu_init(struct riscv_iommu_device *iommu);
>  void riscv_iommu_remove(struct riscv_iommu_device *iommu);
>
> +int riscv_iommu_pmu_init(struct riscv_iommu_pmu *pmu, void __iomem *reg,
> +                        const char *name);
> +void riscv_iommu_pmu_uninit(struct riscv_iommu_pmu *pmu);
> +irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu);
> +
>  #define riscv_iommu_readl(iommu, addr) \
>         readl_relaxed((iommu)->reg + (addr))
>
> --
> 2.17.1
>
Zong Li Jan. 15, 2025, 7:48 a.m. UTC | #2
On Wed, Jan 15, 2025 at 11:45 AM Xu Lu <luxu.kernel@bytedance.com> wrote:
>
> Hi Zong,
>
> On Wed, Jan 15, 2025 at 11:03 AM Zong Li <zong.li@sifive.com> wrote:
> >
> > Support for the RISC-V IOMMU hardware performance monitor includes
> > both counting and sampling modes.
> >
> > The specification does not define an event ID for counting the
> > number of clock cycles, meaning there is no associated `iohpmevt0`.
> > However, we need an event for counting cycle, so we reserve the
> > maximum event ID for this purpose.
> >
> > Signed-off-by: Zong Li <zong.li@sifive.com>
> > Tested-by: Xu Lu <luxu.kernel@bytedance.com>
> > ---
> >  drivers/iommu/riscv/Makefile     |   2 +-
> >  drivers/iommu/riscv/iommu-bits.h |  16 +
> >  drivers/iommu/riscv/iommu-pmu.c  | 486 +++++++++++++++++++++++++++++++
> >  drivers/iommu/riscv/iommu.h      |   8 +
> >  4 files changed, 511 insertions(+), 1 deletion(-)
> >  create mode 100644 drivers/iommu/riscv/iommu-pmu.c
> >
> > diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
> > index f54c9ed17d41..d36625a1fd08 100644
> > --- a/drivers/iommu/riscv/Makefile
> > +++ b/drivers/iommu/riscv/Makefile
> > @@ -1,3 +1,3 @@
> >  # SPDX-License-Identifier: GPL-2.0-only
> > -obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o
> > +obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o iommu-pmu.o
> >  obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o
> > diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
> > index 98daf0e1a306..60523449f016 100644
> > --- a/drivers/iommu/riscv/iommu-bits.h
> > +++ b/drivers/iommu/riscv/iommu-bits.h
> > @@ -17,6 +17,7 @@
> >  #include <linux/types.h>
> >  #include <linux/bitfield.h>
> >  #include <linux/bits.h>
> > +#include <linux/perf_event.h>
> >
> >  /*
> >   * Chapter 5: Memory Mapped register interface
> > @@ -207,6 +208,7 @@ enum riscv_iommu_ddtp_modes {
> >  /* 5.22 Performance monitoring event counters (31 * 64bits) */
> >  #define RISCV_IOMMU_REG_IOHPMCTR_BASE  0x0068
> >  #define RISCV_IOMMU_REG_IOHPMCTR(_n)   (RISCV_IOMMU_REG_IOHPMCTR_BASE + ((_n) * 0x8))
> > +#define RISCV_IOMMU_IOHPMCTR_COUNTER   GENMASK_ULL(63, 0)
> >
> >  /* 5.23 Performance monitoring event selectors (31 * 64bits) */
> >  #define RISCV_IOMMU_REG_IOHPMEVT_BASE  0x0160
> > @@ -250,6 +252,20 @@ enum riscv_iommu_hpmevent_id {
> >         RISCV_IOMMU_HPMEVENT_MAX        = 9
> >  };
> >
> > +/* Use maximum event ID for cycle event */
> > +#define RISCV_IOMMU_HPMEVENT_CYCLE     GENMASK_ULL(14, 0)
> > +
> > +#define RISCV_IOMMU_HPM_COUNTER_NUM    32
> > +
> > +struct riscv_iommu_pmu {
> > +       struct pmu pmu;
> > +       void __iomem *reg;
> > +       int num_counters;
> > +       u64 mask_counter;
> > +       struct perf_event *events[RISCV_IOMMU_IOHPMEVT_CNT + 1];
> > +       DECLARE_BITMAP(used_counters, RISCV_IOMMU_IOHPMEVT_CNT + 1);
> > +};
> > +
> >  /* 5.24 Translation request IOVA (64bits) */
> >  #define RISCV_IOMMU_REG_TR_REQ_IOVA     0x0258
> >  #define RISCV_IOMMU_TR_REQ_IOVA_VPN    GENMASK_ULL(63, 12)
> > diff --git a/drivers/iommu/riscv/iommu-pmu.c b/drivers/iommu/riscv/iommu-pmu.c
> > new file mode 100644
> > index 000000000000..74eb1525cd32
> > --- /dev/null
> > +++ b/drivers/iommu/riscv/iommu-pmu.c
> > @@ -0,0 +1,486 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright (C) 2024 SiFive
> > + *
> > + * Authors
> > + *     Zong Li <zong.li@sifive.com>
> > + */
> > +
> > +#include <linux/io-64-nonatomic-hi-lo.h>
> > +
> > +#include "iommu.h"
> > +#include "iommu-bits.h"
> > +
> > +#define to_riscv_iommu_pmu(p) (container_of(p, struct riscv_iommu_pmu, pmu))
> > +
> > +#define RISCV_IOMMU_PMU_ATTR_EXTRACTOR(_name, _mask)                   \
> > +       static inline u32 get_##_name(struct perf_event *event)         \
> > +       {                                                               \
> > +               return FIELD_GET(_mask, event->attr.config);            \
> > +       }                                                               \
> > +
> > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(event, RISCV_IOMMU_IOHPMEVT_EVENTID);
> > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(partial_matching, RISCV_IOMMU_IOHPMEVT_DMASK);
> > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(pid_pscid, RISCV_IOMMU_IOHPMEVT_PID_PSCID);
> > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(did_gscid, RISCV_IOMMU_IOHPMEVT_DID_GSCID);
> > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_pid_pscid, RISCV_IOMMU_IOHPMEVT_PV_PSCV);
> > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_did_gscid, RISCV_IOMMU_IOHPMEVT_DV_GSCV);
> > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_id_type, RISCV_IOMMU_IOHPMEVT_IDT);
> > +
> > +/* Formats */
> > +PMU_FORMAT_ATTR(event, "config:0-14");
> > +PMU_FORMAT_ATTR(partial_matching, "config:15");
> > +PMU_FORMAT_ATTR(pid_pscid, "config:16-35");
> > +PMU_FORMAT_ATTR(did_gscid, "config:36-59");
> > +PMU_FORMAT_ATTR(filter_pid_pscid, "config:60");
> > +PMU_FORMAT_ATTR(filter_did_gscid, "config:61");
> > +PMU_FORMAT_ATTR(filter_id_type, "config:62");
> > +
> > +static struct attribute *riscv_iommu_pmu_formats[] = {
> > +       &format_attr_event.attr,
> > +       &format_attr_partial_matching.attr,
> > +       &format_attr_pid_pscid.attr,
> > +       &format_attr_did_gscid.attr,
> > +       &format_attr_filter_pid_pscid.attr,
> > +       &format_attr_filter_did_gscid.attr,
> > +       &format_attr_filter_id_type.attr,
> > +       NULL,
> > +};
> > +
> > +static const struct attribute_group riscv_iommu_pmu_format_group = {
> > +       .name = "format",
> > +       .attrs = riscv_iommu_pmu_formats,
> > +};
> > +
> > +/* Events */
> > +static ssize_t riscv_iommu_pmu_event_show(struct device *dev,
> > +                                         struct device_attribute *attr,
> > +                                         char *page)
> > +{
> > +       struct perf_pmu_events_attr *pmu_attr;
> > +
> > +       pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
> > +
> > +       return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
> > +}
> > +
> > +PMU_EVENT_ATTR(cycle, event_attr_cycle,
> > +              RISCV_IOMMU_HPMEVENT_CYCLE, riscv_iommu_pmu_event_show);
> > +PMU_EVENT_ATTR(dont_count, event_attr_dont_count,
> > +              RISCV_IOMMU_HPMEVENT_INVALID, riscv_iommu_pmu_event_show);
> > +PMU_EVENT_ATTR(untranslated_req, event_attr_untranslated_req,
> > +              RISCV_IOMMU_HPMEVENT_URQ, riscv_iommu_pmu_event_show);
> > +PMU_EVENT_ATTR(translated_req, event_attr_translated_req,
> > +              RISCV_IOMMU_HPMEVENT_TRQ, riscv_iommu_pmu_event_show);
> > +PMU_EVENT_ATTR(ats_trans_req, event_attr_ats_trans_req,
> > +              RISCV_IOMMU_HPMEVENT_ATS_RQ, riscv_iommu_pmu_event_show);
> > +PMU_EVENT_ATTR(tlb_miss, event_attr_tlb_miss,
> > +              RISCV_IOMMU_HPMEVENT_TLB_MISS, riscv_iommu_pmu_event_show);
> > +PMU_EVENT_ATTR(ddt_walks, event_attr_ddt_walks,
> > +              RISCV_IOMMU_HPMEVENT_DD_WALK, riscv_iommu_pmu_event_show);
> > +PMU_EVENT_ATTR(pdt_walks, event_attr_pdt_walks,
> > +              RISCV_IOMMU_HPMEVENT_PD_WALK, riscv_iommu_pmu_event_show);
> > +PMU_EVENT_ATTR(s_vs_pt_walks, event_attr_s_vs_pt_walks,
> > +              RISCV_IOMMU_HPMEVENT_S_VS_WALKS, riscv_iommu_pmu_event_show);
> > +PMU_EVENT_ATTR(g_pt_walks, event_attr_g_pt_walks,
> > +              RISCV_IOMMU_HPMEVENT_G_WALKS, riscv_iommu_pmu_event_show);
> > +
> > +static struct attribute *riscv_iommu_pmu_events[] = {
> > +       &event_attr_cycle.attr.attr,
> > +       &event_attr_dont_count.attr.attr,
> > +       &event_attr_untranslated_req.attr.attr,
> > +       &event_attr_translated_req.attr.attr,
> > +       &event_attr_ats_trans_req.attr.attr,
> > +       &event_attr_tlb_miss.attr.attr,
> > +       &event_attr_ddt_walks.attr.attr,
> > +       &event_attr_pdt_walks.attr.attr,
> > +       &event_attr_s_vs_pt_walks.attr.attr,
> > +       &event_attr_g_pt_walks.attr.attr,
> > +       NULL,
> > +};
> > +
> > +static const struct attribute_group riscv_iommu_pmu_events_group = {
> > +       .name = "events",
> > +       .attrs = riscv_iommu_pmu_events,
> > +};
> > +
> > +static const struct attribute_group *riscv_iommu_pmu_attr_grps[] = {
> > +       &riscv_iommu_pmu_format_group,
> > +       &riscv_iommu_pmu_events_group,
> > +       NULL,
> > +};
> > +
> > +/* PMU Operations */
> > +static void riscv_iommu_pmu_set_counter(struct riscv_iommu_pmu *pmu, u32 idx,
> > +                                       u64 value)
> > +{
> > +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
> > +
> > +       if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> > +               return;
> > +
> > +       if (idx == 0)
> > +               value = (value & ~RISCV_IOMMU_IOHPMCYCLES_OF) |
> > +                        (readq(addr) & RISCV_IOMMU_IOHPMCYCLES_OF);
> > +
> > +       writeq(FIELD_PREP(RISCV_IOMMU_IOHPMCTR_COUNTER, value), addr + idx * 8);
> > +}
> > +
> > +static u64 riscv_iommu_pmu_get_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> > +{
> > +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
> > +       u64 value;
> > +
> > +       if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> > +               return -EINVAL;
> > +
> > +       value = readq(addr + idx * 8);
> > +
> > +       if (idx == 0)
> > +               return FIELD_GET(RISCV_IOMMU_IOHPMCYCLES_COUNTER, value);
> > +
> > +       return FIELD_GET(RISCV_IOMMU_IOHPMCTR_COUNTER, value);
> > +}
> > +
> > +static u64 riscv_iommu_pmu_get_event(struct riscv_iommu_pmu *pmu, u32 idx)
> > +{
> > +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE;
> > +
> > +       if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> > +               return 0;
> > +
> > +       /* There is no associated IOHPMEVT0 for IOHPMCYCLES */
> > +       if (idx == 0)
> > +               return 0;
> > +
> > +       return readq(addr + (idx - 1) * 8);
> > +}
> > +
> > +static void riscv_iommu_pmu_set_event(struct riscv_iommu_pmu *pmu, u32 idx,
> > +                                     u64 value)
> > +{
> > +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE;
> > +
> > +       if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> > +               return;
> > +
> > +       /* There is no associated IOHPMEVT0 for IOHPMCYCLES */
> > +       if (idx == 0)
> > +               return;
> > +
> > +       writeq(value, addr + (idx - 1) * 8);
> > +}
> > +
> > +static void riscv_iommu_pmu_enable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> > +{
> > +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> > +       u32 value = readl(addr);
> > +
> > +       writel(value & ~BIT(idx), addr);
> > +}
> > +
> > +static void riscv_iommu_pmu_disable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> > +{
> > +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> > +       u32 value = readl(addr);
> > +
> > +       writel(value | BIT(idx), addr);
> > +}
> > +
> > +static void riscv_iommu_pmu_enable_ovf_intr(struct riscv_iommu_pmu *pmu, u32 idx)
> > +{
> > +       u64 value;
> > +
> > +       if (get_event(pmu->events[idx]) == RISCV_IOMMU_HPMEVENT_CYCLE) {
> > +               value = riscv_iommu_pmu_get_counter(pmu, idx) & ~RISCV_IOMMU_IOHPMCYCLES_OF;
> > +               writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES);
> > +       } else {
> > +               value = riscv_iommu_pmu_get_event(pmu, idx) & ~RISCV_IOMMU_IOHPMEVT_OF;
> > +               writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE + (idx - 1) * 8);
> > +       }
> > +}
> > +
> > +static void riscv_iommu_pmu_disable_ovf_intr(struct riscv_iommu_pmu *pmu, u32 idx)
> > +{
> > +       u64 value;
> > +
> > +       if (get_event(pmu->events[idx]) == RISCV_IOMMU_HPMEVENT_CYCLE) {
> > +               value = riscv_iommu_pmu_get_counter(pmu, idx) | RISCV_IOMMU_IOHPMCYCLES_OF;
> > +               writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES);
> > +       } else {
> > +               value = riscv_iommu_pmu_get_event(pmu, idx) | RISCV_IOMMU_IOHPMEVT_OF;
> > +               writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE + (idx - 1) * 8);
> > +       }
> > +}
> > +
> > +static void riscv_iommu_pmu_start_all(struct riscv_iommu_pmu *pmu)
> > +{
> > +       int idx;
> > +
> > +       for_each_set_bit(idx, pmu->used_counters, pmu->num_counters) {
> > +               riscv_iommu_pmu_enable_ovf_intr(pmu, idx);
> > +               riscv_iommu_pmu_enable_counter(pmu, idx);
> > +       }
> > +}
> > +
> > +static void riscv_iommu_pmu_stop_all(struct riscv_iommu_pmu *pmu)
> > +{
> > +       writel(GENMASK_ULL(pmu->num_counters - 1, 0),
> > +              pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH);
> > +}
> > +
> > +/* PMU APIs */
> > +static int riscv_iommu_pmu_set_period(struct perf_event *event)
> > +{
> > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > +       struct hw_perf_event *hwc = &event->hw;
> > +       s64 left = local64_read(&hwc->period_left);
> > +       s64 period = hwc->sample_period;
> > +       u64 max_period = pmu->mask_counter;
> > +       int ret = 0;
> > +
> > +       if (unlikely(left <= -period)) {
> > +               left = period;
> > +               local64_set(&hwc->period_left, left);
> > +               hwc->last_period = period;
> > +               ret = 1;
> > +       }
> > +
> > +       if (unlikely(left <= 0)) {
> > +               left += period;
> > +               local64_set(&hwc->period_left, left);
> > +               hwc->last_period = period;
> > +               ret = 1;
> > +       }
> > +
> > +       /*
> > +        * Limit the maximum period to prevent the counter value
> > +        * from overtaking the one we are about to program. In
> > +        * effect we are reducing max_period to account for
> > +        * interrupt latency (and we are being very conservative).
> > +        */
> > +       if (left > (max_period >> 1))
> > +               left = (max_period >> 1);
> > +
> > +       local64_set(&hwc->prev_count, (u64)-left);
> > +       riscv_iommu_pmu_set_counter(pmu, hwc->idx, (u64)(-left) & max_period);
> > +       perf_event_update_userpage(event);
> > +
> > +       return ret;
> > +}
> > +
> > +static int riscv_iommu_pmu_event_init(struct perf_event *event)
> > +{
> > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > +       struct hw_perf_event *hwc = &event->hw;
> > +
> > +       hwc->idx = -1;
> > +       hwc->config = event->attr.config;
> > +
> > +       if (!is_sampling_event(event)) {
> > +               /*
> > +                * For non-sampling runs, limit the sample_period to half
> > +                * of the counter width. That way, the new counter value
> > +                * is far less likely to overtake the previous one unless
> > +                * you have some serious IRQ latency issues.
> > +                */
> > +               hwc->sample_period = pmu->mask_counter >> 1;
> > +               hwc->last_period = hwc->sample_period;
> > +               local64_set(&hwc->period_left, hwc->sample_period);
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +static void riscv_iommu_pmu_update(struct perf_event *event)
> > +{
> > +       struct hw_perf_event *hwc = &event->hw;
> > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > +       u64 delta, prev, now;
> > +       u32 idx = hwc->idx;
> > +
> > +       do {
> > +               prev = local64_read(&hwc->prev_count);
> > +               now = riscv_iommu_pmu_get_counter(pmu, idx);
> > +       } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
> > +
> > +       delta = FIELD_GET(RISCV_IOMMU_IOHPMCTR_COUNTER, now - prev) & pmu->mask_counter;
> > +       local64_add(delta, &event->count);
> > +       local64_sub(delta, &hwc->period_left);
> > +}
> > +
> > +static void riscv_iommu_pmu_start(struct perf_event *event, int flags)
> > +{
> > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > +       struct hw_perf_event *hwc = &event->hw;
> > +
> > +       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> > +               return;
> > +
> > +       if (flags & PERF_EF_RELOAD)
> > +               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
> > +
> > +       hwc->state = 0;
> > +       riscv_iommu_pmu_set_period(event);
> > +       riscv_iommu_pmu_set_event(pmu, hwc->idx, hwc->config);
> > +       riscv_iommu_pmu_enable_ovf_intr(pmu, hwc->idx);
> > +       riscv_iommu_pmu_enable_counter(pmu, hwc->idx);
> > +
> > +       perf_event_update_userpage(event);
> > +}
> > +
> > +static void riscv_iommu_pmu_stop(struct perf_event *event, int flags)
> > +{
> > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > +       struct hw_perf_event *hwc = &event->hw;
> > +
> > +       if (hwc->state & PERF_HES_STOPPED)
> > +               return;
> > +
> > +       riscv_iommu_pmu_set_event(pmu, hwc->idx, RISCV_IOMMU_HPMEVENT_INVALID);
> > +       riscv_iommu_pmu_disable_counter(pmu, hwc->idx);
> > +
> > +       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE))
> > +               riscv_iommu_pmu_update(event);
> > +
> > +       hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
> > +}
> > +
> > +static int riscv_iommu_pmu_add(struct perf_event *event, int flags)
> > +{
> > +       struct hw_perf_event *hwc = &event->hw;
> > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > +       unsigned int num_counters = pmu->num_counters;
> > +       int idx;
> > +
> > +       /* Reserve index zero for iohpmcycles */
> > +       if (get_event(event) == RISCV_IOMMU_HPMEVENT_CYCLE)
> > +               idx = 0;
> > +       else
> > +               idx = find_next_zero_bit(pmu->used_counters, num_counters, 1);
> > +
> > +       if (idx == num_counters)
> > +               return -EAGAIN;
> > +
> > +       set_bit(idx, pmu->used_counters);
> > +
> > +       pmu->events[idx] = event;
> > +       hwc->idx = idx;
> > +       hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
> > +
> > +       if (flags & PERF_EF_START)
> > +               riscv_iommu_pmu_start(event, flags);
> > +
> > +       /* Propagate changes to the userspace mapping. */
> > +       perf_event_update_userpage(event);
> > +
> > +       return 0;
> > +}
> > +
> > +static void riscv_iommu_pmu_read(struct perf_event *event)
> > +{
> > +       riscv_iommu_pmu_update(event);
> > +}
> > +
> > +static void riscv_iommu_pmu_del(struct perf_event *event, int flags)
> > +{
> > +       struct hw_perf_event *hwc = &event->hw;
> > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > +       int idx = hwc->idx;
> > +
> > +       riscv_iommu_pmu_stop(event, PERF_EF_UPDATE);
> > +       pmu->events[idx] = NULL;
> > +       clear_bit(idx, pmu->used_counters);
> > +       perf_event_update_userpage(event);
> > +}
> > +
> > +irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu)
> > +{
> > +       struct perf_sample_data data;
> > +       struct pt_regs *regs;
> > +       u32 ovf = readl(pmu->reg + RISCV_IOMMU_REG_IOCOUNTOVF);
> > +       int idx;
> > +
> > +       if (!ovf)
> > +               return IRQ_NONE;
> > +
> > +       riscv_iommu_pmu_stop_all(pmu);
> > +
> > +       regs = get_irq_regs();
> > +
> > +       for_each_set_bit(idx, (unsigned long *)&ovf, pmu->num_counters) {
> > +               struct perf_event *event = pmu->events[idx];
> > +               struct hw_perf_event *hwc;
> > +
> > +               if (WARN_ON_ONCE(!event) || !is_sampling_event(event))
> > +                       continue;
> > +
> > +               hwc = &event->hw;
> > +
> > +               riscv_iommu_pmu_update(event);
> > +               perf_sample_data_init(&data, 0, hwc->last_period);
> > +               if (!riscv_iommu_pmu_set_period(event))
> > +                       continue;
> > +
> > +               if (perf_event_overflow(event, &data, regs))
> > +                       riscv_iommu_pmu_stop(event, 0);
> > +       }
> > +
> > +       riscv_iommu_pmu_start_all(pmu);
> > +
> > +       return IRQ_HANDLED;
> > +}
> > +
> > +int riscv_iommu_pmu_init(struct riscv_iommu_pmu *pmu, void __iomem *reg,
> > +                        const char *dev_name)
> > +{
> > +       char *name;
> > +       int ret;
> > +
> > +       pmu->reg = reg;
> > +       pmu->num_counters = RISCV_IOMMU_HPM_COUNTER_NUM;
> > +       pmu->mask_counter = RISCV_IOMMU_IOHPMCTR_COUNTER;
> > +
> > +       pmu->pmu = (struct pmu) {
> > +               .task_ctx_nr    = perf_invalid_context,
> > +               .event_init     = riscv_iommu_pmu_event_init,
> > +               .add            = riscv_iommu_pmu_add,
> > +               .del            = riscv_iommu_pmu_del,
> > +               .start          = riscv_iommu_pmu_start,
> > +               .stop           = riscv_iommu_pmu_stop,
> > +               .read           = riscv_iommu_pmu_read,
> > +               .attr_groups    = riscv_iommu_pmu_attr_grps,
> > +               .capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
> > +               .module         = THIS_MODULE,
> > +       };
> > +
> > +       name = kasprintf(GFP_KERNEL, "riscv_iommu_pmu_%s", dev_name);
>
> The dev_name of the RISC-V IOMMU is usually 'riscv,iommu'. If we compose
> the IOMMU PMU name from the IOMMU dev name, then the perf subsystem may
> not handle the PMU event name correctly because of the ',' in it.

I assume you are referring to the compatible string, since 'riscv,iommu'
appears to be the compatible string. However, as far as I can tell, the
dev_name is derived from the node name rather than from the compatible
string. As a result, it contains no comma (','); a dot ('.') is used
instead. For example, if the IOMMU node in the device tree is
'iommu@12345678', the dev_name would be '12345678.iommu'.
Please let me know if I missed anything. Thanks.

>
> Best Regards,
>
> Xu Lu
>
> > +
> > +       ret = perf_pmu_register(&pmu->pmu, name, -1);
> > +       if (ret) {
> > +               pr_err("Failed to register riscv_iommu_pmu_%s: %d\n",
> > +                      dev_name, ret);
> > +               return ret;
> > +       }
> > +
> > +       /* Stop all counters and later start the counter with perf */
> > +       riscv_iommu_pmu_stop_all(pmu);
> > +
> > +       pr_info("riscv_iommu_pmu_%s: Registered with %d counters\n",
> > +               dev_name, pmu->num_counters);
> > +
> > +       return 0;
> > +}
> > +
> > +void riscv_iommu_pmu_uninit(struct riscv_iommu_pmu *pmu)
> > +{
> > +       int idx;
> > +
> > +       /* Disable interrupt and functions */
> > +       for_each_set_bit(idx, pmu->used_counters, pmu->num_counters) {
> > +               riscv_iommu_pmu_disable_counter(pmu, idx);
> > +               riscv_iommu_pmu_disable_ovf_intr(pmu, idx);
> > +       }
> > +
> > +       perf_pmu_unregister(&pmu->pmu);
> > +}
> > diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> > index b1c4664542b4..92659a8a75ae 100644
> > --- a/drivers/iommu/riscv/iommu.h
> > +++ b/drivers/iommu/riscv/iommu.h
> > @@ -60,11 +60,19 @@ struct riscv_iommu_device {
> >         unsigned int ddt_mode;
> >         dma_addr_t ddt_phys;
> >         u64 *ddt_root;
> > +
> > +       /* hardware performance monitor */
> > +       struct riscv_iommu_pmu pmu;
> >  };
> >
> >  int riscv_iommu_init(struct riscv_iommu_device *iommu);
> >  void riscv_iommu_remove(struct riscv_iommu_device *iommu);
> >
> > +int riscv_iommu_pmu_init(struct riscv_iommu_pmu *pmu, void __iomem *reg,
> > +                        const char *name);
> > +void riscv_iommu_pmu_uninit(struct riscv_iommu_pmu *pmu);
> > +irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu);
> > +
> >  #define riscv_iommu_readl(iommu, addr) \
> >         readl_relaxed((iommu)->reg + (addr))
> >
> > --
> > 2.17.1
> >
Xu Lu Jan. 15, 2025, 8:25 a.m. UTC | #3
On Wed, Jan 15, 2025 at 3:49 PM Zong Li <zong.li@sifive.com> wrote:
>
> On Wed, Jan 15, 2025 at 11:45 AM Xu Lu <luxu.kernel@bytedance.com> wrote:
> >
> > Hi Zong,
> >
> > On Wed, Jan 15, 2025 at 11:03 AM Zong Li <zong.li@sifive.com> wrote:
> > >
> > > Support for the RISC-V IOMMU hardware performance monitor includes
> > > both counting and sampling modes.
> > >
> > > The specification does not define an event ID for counting the
> > > number of clock cycles, meaning there is no associated `iohpmevt0`.
> > > However, we need an event for counting cycle, so we reserve the
> > > maximum event ID for this purpose.
> > >
> > > Signed-off-by: Zong Li <zong.li@sifive.com>
> > > Tested-by: Xu Lu <luxu.kernel@bytedance.com>
> > > ---
> > >  drivers/iommu/riscv/Makefile     |   2 +-
> > >  drivers/iommu/riscv/iommu-bits.h |  16 +
> > >  drivers/iommu/riscv/iommu-pmu.c  | 486 +++++++++++++++++++++++++++++++
> > >  drivers/iommu/riscv/iommu.h      |   8 +
> > >  4 files changed, 511 insertions(+), 1 deletion(-)
> > >  create mode 100644 drivers/iommu/riscv/iommu-pmu.c
> > >
> > > diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
> > > index f54c9ed17d41..d36625a1fd08 100644
> > > --- a/drivers/iommu/riscv/Makefile
> > > +++ b/drivers/iommu/riscv/Makefile
> > > @@ -1,3 +1,3 @@
> > >  # SPDX-License-Identifier: GPL-2.0-only
> > > -obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o
> > > +obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o iommu-pmu.o
> > >  obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o
> > > diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
> > > index 98daf0e1a306..60523449f016 100644
> > > --- a/drivers/iommu/riscv/iommu-bits.h
> > > +++ b/drivers/iommu/riscv/iommu-bits.h
> > > @@ -17,6 +17,7 @@
> > >  #include <linux/types.h>
> > >  #include <linux/bitfield.h>
> > >  #include <linux/bits.h>
> > > +#include <linux/perf_event.h>
> > >
> > >  /*
> > >   * Chapter 5: Memory Mapped register interface
> > > @@ -207,6 +208,7 @@ enum riscv_iommu_ddtp_modes {
> > >  /* 5.22 Performance monitoring event counters (31 * 64bits) */
> > >  #define RISCV_IOMMU_REG_IOHPMCTR_BASE  0x0068
> > >  #define RISCV_IOMMU_REG_IOHPMCTR(_n)   (RISCV_IOMMU_REG_IOHPMCTR_BASE + ((_n) * 0x8))
> > > +#define RISCV_IOMMU_IOHPMCTR_COUNTER   GENMASK_ULL(63, 0)
> > >
> > >  /* 5.23 Performance monitoring event selectors (31 * 64bits) */
> > >  #define RISCV_IOMMU_REG_IOHPMEVT_BASE  0x0160
> > > @@ -250,6 +252,20 @@ enum riscv_iommu_hpmevent_id {
> > >         RISCV_IOMMU_HPMEVENT_MAX        = 9
> > >  };
> > >
> > > +/* Use maximum event ID for cycle event */
> > > +#define RISCV_IOMMU_HPMEVENT_CYCLE     GENMASK_ULL(14, 0)
> > > +
> > > +#define RISCV_IOMMU_HPM_COUNTER_NUM    32
> > > +
> > > +struct riscv_iommu_pmu {
> > > +       struct pmu pmu;
> > > +       void __iomem *reg;
> > > +       int num_counters;
> > > +       u64 mask_counter;
> > > +       struct perf_event *events[RISCV_IOMMU_IOHPMEVT_CNT + 1];
> > > +       DECLARE_BITMAP(used_counters, RISCV_IOMMU_IOHPMEVT_CNT + 1);
> > > +};
> > > +
> > >  /* 5.24 Translation request IOVA (64bits) */
> > >  #define RISCV_IOMMU_REG_TR_REQ_IOVA     0x0258
> > >  #define RISCV_IOMMU_TR_REQ_IOVA_VPN    GENMASK_ULL(63, 12)
> > > diff --git a/drivers/iommu/riscv/iommu-pmu.c b/drivers/iommu/riscv/iommu-pmu.c
> > > new file mode 100644
> > > index 000000000000..74eb1525cd32
> > > --- /dev/null
> > > +++ b/drivers/iommu/riscv/iommu-pmu.c
> > > @@ -0,0 +1,486 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/*
> > > + * Copyright (C) 2024 SiFive
> > > + *
> > > + * Authors
> > > + *     Zong Li <zong.li@sifive.com>
> > > + */
> > > +
> > > +#include <linux/io-64-nonatomic-hi-lo.h>
> > > +
> > > +#include "iommu.h"
> > > +#include "iommu-bits.h"
> > > +
> > > +#define to_riscv_iommu_pmu(p) (container_of(p, struct riscv_iommu_pmu, pmu))
> > > +
> > > +#define RISCV_IOMMU_PMU_ATTR_EXTRACTOR(_name, _mask)                   \
> > > +       static inline u32 get_##_name(struct perf_event *event)         \
> > > +       {                                                               \
> > > +               return FIELD_GET(_mask, event->attr.config);            \
> > > +       }                                                               \
> > > +
> > > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(event, RISCV_IOMMU_IOHPMEVT_EVENTID);
> > > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(partial_matching, RISCV_IOMMU_IOHPMEVT_DMASK);
> > > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(pid_pscid, RISCV_IOMMU_IOHPMEVT_PID_PSCID);
> > > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(did_gscid, RISCV_IOMMU_IOHPMEVT_DID_GSCID);
> > > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_pid_pscid, RISCV_IOMMU_IOHPMEVT_PV_PSCV);
> > > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_did_gscid, RISCV_IOMMU_IOHPMEVT_DV_GSCV);
> > > +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_id_type, RISCV_IOMMU_IOHPMEVT_IDT);
> > > +
> > > +/* Formats */
> > > +PMU_FORMAT_ATTR(event, "config:0-14");
> > > +PMU_FORMAT_ATTR(partial_matching, "config:15");
> > > +PMU_FORMAT_ATTR(pid_pscid, "config:16-35");
> > > +PMU_FORMAT_ATTR(did_gscid, "config:36-59");
> > > +PMU_FORMAT_ATTR(filter_pid_pscid, "config:60");
> > > +PMU_FORMAT_ATTR(filter_did_gscid, "config:61");
> > > +PMU_FORMAT_ATTR(filter_id_type, "config:62");
> > > +
> > > +static struct attribute *riscv_iommu_pmu_formats[] = {
> > > +       &format_attr_event.attr,
> > > +       &format_attr_partial_matching.attr,
> > > +       &format_attr_pid_pscid.attr,
> > > +       &format_attr_did_gscid.attr,
> > > +       &format_attr_filter_pid_pscid.attr,
> > > +       &format_attr_filter_did_gscid.attr,
> > > +       &format_attr_filter_id_type.attr,
> > > +       NULL,
> > > +};
> > > +
> > > +static const struct attribute_group riscv_iommu_pmu_format_group = {
> > > +       .name = "format",
> > > +       .attrs = riscv_iommu_pmu_formats,
> > > +};
> > > +
> > > +/* Events */
> > > +static ssize_t riscv_iommu_pmu_event_show(struct device *dev,
> > > +                                         struct device_attribute *attr,
> > > +                                         char *page)
> > > +{
> > > +       struct perf_pmu_events_attr *pmu_attr;
> > > +
> > > +       pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
> > > +
> > > +       return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
> > > +}
> > > +
> > > +PMU_EVENT_ATTR(cycle, event_attr_cycle,
> > > +              RISCV_IOMMU_HPMEVENT_CYCLE, riscv_iommu_pmu_event_show);
> > > +PMU_EVENT_ATTR(dont_count, event_attr_dont_count,
> > > +              RISCV_IOMMU_HPMEVENT_INVALID, riscv_iommu_pmu_event_show);
> > > +PMU_EVENT_ATTR(untranslated_req, event_attr_untranslated_req,
> > > +              RISCV_IOMMU_HPMEVENT_URQ, riscv_iommu_pmu_event_show);
> > > +PMU_EVENT_ATTR(translated_req, event_attr_translated_req,
> > > +              RISCV_IOMMU_HPMEVENT_TRQ, riscv_iommu_pmu_event_show);
> > > +PMU_EVENT_ATTR(ats_trans_req, event_attr_ats_trans_req,
> > > +              RISCV_IOMMU_HPMEVENT_ATS_RQ, riscv_iommu_pmu_event_show);
> > > +PMU_EVENT_ATTR(tlb_miss, event_attr_tlb_miss,
> > > +              RISCV_IOMMU_HPMEVENT_TLB_MISS, riscv_iommu_pmu_event_show);
> > > +PMU_EVENT_ATTR(ddt_walks, event_attr_ddt_walks,
> > > +              RISCV_IOMMU_HPMEVENT_DD_WALK, riscv_iommu_pmu_event_show);
> > > +PMU_EVENT_ATTR(pdt_walks, event_attr_pdt_walks,
> > > +              RISCV_IOMMU_HPMEVENT_PD_WALK, riscv_iommu_pmu_event_show);
> > > +PMU_EVENT_ATTR(s_vs_pt_walks, event_attr_s_vs_pt_walks,
> > > +              RISCV_IOMMU_HPMEVENT_S_VS_WALKS, riscv_iommu_pmu_event_show);
> > > +PMU_EVENT_ATTR(g_pt_walks, event_attr_g_pt_walks,
> > > +              RISCV_IOMMU_HPMEVENT_G_WALKS, riscv_iommu_pmu_event_show);
> > > +
> > > +static struct attribute *riscv_iommu_pmu_events[] = {
> > > +       &event_attr_cycle.attr.attr,
> > > +       &event_attr_dont_count.attr.attr,
> > > +       &event_attr_untranslated_req.attr.attr,
> > > +       &event_attr_translated_req.attr.attr,
> > > +       &event_attr_ats_trans_req.attr.attr,
> > > +       &event_attr_tlb_miss.attr.attr,
> > > +       &event_attr_ddt_walks.attr.attr,
> > > +       &event_attr_pdt_walks.attr.attr,
> > > +       &event_attr_s_vs_pt_walks.attr.attr,
> > > +       &event_attr_g_pt_walks.attr.attr,
> > > +       NULL,
> > > +};
> > > +
> > > +static const struct attribute_group riscv_iommu_pmu_events_group = {
> > > +       .name = "events",
> > > +       .attrs = riscv_iommu_pmu_events,
> > > +};
> > > +
> > > +static const struct attribute_group *riscv_iommu_pmu_attr_grps[] = {
> > > +       &riscv_iommu_pmu_format_group,
> > > +       &riscv_iommu_pmu_events_group,
> > > +       NULL,
> > > +};
> > > +
> > > +/* PMU Operations */
> > > +static void riscv_iommu_pmu_set_counter(struct riscv_iommu_pmu *pmu, u32 idx,
> > > +                                       u64 value)
> > > +{
> > > +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
> > > +
> > > +       if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> > > +               return;
> > > +
> > > +       if (idx == 0)
> > > +               value = (value & ~RISCV_IOMMU_IOHPMCYCLES_OF) |
> > > +                        (readq(addr) & RISCV_IOMMU_IOHPMCYCLES_OF);
> > > +
> > > +       writeq(FIELD_PREP(RISCV_IOMMU_IOHPMCTR_COUNTER, value), addr + idx * 8);
> > > +}
> > > +
> > > +static u64 riscv_iommu_pmu_get_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> > > +{
> > > +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
> > > +       u64 value;
> > > +
> > > +       if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> > > +               return -EINVAL;
> > > +
> > > +       value = readq(addr + idx * 8);
> > > +
> > > +       if (idx == 0)
> > > +               return FIELD_GET(RISCV_IOMMU_IOHPMCYCLES_COUNTER, value);
> > > +
> > > +       return FIELD_GET(RISCV_IOMMU_IOHPMCTR_COUNTER, value);
> > > +}
> > > +
> > > +static u64 riscv_iommu_pmu_get_event(struct riscv_iommu_pmu *pmu, u32 idx)
> > > +{
> > > +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE;
> > > +
> > > +       if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> > > +               return 0;
> > > +
> > > +       /* There is no associated IOHPMEVT0 for IOHPMCYCLES */
> > > +       if (idx == 0)
> > > +               return 0;
> > > +
> > > +       return readq(addr + (idx - 1) * 8);
> > > +}
> > > +
> > > +static void riscv_iommu_pmu_set_event(struct riscv_iommu_pmu *pmu, u32 idx,
> > > +                                     u64 value)
> > > +{
> > > +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE;
> > > +
> > > +       if (WARN_ON_ONCE(idx < 0 || idx > pmu->num_counters))
> > > +               return;
> > > +
> > > +       /* There is no associated IOHPMEVT0 for IOHPMCYCLES */
> > > +       if (idx == 0)
> > > +               return;
> > > +
> > > +       writeq(value, addr + (idx - 1) * 8);
> > > +}
> > > +
> > > +static void riscv_iommu_pmu_enable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> > > +{
> > > +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> > > +       u32 value = readl(addr);
> > > +
> > > +       writel(value & ~BIT(idx), addr);
> > > +}
> > > +
> > > +static void riscv_iommu_pmu_disable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> > > +{
> > > +       void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> > > +       u32 value = readl(addr);
> > > +
> > > +       writel(value | BIT(idx), addr);
> > > +}
> > > +
> > > +static void riscv_iommu_pmu_enable_ovf_intr(struct riscv_iommu_pmu *pmu, u32 idx)
> > > +{
> > > +       u64 value;
> > > +
> > > +       if (get_event(pmu->events[idx]) == RISCV_IOMMU_HPMEVENT_CYCLE) {
> > > +               value = riscv_iommu_pmu_get_counter(pmu, idx) & ~RISCV_IOMMU_IOHPMCYCLES_OF;
> > > +               writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES);
> > > +       } else {
> > > +               value = riscv_iommu_pmu_get_event(pmu, idx) & ~RISCV_IOMMU_IOHPMEVT_OF;
> > > +               writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE + (idx - 1) * 8);
> > > +       }
> > > +}
> > > +
> > > +static void riscv_iommu_pmu_disable_ovf_intr(struct riscv_iommu_pmu *pmu, u32 idx)
> > > +{
> > > +       u64 value;
> > > +
> > > +       if (get_event(pmu->events[idx]) == RISCV_IOMMU_HPMEVENT_CYCLE) {
> > > +               value = riscv_iommu_pmu_get_counter(pmu, idx) | RISCV_IOMMU_IOHPMCYCLES_OF;
> > > +               writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES);
> > > +       } else {
> > > +               value = riscv_iommu_pmu_get_event(pmu, idx) | RISCV_IOMMU_IOHPMEVT_OF;
> > > +               writeq(value, pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE + (idx - 1) * 8);
> > > +       }
> > > +}
> > > +
> > > +static void riscv_iommu_pmu_start_all(struct riscv_iommu_pmu *pmu)
> > > +{
> > > +       int idx;
> > > +
> > > +       for_each_set_bit(idx, pmu->used_counters, pmu->num_counters) {
> > > +               riscv_iommu_pmu_enable_ovf_intr(pmu, idx);
> > > +               riscv_iommu_pmu_enable_counter(pmu, idx);
> > > +       }
> > > +}
> > > +
> > > +static void riscv_iommu_pmu_stop_all(struct riscv_iommu_pmu *pmu)
> > > +{
> > > +       writel(GENMASK_ULL(pmu->num_counters - 1, 0),
> > > +              pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH);
> > > +}
> > > +
> > > +/* PMU APIs */
> > > +static int riscv_iommu_pmu_set_period(struct perf_event *event)
> > > +{
> > > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > > +       struct hw_perf_event *hwc = &event->hw;
> > > +       s64 left = local64_read(&hwc->period_left);
> > > +       s64 period = hwc->sample_period;
> > > +       u64 max_period = pmu->mask_counter;
> > > +       int ret = 0;
> > > +
> > > +       if (unlikely(left <= -period)) {
> > > +               left = period;
> > > +               local64_set(&hwc->period_left, left);
> > > +               hwc->last_period = period;
> > > +               ret = 1;
> > > +       }
> > > +
> > > +       if (unlikely(left <= 0)) {
> > > +               left += period;
> > > +               local64_set(&hwc->period_left, left);
> > > +               hwc->last_period = period;
> > > +               ret = 1;
> > > +       }
> > > +
> > > +       /*
> > > +        * Limit the maximum period to prevent the counter value
> > > +        * from overtaking the one we are about to program. In
> > > +        * effect we are reducing max_period to account for
> > > +        * interrupt latency (and we are being very conservative).
> > > +        */
> > > +       if (left > (max_period >> 1))
> > > +               left = (max_period >> 1);
> > > +
> > > +       local64_set(&hwc->prev_count, (u64)-left);
> > > +       riscv_iommu_pmu_set_counter(pmu, hwc->idx, (u64)(-left) & max_period);
> > > +       perf_event_update_userpage(event);
> > > +
> > > +       return ret;
> > > +}
> > > +
> > > +static int riscv_iommu_pmu_event_init(struct perf_event *event)
> > > +{
> > > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > > +       struct hw_perf_event *hwc = &event->hw;
> > > +
> > > +       hwc->idx = -1;
> > > +       hwc->config = event->attr.config;
> > > +
> > > +       if (!is_sampling_event(event)) {
> > > +               /*
> > > +                * For non-sampling runs, limit the sample_period to half
> > > +                * of the counter width. That way, the new counter value
> > > +                * is far less likely to overtake the previous one unless
> > > +                * you have some serious IRQ latency issues.
> > > +                */
> > > +               hwc->sample_period = pmu->mask_counter >> 1;
> > > +               hwc->last_period = hwc->sample_period;
> > > +               local64_set(&hwc->period_left, hwc->sample_period);
> > > +       }
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static void riscv_iommu_pmu_update(struct perf_event *event)
> > > +{
> > > +       struct hw_perf_event *hwc = &event->hw;
> > > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > > +       u64 delta, prev, now;
> > > +       u32 idx = hwc->idx;
> > > +
> > > +       do {
> > > +               prev = local64_read(&hwc->prev_count);
> > > +               now = riscv_iommu_pmu_get_counter(pmu, idx);
> > > +       } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
> > > +
> > > +       delta = FIELD_GET(RISCV_IOMMU_IOHPMCTR_COUNTER, now - prev) & pmu->mask_counter;
> > > +       local64_add(delta, &event->count);
> > > +       local64_sub(delta, &hwc->period_left);
> > > +}
> > > +
> > > +static void riscv_iommu_pmu_start(struct perf_event *event, int flags)
> > > +{
> > > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > > +       struct hw_perf_event *hwc = &event->hw;
> > > +
> > > +       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> > > +               return;
> > > +
> > > +       if (flags & PERF_EF_RELOAD)
> > > +               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
> > > +
> > > +       hwc->state = 0;
> > > +       riscv_iommu_pmu_set_period(event);
> > > +       riscv_iommu_pmu_set_event(pmu, hwc->idx, hwc->config);
> > > +       riscv_iommu_pmu_enable_ovf_intr(pmu, hwc->idx);
> > > +       riscv_iommu_pmu_enable_counter(pmu, hwc->idx);
> > > +
> > > +       perf_event_update_userpage(event);
> > > +}
> > > +
> > > +static void riscv_iommu_pmu_stop(struct perf_event *event, int flags)
> > > +{
> > > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > > +       struct hw_perf_event *hwc = &event->hw;
> > > +
> > > +       if (hwc->state & PERF_HES_STOPPED)
> > > +               return;
> > > +
> > > +       riscv_iommu_pmu_set_event(pmu, hwc->idx, RISCV_IOMMU_HPMEVENT_INVALID);
> > > +       riscv_iommu_pmu_disable_counter(pmu, hwc->idx);
> > > +
> > > +       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE))
> > > +               riscv_iommu_pmu_update(event);
> > > +
> > > +       hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
> > > +}
> > > +
> > > +static int riscv_iommu_pmu_add(struct perf_event *event, int flags)
> > > +{
> > > +       struct hw_perf_event *hwc = &event->hw;
> > > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > > +       unsigned int num_counters = pmu->num_counters;
> > > +       int idx;
> > > +
> > > +       /* Reserve index zero for iohpmcycles */
> > > +       if (get_event(event) == RISCV_IOMMU_HPMEVENT_CYCLE)
> > > +               idx = 0;
> > > +       else
> > > +               idx = find_next_zero_bit(pmu->used_counters, num_counters, 1);
> > > +
> > > +       if (idx == num_counters)
> > > +               return -EAGAIN;
> > > +
> > > +       set_bit(idx, pmu->used_counters);
> > > +
> > > +       pmu->events[idx] = event;
> > > +       hwc->idx = idx;
> > > +       hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
> > > +
> > > +       if (flags & PERF_EF_START)
> > > +               riscv_iommu_pmu_start(event, flags);
> > > +
> > > +       /* Propagate changes to the userspace mapping. */
> > > +       perf_event_update_userpage(event);
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static void riscv_iommu_pmu_read(struct perf_event *event)
> > > +{
> > > +       riscv_iommu_pmu_update(event);
> > > +}
> > > +
> > > +static void riscv_iommu_pmu_del(struct perf_event *event, int flags)
> > > +{
> > > +       struct hw_perf_event *hwc = &event->hw;
> > > +       struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> > > +       int idx = hwc->idx;
> > > +
> > > +       riscv_iommu_pmu_stop(event, PERF_EF_UPDATE);
> > > +       pmu->events[idx] = NULL;
> > > +       clear_bit(idx, pmu->used_counters);
> > > +       perf_event_update_userpage(event);
> > > +}
> > > +
> > > +irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu)
> > > +{
> > > +       struct perf_sample_data data;
> > > +       struct pt_regs *regs;
> > > +       u32 ovf = readl(pmu->reg + RISCV_IOMMU_REG_IOCOUNTOVF);
> > > +       int idx;
> > > +
> > > +       if (!ovf)
> > > +               return IRQ_NONE;
> > > +
> > > +       riscv_iommu_pmu_stop_all(pmu);
> > > +
> > > +       regs = get_irq_regs();
> > > +
> > > +       for_each_set_bit(idx, (unsigned long *)&ovf, pmu->num_counters) {
> > > +               struct perf_event *event = pmu->events[idx];
> > > +               struct hw_perf_event *hwc;
> > > +
> > > +               if (WARN_ON_ONCE(!event) || !is_sampling_event(event))
> > > +                       continue;
> > > +
> > > +               hwc = &event->hw;
> > > +
> > > +               riscv_iommu_pmu_update(event);
> > > +               perf_sample_data_init(&data, 0, hwc->last_period);
> > > +               if (!riscv_iommu_pmu_set_period(event))
> > > +                       continue;
> > > +
> > > +               if (perf_event_overflow(event, &data, regs))
> > > +                       riscv_iommu_pmu_stop(event, 0);
> > > +       }
> > > +
> > > +       riscv_iommu_pmu_start_all(pmu);
> > > +
> > > +       return IRQ_HANDLED;
> > > +}
> > > +
> > > +int riscv_iommu_pmu_init(struct riscv_iommu_pmu *pmu, void __iomem *reg,
> > > +                        const char *dev_name)
> > > +{
> > > +       char *name;
> > > +       int ret;
> > > +
> > > +       pmu->reg = reg;
> > > +       pmu->num_counters = RISCV_IOMMU_HPM_COUNTER_NUM;
> > > +       pmu->mask_counter = RISCV_IOMMU_IOHPMCTR_COUNTER;
> > > +
> > > +       pmu->pmu = (struct pmu) {
> > > +               .task_ctx_nr    = perf_invalid_context,
> > > +               .event_init     = riscv_iommu_pmu_event_init,
> > > +               .add            = riscv_iommu_pmu_add,
> > > +               .del            = riscv_iommu_pmu_del,
> > > +               .start          = riscv_iommu_pmu_start,
> > > +               .stop           = riscv_iommu_pmu_stop,
> > > +               .read           = riscv_iommu_pmu_read,
> > > +               .attr_groups    = riscv_iommu_pmu_attr_grps,
> > > +               .capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
> > > +               .module         = THIS_MODULE,
> > > +       };
> > > +
> > > +       name = kasprintf(GFP_KERNEL, "riscv_iommu_pmu_%s", dev_name);
> >
> > The dev_name of the RISC-V IOMMU is usually 'riscv,iommu'. If we compose
> > the IOMMU PMU name from the IOMMU dev name, the perf subsystem may not
> > be able to handle the PMU event name correctly because of the ',' in it.
>
> I assume you are referring to compatible string because 'riscv,iommu'
> appears to be the compatible string. However, it seems to me that the
> dev_name is derived from the node name rather than the compatible
> string. As a result, there is no comma (',') symbol; instead, a dot
> ('.') symbol is used. For example, if the IOMMU node in the device
> tree is 'iommu@12345678', the dev_name would be '12345678.iommu'.
> Please let me know if I missed anything. Thanks
>

Tested it on qemu. You are right. Please ignore me. Thanks!

> >
> > Best Regards,
> >
> > Xu Lu
> >
> > > +
> > > +       ret = perf_pmu_register(&pmu->pmu, name, -1);
> > > +       if (ret) {
> > > +               pr_err("Failed to register riscv_iommu_pmu_%s: %d\n",
> > > +                      dev_name, ret);
> > > +               return ret;
> > > +       }
> > > +
> > > +       /* Stop all counters and later start the counter with perf */
> > > +       riscv_iommu_pmu_stop_all(pmu);
> > > +
> > > +       pr_info("riscv_iommu_pmu_%s: Registered with %d counters\n",
> > > +               dev_name, pmu->num_counters);
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +void riscv_iommu_pmu_uninit(struct riscv_iommu_pmu *pmu)
> > > +{
> > > +       int idx;
> > > +
> > > +       /* Disable interrupt and functions */
> > > +       for_each_set_bit(idx, pmu->used_counters, pmu->num_counters) {
> > > +               riscv_iommu_pmu_disable_counter(pmu, idx);
> > > +               riscv_iommu_pmu_disable_ovf_intr(pmu, idx);
> > > +       }
> > > +
> > > +       perf_pmu_unregister(&pmu->pmu);
> > > +}
> > > diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> > > index b1c4664542b4..92659a8a75ae 100644
> > > --- a/drivers/iommu/riscv/iommu.h
> > > +++ b/drivers/iommu/riscv/iommu.h
> > > @@ -60,11 +60,19 @@ struct riscv_iommu_device {
> > >         unsigned int ddt_mode;
> > >         dma_addr_t ddt_phys;
> > >         u64 *ddt_root;
> > > +
> > > +       /* hardware performance monitor */
> > > +       struct riscv_iommu_pmu pmu;
> > >  };
> > >
> > >  int riscv_iommu_init(struct riscv_iommu_device *iommu);
> > >  void riscv_iommu_remove(struct riscv_iommu_device *iommu);
> > >
> > > +int riscv_iommu_pmu_init(struct riscv_iommu_pmu *pmu, void __iomem *reg,
> > > +                        const char *name);
> > > +void riscv_iommu_pmu_uninit(struct riscv_iommu_pmu *pmu);
> > > +irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu);
> > > +
> > >  #define riscv_iommu_readl(iommu, addr) \
> > >         readl_relaxed((iommu)->reg + (addr))
> > >
> > > --
> > > 2.17.1
> > >
diff mbox series

Patch

diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
index f54c9ed17d41..d36625a1fd08 100644
--- a/drivers/iommu/riscv/Makefile
+++ b/drivers/iommu/riscv/Makefile
@@ -1,3 +1,3 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o
+obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o iommu-pmu.o
 obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o
diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
index 98daf0e1a306..60523449f016 100644
--- a/drivers/iommu/riscv/iommu-bits.h
+++ b/drivers/iommu/riscv/iommu-bits.h
@@ -17,6 +17,7 @@ 
 #include <linux/types.h>
 #include <linux/bitfield.h>
 #include <linux/bits.h>
+#include <linux/perf_event.h>
 
 /*
  * Chapter 5: Memory Mapped register interface
@@ -207,6 +208,7 @@  enum riscv_iommu_ddtp_modes {
 /* 5.22 Performance monitoring event counters (31 * 64bits) */
 #define RISCV_IOMMU_REG_IOHPMCTR_BASE	0x0068
 #define RISCV_IOMMU_REG_IOHPMCTR(_n)	(RISCV_IOMMU_REG_IOHPMCTR_BASE + ((_n) * 0x8))
+#define RISCV_IOMMU_IOHPMCTR_COUNTER	GENMASK_ULL(63, 0)
 
 /* 5.23 Performance monitoring event selectors (31 * 64bits) */
 #define RISCV_IOMMU_REG_IOHPMEVT_BASE	0x0160
@@ -250,6 +252,20 @@  enum riscv_iommu_hpmevent_id {
 	RISCV_IOMMU_HPMEVENT_MAX        = 9
 };
 
+/* Use maximum event ID for cycle event */
+#define RISCV_IOMMU_HPMEVENT_CYCLE	GENMASK_ULL(14, 0)
+
+#define RISCV_IOMMU_HPM_COUNTER_NUM	32
+
+/*
+ * Per-IOMMU hardware performance monitor state.
+ *
+ * NOTE(review): events[] and used_counters are sized with
+ * RISCV_IOMMU_IOHPMEVT_CNT + 1, while the driver initialises num_counters
+ * from RISCV_IOMMU_HPM_COUNTER_NUM; presumably both evaluate to 32 - confirm.
+ */
+struct riscv_iommu_pmu {
+	/* perf core handle; to_riscv_iommu_pmu() maps it back to this struct */
+	struct pmu pmu;
+	/* base of the memory-mapped IOMMU register file */
+	void __iomem *reg;
+	/* number of counter indices the driver manages (index 0 is iohpmcycles) */
+	int num_counters;
+	/* bit mask applied to counter values and used as the maximum period */
+	u64 mask_counter;
+	/* event currently bound to each counter index, NULL when unbound */
+	struct perf_event *events[RISCV_IOMMU_IOHPMEVT_CNT + 1];
+	/* one bit per counter index, set while the counter is allocated */
+	DECLARE_BITMAP(used_counters, RISCV_IOMMU_IOHPMEVT_CNT + 1);
+};
+
 /* 5.24 Translation request IOVA (64bits) */
 #define RISCV_IOMMU_REG_TR_REQ_IOVA     0x0258
 #define RISCV_IOMMU_TR_REQ_IOVA_VPN	GENMASK_ULL(63, 12)
diff --git a/drivers/iommu/riscv/iommu-pmu.c b/drivers/iommu/riscv/iommu-pmu.c
new file mode 100644
index 000000000000..74eb1525cd32
--- /dev/null
+++ b/drivers/iommu/riscv/iommu-pmu.c
@@ -0,0 +1,486 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 SiFive
+ *
+ * Authors
+ *	Zong Li <zong.li@sifive.com>
+ */
+
+#include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+
+#include "iommu.h"
+#include "iommu-bits.h"
+
+/* Map the struct pmu embedded in a riscv_iommu_pmu back to its container. */
+#define to_riscv_iommu_pmu(p) (container_of(p, struct riscv_iommu_pmu, pmu))
+
+/*
+ * Generate get_<name>(event), which extracts the bit-field selected by
+ * @_mask from the event's attr.config.
+ *
+ * The last line of the macro body deliberately has no line continuation, so
+ * the definition ends with the closing brace instead of swallowing whatever
+ * line happens to follow it.
+ */
+#define RISCV_IOMMU_PMU_ATTR_EXTRACTOR(_name, _mask)			\
+	static inline u32 get_##_name(struct perf_event *event)		\
+	{								\
+		return FIELD_GET(_mask, event->attr.config);		\
+	}
+
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(event, RISCV_IOMMU_IOHPMEVT_EVENTID);
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(partial_matching, RISCV_IOMMU_IOHPMEVT_DMASK);
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(pid_pscid, RISCV_IOMMU_IOHPMEVT_PID_PSCID);
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(did_gscid, RISCV_IOMMU_IOHPMEVT_DID_GSCID);
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_pid_pscid, RISCV_IOMMU_IOHPMEVT_PV_PSCV);
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_did_gscid, RISCV_IOMMU_IOHPMEVT_DV_GSCV);
+RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_id_type, RISCV_IOMMU_IOHPMEVT_IDT);
+
+/* Formats */
+/*
+ * One format attribute per field of the event configuration. Each string
+ * names the attr.config bit range the field occupies.
+ */
+PMU_FORMAT_ATTR(event, "config:0-14");
+PMU_FORMAT_ATTR(partial_matching, "config:15");
+PMU_FORMAT_ATTR(pid_pscid, "config:16-35");
+PMU_FORMAT_ATTR(did_gscid, "config:36-59");
+PMU_FORMAT_ATTR(filter_pid_pscid, "config:60");
+PMU_FORMAT_ATTR(filter_did_gscid, "config:61");
+PMU_FORMAT_ATTR(filter_id_type, "config:62");
+
+/* NULL-terminated list of the format attributes declared above. */
+static struct attribute *riscv_iommu_pmu_formats[] = {
+	&format_attr_event.attr,
+	&format_attr_partial_matching.attr,
+	&format_attr_pid_pscid.attr,
+	&format_attr_did_gscid.attr,
+	&format_attr_filter_pid_pscid.attr,
+	&format_attr_filter_did_gscid.attr,
+	&format_attr_filter_id_type.attr,
+	NULL,
+};
+
+/* Attribute group published under the name "format". */
+static const struct attribute_group riscv_iommu_pmu_format_group = {
+	.name = "format",
+	.attrs = riscv_iommu_pmu_formats,
+};
+
+/* Events */
+/*
+ * sysfs show callback shared by all event attributes.
+ *
+ * Prints the event ID stored in the enclosing perf_pmu_events_attr as
+ * "event=0x<id>\n" and returns the number of bytes written.
+ */
+static ssize_t riscv_iommu_pmu_event_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *page)
+{
+	struct perf_pmu_events_attr *pmu_attr;
+
+	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+
+	/* sysfs_emit() is the bounded helper intended for show() callbacks. */
+	return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
+/*
+ * One event attribute per selectable event ID; all of them are rendered by
+ * riscv_iommu_pmu_event_show().
+ */
+PMU_EVENT_ATTR(cycle, event_attr_cycle,
+	       RISCV_IOMMU_HPMEVENT_CYCLE, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(dont_count, event_attr_dont_count,
+	       RISCV_IOMMU_HPMEVENT_INVALID, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(untranslated_req, event_attr_untranslated_req,
+	       RISCV_IOMMU_HPMEVENT_URQ, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(translated_req, event_attr_translated_req,
+	       RISCV_IOMMU_HPMEVENT_TRQ, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(ats_trans_req, event_attr_ats_trans_req,
+	       RISCV_IOMMU_HPMEVENT_ATS_RQ, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(tlb_miss, event_attr_tlb_miss,
+	       RISCV_IOMMU_HPMEVENT_TLB_MISS, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(ddt_walks, event_attr_ddt_walks,
+	       RISCV_IOMMU_HPMEVENT_DD_WALK, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(pdt_walks, event_attr_pdt_walks,
+	       RISCV_IOMMU_HPMEVENT_PD_WALK, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(s_vs_pt_walks, event_attr_s_vs_pt_walks,
+	       RISCV_IOMMU_HPMEVENT_S_VS_WALKS, riscv_iommu_pmu_event_show);
+PMU_EVENT_ATTR(g_pt_walks, event_attr_g_pt_walks,
+	       RISCV_IOMMU_HPMEVENT_G_WALKS, riscv_iommu_pmu_event_show);
+
+/* NULL-terminated list of the event attributes declared above. */
+static struct attribute *riscv_iommu_pmu_events[] = {
+	&event_attr_cycle.attr.attr,
+	&event_attr_dont_count.attr.attr,
+	&event_attr_untranslated_req.attr.attr,
+	&event_attr_translated_req.attr.attr,
+	&event_attr_ats_trans_req.attr.attr,
+	&event_attr_tlb_miss.attr.attr,
+	&event_attr_ddt_walks.attr.attr,
+	&event_attr_pdt_walks.attr.attr,
+	&event_attr_s_vs_pt_walks.attr.attr,
+	&event_attr_g_pt_walks.attr.attr,
+	NULL,
+};
+
+/* Attribute group published under the name "events". */
+static const struct attribute_group riscv_iommu_pmu_events_group = {
+	.name = "events",
+	.attrs = riscv_iommu_pmu_events,
+};
+
+/* NULL-terminated set of attribute groups handed to the PMU via .attr_groups. */
+static const struct attribute_group *riscv_iommu_pmu_attr_grps[] = {
+	&riscv_iommu_pmu_format_group,
+	&riscv_iommu_pmu_events_group,
+	NULL,
+};
+
+/* PMU Operations */
+/*
+ * Write @value into counter @idx.
+ *
+ * Counters are addressed at an 8-byte stride starting from the iohpmcycles
+ * register, so index 0 is iohpmcycles itself. An out-of-range @idx triggers
+ * a one-time warning and is ignored.
+ */
+static void riscv_iommu_pmu_set_counter(struct riscv_iommu_pmu *pmu, u32 idx,
+					u64 value)
+{
+	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
+
+	/*
+	 * idx is unsigned, so only the upper bound needs checking; valid
+	 * indices are 0 .. num_counters - 1.
+	 */
+	if (WARN_ON_ONCE(idx >= pmu->num_counters))
+		return;
+
+	/* iohpmcycles shares its register with the OF bit: keep OF as it is. */
+	if (idx == 0)
+		value = (value & ~RISCV_IOMMU_IOHPMCYCLES_OF) |
+			 (readq(addr) & RISCV_IOMMU_IOHPMCYCLES_OF);
+
+	writeq(FIELD_PREP(RISCV_IOMMU_IOHPMCTR_COUNTER, value), addr + idx * 8);
+}
+
+/*
+ * Read the current value of counter @idx.
+ *
+ * Returns the counter field only (the OF bit of iohpmcycles is stripped).
+ * An out-of-range @idx triggers a one-time warning and yields 0, matching
+ * riscv_iommu_pmu_get_event(); a negative errno cannot be represented in
+ * the unsigned return type and would be read as a huge count.
+ */
+static u64 riscv_iommu_pmu_get_counter(struct riscv_iommu_pmu *pmu, u32 idx)
+{
+	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
+	u64 value;
+
+	/*
+	 * idx is unsigned, so only the upper bound needs checking; valid
+	 * indices are 0 .. num_counters - 1.
+	 */
+	if (WARN_ON_ONCE(idx >= pmu->num_counters))
+		return 0;
+
+	value = readq(addr + idx * 8);
+
+	if (idx == 0)
+		return FIELD_GET(RISCV_IOMMU_IOHPMCYCLES_COUNTER, value);
+
+	return FIELD_GET(RISCV_IOMMU_IOHPMCTR_COUNTER, value);
+}
+
+/*
+ * Read the event selector register that belongs to counter @idx.
+ *
+ * Returns 0 for index 0 (iohpmcycles has no selector) and, after a one-time
+ * warning, for an out-of-range @idx.
+ */
+static u64 riscv_iommu_pmu_get_event(struct riscv_iommu_pmu *pmu, u32 idx)
+{
+	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE;
+
+	/*
+	 * idx is unsigned, so only the upper bound needs checking; valid
+	 * indices are 0 .. num_counters - 1.
+	 */
+	if (WARN_ON_ONCE(idx >= pmu->num_counters))
+		return 0;
+
+	/* There is no associated IOHPMEVT0 for IOHPMCYCLES */
+	if (idx == 0)
+		return 0;
+
+	/* Selector registers start at counter index 1, hence the -1. */
+	return readq(addr + (idx - 1) * 8);
+}
+
+/*
+ * Write @value into the event selector register of counter @idx.
+ *
+ * Does nothing for index 0 (iohpmcycles has no selector) and, after a
+ * one-time warning, for an out-of-range @idx.
+ */
+static void riscv_iommu_pmu_set_event(struct riscv_iommu_pmu *pmu, u32 idx,
+				      u64 value)
+{
+	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE;
+
+	/*
+	 * idx is unsigned, so only the upper bound needs checking; valid
+	 * indices are 0 .. num_counters - 1.
+	 */
+	if (WARN_ON_ONCE(idx >= pmu->num_counters))
+		return;
+
+	/* There is no associated IOHPMEVT0 for IOHPMCYCLES */
+	if (idx == 0)
+		return;
+
+	/* Selector registers start at counter index 1, hence the -1. */
+	writeq(value, addr + (idx - 1) * 8);
+}
+
+/* Clear bit @idx of IOCOUNTINH with a read-modify-write. */
+static void riscv_iommu_pmu_enable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
+{
+	void __iomem *inhibit = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
+	u32 bits;
+
+	bits = readl(inhibit);
+	bits &= ~BIT(idx);
+	writel(bits, inhibit);
+}
+
+/* Set bit @idx of IOCOUNTINH with a read-modify-write. */
+static void riscv_iommu_pmu_disable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
+{
+	void __iomem *inhibit = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
+	u32 bits;
+
+	bits = readl(inhibit);
+	bits |= BIT(idx);
+	writel(bits, inhibit);
+}
+
+/*
+ * Clear the OF bit for counter @idx.
+ *
+ * For the cycle event the bit lives in iohpmcycles, next to the count; for
+ * every other event it lives in the counter's event selector register.
+ * NOTE(review): presumably a clear OF bit is what lets the counter raise an
+ * overflow interrupt - confirm against the IOMMU specification.
+ */
+static void riscv_iommu_pmu_enable_ovf_intr(struct riscv_iommu_pmu *pmu, u32 idx)
+{
+	void __iomem *target;
+	u64 regval;
+
+	if (get_event(pmu->events[idx]) == RISCV_IOMMU_HPMEVENT_CYCLE) {
+		target = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
+		regval = riscv_iommu_pmu_get_counter(pmu, idx);
+		regval &= ~RISCV_IOMMU_IOHPMCYCLES_OF;
+	} else {
+		target = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE + (idx - 1) * 8;
+		regval = riscv_iommu_pmu_get_event(pmu, idx);
+		regval &= ~RISCV_IOMMU_IOHPMEVT_OF;
+	}
+
+	writeq(regval, target);
+}
+
+/*
+ * Set the OF bit for counter @idx.
+ *
+ * For the cycle event the bit lives in iohpmcycles, next to the count; for
+ * every other event it lives in the counter's event selector register.
+ * NOTE(review): presumably a set OF bit keeps the counter from raising an
+ * overflow interrupt - confirm against the IOMMU specification.
+ */
+static void riscv_iommu_pmu_disable_ovf_intr(struct riscv_iommu_pmu *pmu, u32 idx)
+{
+	void __iomem *target;
+	u64 regval;
+
+	if (get_event(pmu->events[idx]) == RISCV_IOMMU_HPMEVENT_CYCLE) {
+		target = pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
+		regval = riscv_iommu_pmu_get_counter(pmu, idx);
+		regval |= RISCV_IOMMU_IOHPMCYCLES_OF;
+	} else {
+		target = pmu->reg + RISCV_IOMMU_REG_IOHPMEVT_BASE + (idx - 1) * 8;
+		regval = riscv_iommu_pmu_get_event(pmu, idx);
+		regval |= RISCV_IOMMU_IOHPMEVT_OF;
+	}
+
+	writeq(regval, target);
+}
+
+/*
+ * Re-arm the overflow interrupt and un-inhibit every allocated counter
+ * whose event is currently running.
+ *
+ * Counters whose event is marked PERF_HES_STOPPED are left inhibited. That
+ * covers events the overflow handler has just stopped and events that were
+ * added without being started; blindly re-enabling them would let the
+ * hardware count behind perf's back.
+ */
+static void riscv_iommu_pmu_start_all(struct riscv_iommu_pmu *pmu)
+{
+	int idx;
+
+	for_each_set_bit(idx, pmu->used_counters, pmu->num_counters) {
+		struct perf_event *event = pmu->events[idx];
+
+		/* The helper below dereferences the event: skip empty slots. */
+		if (!event || (event->hw.state & PERF_HES_STOPPED))
+			continue;
+
+		riscv_iommu_pmu_enable_ovf_intr(pmu, idx);
+		riscv_iommu_pmu_enable_counter(pmu, idx);
+	}
+}
+
+/*
+ * Set bits [num_counters - 1:0] of IOCOUNTINH with a single write.
+ *
+ * This only touches the hardware register; the perf state of the bound
+ * events is not updated here.
+ * NOTE(review): the 64-bit GENMASK_ULL() value is passed to the 32-bit
+ * writel(); that looks fine as long as num_counters <= 32 - confirm.
+ */
+static void riscv_iommu_pmu_stop_all(struct riscv_iommu_pmu *pmu)
+{
+	writel(GENMASK_ULL(pmu->num_counters - 1, 0),
+	       pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH);
+}
+
+/* PMU APIs */
+/*
+ * Program the event's counter so that it overflows after the remaining
+ * part of the current sample period.
+ *
+ * Returns 1 when a new period was started (period_left had run out),
+ * 0 otherwise.
+ */
+static int riscv_iommu_pmu_set_period(struct perf_event *event)
+{
+	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	s64 left = local64_read(&hwc->period_left);
+	s64 period = hwc->sample_period;
+	/*
+	 * Counter index 0 is iohpmcycles, whose register also carries the OF
+	 * bit, so its count field is narrower than that of the other
+	 * counters. Use the width of the counter that is actually programmed;
+	 * otherwise prev_count disagrees with the value the hardware holds.
+	 */
+	u64 max_period = hwc->idx == 0 ? RISCV_IOMMU_IOHPMCYCLES_COUNTER :
+					 pmu->mask_counter;
+	u64 start;
+	int ret = 0;
+
+	/* More than one full period behind: restart with a fresh period. */
+	if (unlikely(left <= -period)) {
+		left = period;
+		local64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	/* Period expired: carry the overshoot into the next period. */
+	if (unlikely(left <= 0)) {
+		left += period;
+		local64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	/*
+	 * Limit the maximum period to prevent the counter value
+	 * from overtaking the one we are about to program. In
+	 * effect we are reducing max_period to account for
+	 * interrupt latency (and we are being very conservative).
+	 */
+	if (left > (max_period >> 1))
+		left = (max_period >> 1);
+
+	/* Counting up from -left makes the counter wrap after 'left' events. */
+	start = (u64)(-left) & max_period;
+
+	local64_set(&hwc->prev_count, start);
+	riscv_iommu_pmu_set_counter(pmu, hwc->idx, start);
+	perf_event_update_userpage(event);
+
+	return ret;
+}
+
+/*
+ * perf ->event_init callback.
+ *
+ * Returns -ENOENT when the event was not created for this PMU, so that the
+ * perf core can offer it to another PMU instead of this driver claiming
+ * it; returns 0 once the hardware state of the event has been initialised.
+ */
+static int riscv_iommu_pmu_event_init(struct perf_event *event)
+{
+	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	/* Only accept events that were opened against this PMU's type. */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* No counter is assigned until ->add() runs. */
+	hwc->idx = -1;
+	hwc->config = event->attr.config;
+
+	if (!is_sampling_event(event)) {
+		/*
+		 * For non-sampling runs, limit the sample_period to half
+		 * of the counter width. That way, the new counter value
+		 * is far less likely to overtake the previous one unless
+		 * you have some serious IRQ latency issues.
+		 */
+		hwc->sample_period = pmu->mask_counter >> 1;
+		hwc->last_period = hwc->sample_period;
+		local64_set(&hwc->period_left, hwc->sample_period);
+	}
+
+	return 0;
+}
+
+/*
+ * Fold the events counted since the last update into event->count and
+ * subtract them from the remaining sample period.
+ */
+static void riscv_iommu_pmu_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+	u64 delta, prev, now, mask;
+	u32 idx = hwc->idx;
+
+	/* Retry until prev_count is swapped without a concurrent update. */
+	do {
+		prev = local64_read(&hwc->prev_count);
+		now = riscv_iommu_pmu_get_counter(pmu, idx);
+	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
+
+	/*
+	 * Counter index 0 is iohpmcycles, whose register also carries the OF
+	 * bit, so its count field is narrower than that of the other
+	 * counters. Wrap the difference at the width of the counter that was
+	 * read; otherwise a wrapped cycle count yields a bogus huge delta.
+	 */
+	mask = idx == 0 ? RISCV_IOMMU_IOHPMCYCLES_COUNTER : pmu->mask_counter;
+	delta = (now - prev) & mask;
+
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
+}
+
+/*
+ * perf ->start callback: program the period, the event selector and the
+ * overflow bit of the event's counter, then clear its inhibit bit.
+ *
+ * Returns early, with a one-time warning, if the event is not currently
+ * marked PERF_HES_STOPPED.
+ */
+static void riscv_iommu_pmu_start(struct perf_event *event, int flags)
+{
+	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
+	/* A reload is only expected while the saved count is up to date. */
+	if (flags & PERF_EF_RELOAD)
+		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
+	hwc->state = 0;
+	/* The counter is fully programmed before its inhibit bit is cleared. */
+	riscv_iommu_pmu_set_period(event);
+	riscv_iommu_pmu_set_event(pmu, hwc->idx, hwc->config);
+	riscv_iommu_pmu_enable_ovf_intr(pmu, hwc->idx);
+	riscv_iommu_pmu_enable_counter(pmu, hwc->idx);
+
+	perf_event_update_userpage(event);
+}
+
+/*
+ * perf ->stop callback: write the invalid event ID into the counter's event
+ * selector, set its inhibit bit and, when PERF_EF_UPDATE is requested,
+ * fold the final count into the event.
+ *
+ * Does nothing if the event is already marked PERF_HES_STOPPED.
+ */
+static void riscv_iommu_pmu_stop(struct perf_event *event, int flags)
+{
+	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->state & PERF_HES_STOPPED)
+		return;
+
+	riscv_iommu_pmu_set_event(pmu, hwc->idx, RISCV_IOMMU_HPMEVENT_INVALID);
+	riscv_iommu_pmu_disable_counter(pmu, hwc->idx);
+
+	/* Read the counter only after it has been inhibited. */
+	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE))
+		riscv_iommu_pmu_update(event);
+
+	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+}
+
+/*
+ * perf ->add callback: bind the event to a free hardware counter and start
+ * it when PERF_EF_START is set.
+ *
+ * The cycle event can only live on counter index 0 (iohpmcycles); every
+ * other event takes the first free index from 1 upwards.
+ *
+ * Returns 0 on success or -EAGAIN when no suitable counter is free.
+ */
+static int riscv_iommu_pmu_add(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+	unsigned int num_counters = pmu->num_counters;
+	int idx;
+
+	if (get_event(event) == RISCV_IOMMU_HPMEVENT_CYCLE) {
+		/*
+		 * Index zero is reserved for iohpmcycles. Refuse a second
+		 * cycle event instead of silently taking over the slot of
+		 * the one that is already bound.
+		 */
+		if (test_bit(0, pmu->used_counters))
+			return -EAGAIN;
+		idx = 0;
+	} else {
+		idx = find_next_zero_bit(pmu->used_counters, num_counters, 1);
+		if (idx >= num_counters)
+			return -EAGAIN;
+	}
+
+	set_bit(idx, pmu->used_counters);
+
+	pmu->events[idx] = event;
+	hwc->idx = idx;
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	if (flags & PERF_EF_START)
+		riscv_iommu_pmu_start(event, flags);
+
+	/* Propagate changes to the userspace mapping. */
+	perf_event_update_userpage(event);
+
+	return 0;
+}
+
+/* perf ->read callback: refresh event->count from the hardware counter. */
+static void riscv_iommu_pmu_read(struct perf_event *event)
+{
+	riscv_iommu_pmu_update(event);
+}
+
+/*
+ * perf ->del callback: stop the event, folding in its final count, and
+ * release the counter it was bound to.
+ *
+ * @flags is not consulted; the event is always stopped with PERF_EF_UPDATE.
+ */
+static void riscv_iommu_pmu_del(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
+	int idx = hwc->idx;
+
+	riscv_iommu_pmu_stop(event, PERF_EF_UPDATE);
+	/* Unbind the event before the counter index is marked free again. */
+	pmu->events[idx] = NULL;
+	clear_bit(idx, pmu->used_counters);
+	perf_event_update_userpage(event);
+}
+
+/*
+ * Counter overflow handler.
+ *
+ * Inhibits all counters, then for every sampling event whose overflow bit
+ * is set in IOCOUNTOVF: updates the count, re-arms the period and reports
+ * the overflow to perf. Counters are restarted before returning.
+ *
+ * Returns IRQ_NONE when no overflow bit is set, IRQ_HANDLED otherwise.
+ */
+irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu)
+{
+	struct perf_sample_data data;
+	struct pt_regs *regs;
+	/*
+	 * The bitmap helpers operate on unsigned long words. Keep the 32-bit
+	 * register value in one rather than casting the address of a u32,
+	 * which would make them read past the object on 64-bit kernels.
+	 */
+	unsigned long ovf = readl(pmu->reg + RISCV_IOMMU_REG_IOCOUNTOVF);
+	int idx;
+
+	if (!ovf)
+		return IRQ_NONE;
+
+	riscv_iommu_pmu_stop_all(pmu);
+
+	regs = get_irq_regs();
+
+	for_each_set_bit(idx, &ovf, pmu->num_counters) {
+		struct perf_event *event = pmu->events[idx];
+		struct hw_perf_event *hwc;
+
+		if (WARN_ON_ONCE(!event) || !is_sampling_event(event))
+			continue;
+
+		hwc = &event->hw;
+
+		riscv_iommu_pmu_update(event);
+		perf_sample_data_init(&data, 0, hwc->last_period);
+		/* Nothing to report unless a new period was started. */
+		if (!riscv_iommu_pmu_set_period(event))
+			continue;
+
+		/* A non-zero return asks for the event to be stopped. */
+		if (perf_event_overflow(event, &data, regs))
+			riscv_iommu_pmu_stop(event, 0);
+	}
+
+	riscv_iommu_pmu_start_all(pmu);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Initialise @pmu for the register file at @reg and register it with perf
+ * under the name "riscv_iommu_pmu_<dev_name>".
+ *
+ * Returns 0 on success, -ENOMEM if the name cannot be allocated, or the
+ * error reported by perf_pmu_register(). On success the name string is
+ * handed to perf_pmu_register() and is not freed here.
+ */
+int riscv_iommu_pmu_init(struct riscv_iommu_pmu *pmu, void __iomem *reg,
+			 const char *dev_name)
+{
+	char *name;
+	int ret;
+
+	pmu->reg = reg;
+	pmu->num_counters = RISCV_IOMMU_HPM_COUNTER_NUM;
+	pmu->mask_counter = RISCV_IOMMU_IOHPMCTR_COUNTER;
+
+	pmu->pmu = (struct pmu) {
+		.task_ctx_nr	= perf_invalid_context,
+		.event_init	= riscv_iommu_pmu_event_init,
+		.add		= riscv_iommu_pmu_add,
+		.del		= riscv_iommu_pmu_del,
+		.start		= riscv_iommu_pmu_start,
+		.stop		= riscv_iommu_pmu_stop,
+		.read		= riscv_iommu_pmu_read,
+		.attr_groups	= riscv_iommu_pmu_attr_grps,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
+		.module		= THIS_MODULE,
+	};
+
+	name = kasprintf(GFP_KERNEL, "riscv_iommu_pmu_%s", dev_name);
+	if (!name)
+		return -ENOMEM;
+
+	/*
+	 * Stop all counters and later start the counter with perf. Do this
+	 * before registering: once the PMU is visible, events may be started
+	 * at any time and must not be inhibited again behind perf's back.
+	 */
+	riscv_iommu_pmu_stop_all(pmu);
+
+	ret = perf_pmu_register(&pmu->pmu, name, -1);
+	if (ret) {
+		pr_err("Failed to register riscv_iommu_pmu_%s: %d\n",
+		       dev_name, ret);
+		/* Registration failed, so nothing else refers to the name. */
+		kfree(name);
+		return ret;
+	}
+
+	pr_info("riscv_iommu_pmu_%s: Registered with %d counters\n",
+		dev_name, pmu->num_counters);
+
+	return 0;
+}
+
+/*
+ * Tear down a PMU that was successfully set up by riscv_iommu_pmu_init():
+ * quiesce every allocated counter, unregister from perf and release the
+ * PMU name allocated at init time.
+ */
+void riscv_iommu_pmu_uninit(struct riscv_iommu_pmu *pmu)
+{
+	int idx;
+
+	/* Disable interrupt and functions */
+	for_each_set_bit(idx, pmu->used_counters, pmu->num_counters) {
+		riscv_iommu_pmu_disable_counter(pmu, idx);
+		riscv_iommu_pmu_disable_ovf_intr(pmu, idx);
+	}
+
+	perf_pmu_unregister(&pmu->pmu);
+
+	/*
+	 * The name was allocated with kasprintf() at init time and is owned
+	 * by this driver; free it once perf no longer uses the PMU.
+	 */
+	kfree(pmu->pmu.name);
+	pmu->pmu.name = NULL;
+}
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index b1c4664542b4..92659a8a75ae 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -60,11 +60,19 @@  struct riscv_iommu_device {
 	unsigned int ddt_mode;
 	dma_addr_t ddt_phys;
 	u64 *ddt_root;
+
+	/* hardware performance monitor */
+	struct riscv_iommu_pmu pmu;
 };
 
 int riscv_iommu_init(struct riscv_iommu_device *iommu);
 void riscv_iommu_remove(struct riscv_iommu_device *iommu);
 
+int riscv_iommu_pmu_init(struct riscv_iommu_pmu *pmu, void __iomem *reg,
+			 const char *name);
+void riscv_iommu_pmu_uninit(struct riscv_iommu_pmu *pmu);
+irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu);
+
 #define riscv_iommu_readl(iommu, addr) \
 	readl_relaxed((iommu)->reg + (addr))