[5/6] perf: Add ARM SMMU PMU driver

Message ID 041f45146d77b49b488bfa37926f96a319cb43d3.1645106346.git.robin.murphy@arm.com (mailing list archive)
State New, archived
Series perf: Arm SMMU PMU driver

Commit Message

Robin Murphy Feb. 17, 2022, 2:24 p.m. UTC
Add a basic driver for the SMMUv2 Performance Monitors Extension. This
exposes the architecturally-defined events, along with a fairly
low-level filtering interface based on the relevant register fields.

The relationship between Stream ID Groups and Counter Groups, i.e. which
subsets of events may be visible to which sets of counters, is entirely
implementation-defined and non-discoverable, and defining firmware
bindings to describe the mappings would be considerable work for very
little benefit. Thus we expect the user to have the relevant
implementation knowledge and to explicitly specify the appropriate
counter group for each desired event.
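
For illustration, a hypothetical invocation might look like the one
below (the PMU instance name, counter group numbering, and which events
each group can actually count are all implementation-specific). Since
per-task contexts are rejected, counting is system-wide:

  perf stat -a -e arm_smmu_0/access,cg=1/ -e arm_smmu_0/tlb_alloc,cg=1/ <workload>

The remaining format fields (tcefcfg, ndx, smr_id, smr_mask, nsu, nsp,
u and p) map directly onto the corresponding PMCGCR, PMCGSMR and
PMEVTYPER register fields for filtering.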

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
 drivers/perf/Kconfig        |  24 +-
 drivers/perf/Makefile       |   1 +
 drivers/perf/arm-smmu-pmu.c | 732 ++++++++++++++++++++++++++++++++++++
 3 files changed, 748 insertions(+), 9 deletions(-)
 create mode 100644 drivers/perf/arm-smmu-pmu.c

Patch

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index e1a0c44bc686..6ee07c392bdb 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -48,6 +48,15 @@  config ARM_CMN
 	  Support for PMU events monitoring on the Arm CMN-600 Coherent Mesh
 	  Network interconnect.
 
+config ARM_DSU_PMU
+	tristate "ARM DynamIQ Shared Unit (DSU) PMU"
+	depends on ARM64
+	  help
+	  Provides support for performance monitor unit in ARM DynamIQ Shared
+	  Unit (DSU). The DSU integrates one or more cores with an L3 memory
+	  system, control logic. The PMU allows counting various events related
+	  to DSU.
+
 config ARM_PMU
 	depends on ARM || ARM64
 	bool "ARM PMU framework"
@@ -60,6 +69,12 @@  config ARM_PMU_ACPI
 	depends on ARM_PMU && ACPI
 	def_bool y
 
+config ARM_SMMU_PMU
+	tristate "Arm SMMUv2 PMU support"
+	depends on ARM_SMMU
+	help
+	 Support for ARM SMMUv1/v2 performance monitors.
+
 config ARM_SMMU_V3_PMU
 	 tristate "ARM SMMUv3 Performance Monitors Extension"
 	 depends on (ARM64 && ACPI) || (COMPILE_TEST && 64BIT)
@@ -70,15 +85,6 @@  config ARM_SMMU_V3_PMU
 	   through the SMMU and allow the resulting information to be filtered
 	   based on the Stream ID of the corresponding master.
 
-config ARM_DSU_PMU
-	tristate "ARM DynamIQ Shared Unit (DSU) PMU"
-	depends on ARM64
-	  help
-	  Provides support for performance monitor unit in ARM DynamIQ Shared
-	  Unit (DSU). The DSU integrates one or more cores with an L3 memory
-	  system, control logic. The PMU allows counting various events related
-	  to DSU.
-
 config FSL_IMX8_DDR_PMU
 	tristate "Freescale i.MX8 DDR perf monitor"
 	depends on ARCH_MXC || COMPILE_TEST
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 2db5418d5b0a..a0a1483a6825 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -5,6 +5,7 @@  obj-$(CONFIG_ARM_CMN) += arm-cmn.o
 obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
+obj-$(CONFIG_ARM_SMMU_PMU) += arm-smmu-pmu.o
 obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o
 obj-$(CONFIG_FSL_IMX8_DDR_PMU) += fsl_imx8_ddr_perf.o
 obj-$(CONFIG_HISI_PMU) += hisilicon/
diff --git a/drivers/perf/arm-smmu-pmu.c b/drivers/perf/arm-smmu-pmu.c
new file mode 100644
index 000000000000..da2cf00a5569
--- /dev/null
+++ b/drivers/perf/arm-smmu-pmu.c
@@ -0,0 +1,732 @@ 
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2017-2022 Arm Ltd.
+// PMU driver for the Arm SMMU Performance Monitors Extension
+
+#define pr_fmt(fmt) "arm-smmu-pmu: " fmt
+
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_irq.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+
+#define ARM_SMMU_PMEVCTR(n)		(0x0000 + (n) * 4)
+#define ARM_SMMU_PMEVTYPER(n)		(0x0400 + (n) * 4)
+#define ARM_SMMU_PMCGCR(g)		(0x0800 + (g) * 4)
+#define ARM_SMMU_PMCGSMR(g)		(0x0A00 + (g) * 4)
+#define ARM_SMMU_PMCNTENSET(n)		(0x0C00 + (n) / 32 * 4)
+#define ARM_SMMU_PMCNTENCLR(n)		(0x0C20 + (n) / 32 * 4)
+#define ARM_SMMU_PMINTENSET(n)		(0x0C40 + (n) / 32 * 4)
+#define ARM_SMMU_PMINTENCLR(n)		(0x0C60 + (n) / 32 * 4)
+#define ARM_SMMU_PMOVSCLR(n)		(0x0C80 + (n) / 32 * 4)
+#define ARM_SMMU_PMCFGR			0x0e00
+#define ARM_SMMU_PMCR			0x0e04
+#define ARM_SMMU_PMCEID0		0x0e20
+
+#define ARM_SMMU_EVENT_BIT(n)		BIT((n) % 32)
+
+#define ARM_SMMU_PMEVTYPER_EVENT	GENMASK(15, 0)
+#define ARM_SMMU_PMEVTYPER_NSU		BIT(28)
+#define ARM_SMMU_PMEVTYPER_NSP		BIT(29)
+#define ARM_SMMU_PMEVTYPER_U		BIT(30)
+#define ARM_SMMU_PMEVTYPER_P		BIT(31)
+
+#define ARM_SMMU_PMCGCR_NDX		GENMASK(7, 0)
+#define ARM_SMMU_PMCGCR_TCEFCFG		GENMASK(9, 8)
+#define ARM_SMMU_PMCGCR_E		BIT(11)
+#define ARM_SMMU_PMCGCR_SIDG		GENMASK(22, 16)
+#define ARM_SMMU_PMCGCR_CGNC		GENMASK(27, 24)
+
+#define ARM_SMMU_PMCFGR_N		GENMASK(7, 0)
+#define ARM_SMMU_PMCFGR_NCG		GENMASK(31, 24)
+
+#define ARM_SMMU_PMCGSMR_ID		GENMASK(15, 0)
+#define ARM_SMMU_PMCGSMR_MASK		GENMASK(31, 16)
+
+#define ARM_SMMU_PMCR_E			BIT(0)
+#define ARM_SMMU_PMCR_P			BIT(1)
+
+#define ARM_SMMU_PMU_MAX_COUNTERS	256
+
+/*
+ * For system PMUs where sampling events are irrelevant, the often-copied
+ * vague comments about "interrupt latency" accompanying this counter logic
+ * are misleading nonsense; here's the real deal...
+ *
+ * Without a sample period to worry about overrunning, this boils down to a
+ * trick to make reading counter values easier. Starting an n-bit counter
+ * at (2^(n-1)) effectively gives us an (n-1)-bit counter plus an overflow
+ * bit which can be read atomically together; the real trick at play,
+ * though, is that it puts logical overflow (where we lose information)
+ * half a period out-of-phase with arithmetic overflow (where we get an
+ * interrupt). By virtue of two's complement, reads can always compute the
+ * delta from the full counter value as ((new - prev) % (2^n)) regardless
+ * of wraparound, while the job of the overflow interrupt becomes that of
+ * maintaining the out-of-phase relationship; once it fires, the handler
+ * has the remaining half-period in which to accumulate any residual delta
+ * since the last read and restart the counter from the half-maximum value.
+ * In terms of simply reading counter values, this offers no more tolerance
+ * of latency than simply adding a full period to the count once per period,
+ * but means there is no race if, say, an interrupt is left pending as the
+ * counter is stopped. The result is that we avoid accessing the overflow
+ * register (beyond clearing interrupts), and indeed any explicit overflow
+ * arithmetic at all, at the cost of taking interrupts up to twice as often
+ * as we otherwise might, and writing to the counter register each time.
+ */
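+/*
+ * To illustrate the above with made-up numbers: if the last read left
+ * prev = 0xfffffff0 and the counter has since wrapped round to
+ * new = 0x00000010, then the 32-bit subtraction (new - prev) still
+ * yields 0x20, i.e. 32 events, with no explicit overflow handling.
+ */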
+#define ARM_SMMU_PMU_COUNTER_INIT_VAL	0x80000000
+
+static int arm_smmu_hp_state;
+
+struct arm_smmu_pmu_cg {
+	struct arm_smmu_pmu *pmu;
+	int irq;
+	u16 first;
+	u16 last;
+};
+
+struct arm_smmu_pmu {
+	void __iomem *base;
+	u16 num_counters;
+	u16 num_cgs;
+	u32 events;
+	struct perf_event **counters;
+	struct arm_smmu_pmu_cg *cgs;
+	struct pmu pmu;
+	struct hlist_node cpuhp_node;
+	int cpu;
+};
+
+#define to_smmu_pmu(x)	container_of(x, struct arm_smmu_pmu, pmu)
+
+struct arm_smmu_format_attr {
+	struct device_attribute attr;
+	u8 cfg;
+	u8 lsb;
+	u8 msb;
+};
+
+#define ARM_SMMU_FORMAT_ATTR(_name, _cfgoff, _mask)			\
+	(&((struct arm_smmu_format_attr[]) {{				\
+		.attr = __ATTR(_name, 0444, arm_smmu_format_show, NULL),\
+		.cfg = _cfgoff / 64,					\
+		.lsb = __builtin_ctz(_mask) + _cfgoff % 64,		\
+		.msb = (31 - __builtin_clz(_mask)) + _cfgoff % 64,	\
+	}})[0].attr.attr)
+
+static ssize_t arm_smmu_format_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct arm_smmu_format_attr *fattr;
+	int n;
+
+	fattr = container_of(attr, typeof(*fattr), attr);
+
+	n = sysfs_emit(buf, "config");
+	if (fattr->cfg > 0)
+		n += sysfs_emit_at(buf, n, "%u", fattr->cfg);
+	n += sysfs_emit_at(buf, n, ":%u", fattr->lsb);
+	if (fattr->msb > fattr->lsb)
+		n += sysfs_emit_at(buf, n, "-%u", fattr->msb);
+
+	return n + sysfs_emit_at(buf, n, "\n");
+}
+
+#define ARM_SMMU_EVENT_ATTR(_name, _var)				\
+	(&((struct dev_ext_attribute[]) {{				\
+		.attr = __ATTR(_name, 0444, arm_smmu_event_show, NULL),	\
+		.var = (void *)_var,					\
+	}})[0].attr.attr)
+
+static ssize_t arm_smmu_event_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct dev_ext_attribute *eattr;
+
+	eattr = container_of(attr, typeof(*eattr), attr);
+	return sysfs_emit(buf, "event=0x%lx\n", (unsigned long)eattr->var);
+}
+
+static bool arm_smmu_event_supported(struct arm_smmu_pmu *pmu, u16 event)
+{
+	/* We can't validate IMP-DEF events; assume they're OK */
+	if (event >= 128)
+		return true;
+	/* Otherwise, check PMCEID0 (nothing is defined in PMCEID1) */
+	return (event < 32) && (pmu->events & (1U << event));
+}
+
+static umode_t arm_smmu_event_attr_is_visible(struct kobject *kobj,
+					      struct attribute *attr,
+					      int unused)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct arm_smmu_pmu *smmu_pmu = to_smmu_pmu(dev_get_drvdata(dev));
+	struct dev_ext_attribute *eattr;
+
+	eattr = container_of(attr, typeof(*eattr), attr.attr);
+	if (arm_smmu_event_supported(smmu_pmu, (unsigned long)eattr->var))
+		return attr->mode;
+
+	return 0;
+}
+
+static struct attribute *arm_smmu_event_attrs[] = {
+	ARM_SMMU_EVENT_ATTR(cycles,		0x00),
+	ARM_SMMU_EVENT_ATTR(cycles_div64,	0x01),
+	ARM_SMMU_EVENT_ATTR(tlb_alloc,		0x08),
+	ARM_SMMU_EVENT_ATTR(tlb_alloc_r,	0x09),
+	ARM_SMMU_EVENT_ATTR(tlb_alloc_w,	0x0a),
+	ARM_SMMU_EVENT_ATTR(access,		0x10),
+	ARM_SMMU_EVENT_ATTR(access_r,		0x11),
+	ARM_SMMU_EVENT_ATTR(access_w,		0x12),
+	NULL
+};
+
+static const struct attribute_group arm_smmu_event_attrs_group = {
+	.name = "events",
+	.attrs = arm_smmu_event_attrs,
+	.is_visible = arm_smmu_event_attr_is_visible,
+};
+
+#define ARM_SMMU_FORMAT_CFG0L(name, mask) ARM_SMMU_FORMAT_ATTR(name, 0, mask)
+#define ARM_SMMU_FORMAT_CFG0H(name, mask) ARM_SMMU_FORMAT_ATTR(name, 32, mask)
+#define ARM_SMMU_FORMAT_CFG1L(name, mask) ARM_SMMU_FORMAT_ATTR(name, 64, mask)
+#define ARM_SMMU_FORMAT_CFG1H(name, mask) ARM_SMMU_FORMAT_ATTR(name, 96, mask)
+
+static struct attribute *arm_smmu_format_attrs[] = {
+	/* The lower half of config looks like PMEVTYPER... */
+	ARM_SMMU_FORMAT_CFG0L(event,	ARM_SMMU_PMEVTYPER_EVENT),
+	ARM_SMMU_FORMAT_CFG0L(nsu,	ARM_SMMU_PMEVTYPER_NSU),
+	ARM_SMMU_FORMAT_CFG0L(nsp,	ARM_SMMU_PMEVTYPER_NSP),
+	ARM_SMMU_FORMAT_CFG0L(u,	ARM_SMMU_PMEVTYPER_U),
+	ARM_SMMU_FORMAT_CFG0L(p,	ARM_SMMU_PMEVTYPER_P),
+	/* ...and the upper half is PMCGCR */
+	ARM_SMMU_FORMAT_CFG0H(ndx,	ARM_SMMU_PMCGCR_NDX),
+	ARM_SMMU_FORMAT_CFG0H(tcefcfg,	ARM_SMMU_PMCGCR_TCEFCFG),
+	/* Similarly, PMCGSMR goes in config1... */
+	ARM_SMMU_FORMAT_CFG1L(smr_id,	ARM_SMMU_PMCGSMR_ID),
+	ARM_SMMU_FORMAT_CFG1L(smr_mask,	ARM_SMMU_PMCGSMR_MASK),
+	/* ...with the counter group above it */
+	ARM_SMMU_FORMAT_CFG1H(cg,	0xff),
+	NULL
+};
+
+#define ARM_SMMU_CONFIG_PMEVTYPER(cfg)						\
+	((cfg) & (ARM_SMMU_PMEVTYPER_EVENT | ARM_SMMU_PMEVTYPER_NSU |		\
+	ARM_SMMU_PMEVTYPER_NSP | ARM_SMMU_PMEVTYPER_U |	ARM_SMMU_PMEVTYPER_P))
+#define ARM_SMMU_CONFIG_PMCGCR(cfg) \
+	((cfg) >> 32 & (ARM_SMMU_PMCGCR_TCEFCFG | ARM_SMMU_PMCGCR_NDX))
+#define ARM_SMMU_CONFIG1_PMCGSMR(cfg1)	((u32)(cfg1))
+#define ARM_SMMU_CONFIG1_CGIDX(cfg1)	((cfg1) >> 32 & 0xff)
+
+static const struct attribute_group arm_smmu_format_attrs_group = {
+	.name = "format",
+	.attrs = arm_smmu_format_attrs,
+};
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct arm_smmu_pmu *pmu = to_smmu_pmu(dev_get_drvdata(dev));
+
+	return cpumap_print_to_pagebuf(true, buf, cpumask_of(pmu->cpu));
+}
+
+static struct device_attribute arm_smmu_cpumask_attr = __ATTR_RO(cpumask);
+
+static struct attribute *arm_smmu_cpumask_attrs[] = {
+	&arm_smmu_cpumask_attr.attr,
+	NULL,
+};
+
+static struct attribute_group arm_smmu_cpumask_attr_group = {
+	.attrs = arm_smmu_cpumask_attrs,
+};
+
+static const struct attribute_group *arm_smmu_attr_groups[] = {
+	&arm_smmu_event_attrs_group,
+	&arm_smmu_format_attrs_group,
+	&arm_smmu_cpumask_attr_group,
+	NULL
+};
+
+static void arm_smmu_pmu_enable(struct pmu *pmu)
+{
+	struct arm_smmu_pmu *smmu_pmu = to_smmu_pmu(pmu);
+
+	writel_relaxed(ARM_SMMU_PMCR_E, smmu_pmu->base + ARM_SMMU_PMCR);
+}
+
+static void arm_smmu_pmu_disable(struct pmu *pmu)
+{
+	struct arm_smmu_pmu *smmu_pmu = to_smmu_pmu(pmu);
+
+	writel_relaxed(0, smmu_pmu->base + ARM_SMMU_PMCR);
+}
+
+static u32 arm_smmu_read_ctr(struct perf_event *event)
+{
+	struct arm_smmu_pmu *pmu = to_smmu_pmu(event->pmu);
+
+	return readl_relaxed(pmu->base + ARM_SMMU_PMEVCTR(event->hw.idx));
+}
+
+static void arm_smmu_write_ctr(struct perf_event *event, u32 val)
+{
+	struct arm_smmu_pmu *pmu = to_smmu_pmu(event->pmu);
+
+	writel_relaxed(val, pmu->base + ARM_SMMU_PMEVCTR(event->hw.idx));
+}
+
+static void arm_smmu_init_ctr(struct perf_event *event)
+{
+	local64_set(&event->hw.prev_count, ARM_SMMU_PMU_COUNTER_INIT_VAL);
+	arm_smmu_write_ctr(event, ARM_SMMU_PMU_COUNTER_INIT_VAL);
+}
+
+static void arm_smmu_event_read(struct perf_event *event)
+{
+	local64_t *hw_prev = &event->hw.prev_count;
+	u32 new, prev;
+
+	do {
+		prev = local64_read(hw_prev);
+		new = arm_smmu_read_ctr(event);
+	} while (local64_cmpxchg(hw_prev, prev, new) != prev);
+
+	local64_add(new - prev, &event->count);
+}
+
+static void arm_smmu_event_start(struct perf_event *event, int flags)
+{
+	struct arm_smmu_pmu *pmu = to_smmu_pmu(event->pmu);
+	int idx = event->hw.idx;
+
+	if (flags & PERF_EF_RELOAD)
+		arm_smmu_write_ctr(event, local64_read(&event->hw.prev_count));
+
+	writel_relaxed(ARM_SMMU_EVENT_BIT(idx),
+		       pmu->base + ARM_SMMU_PMCNTENSET(idx));
+}
+
+static void arm_smmu_event_stop(struct perf_event *event, int flags)
+{
+	struct arm_smmu_pmu *pmu = to_smmu_pmu(event->pmu);
+	int idx = event->hw.idx;
+
+	writel_relaxed(ARM_SMMU_EVENT_BIT(idx),
+		       pmu->base + ARM_SMMU_PMCNTENCLR(idx));
+
+	if (flags & PERF_EF_UPDATE)
+		arm_smmu_event_read(event);
+}
+
+static bool arm_smmu_event_compatible(struct perf_event *a,
+				      struct perf_event *b)
+{
+	u32 pmcgcr_a, pmcgcr_b, pmcgsmr_a, pmcgsmr_b, mask;
+	int tcefcfg;
+
+	pmcgcr_a = ARM_SMMU_CONFIG_PMCGCR(a->attr.config);
+	pmcgcr_b = ARM_SMMU_CONFIG_PMCGCR(b->attr.config);
+	tcefcfg = FIELD_GET(ARM_SMMU_PMCGCR_TCEFCFG, pmcgcr_a);
+	if (tcefcfg != FIELD_GET(ARM_SMMU_PMCGCR_TCEFCFG, pmcgcr_b))
+		return false;
+
+	if (tcefcfg == 2 && FIELD_GET(ARM_SMMU_PMCGCR_NDX, pmcgcr_a ^ pmcgcr_b))
+		return false;
+
+	pmcgsmr_a = ARM_SMMU_CONFIG1_PMCGSMR(a->attr.config1);
+	pmcgsmr_b = ARM_SMMU_CONFIG1_PMCGSMR(b->attr.config1);
+	mask = FIELD_GET(ARM_SMMU_PMCGSMR_MASK, pmcgsmr_a);
+	if (tcefcfg == 1 && ((pmcgsmr_a ^ pmcgsmr_b) & ~mask))
+		return false;
+
+	return true;
+}
+
+static bool arm_smmu_group_event_ok(struct perf_event *event,
+				    struct perf_event *group_event,
+				    int *num_counters)
+{
+	if (is_software_event(group_event))
+		return true;
+
+	if (group_event->pmu != event->pmu)
+		return false;
+
+	if (ARM_SMMU_CONFIG1_CGIDX(event->attr.config1) !=
+	    ARM_SMMU_CONFIG1_CGIDX(group_event->attr.config1))
+		return true;
+
+	if (!arm_smmu_event_compatible(event, group_event))
+		return false;
+
+	if (--*num_counters < 0)
+		return false;
+
+	return true;
+}
+
+static bool arm_smmu_group_valid(struct arm_smmu_pmu *pmu,
+				 struct perf_event *event)
+{
+	struct perf_event *leader = event->group_leader;
+	struct perf_event *sibling;
+	int cgidx, free_counters;
+
+	cgidx = ARM_SMMU_CONFIG1_CGIDX(event->attr.config1);
+	free_counters = pmu->cgs[cgidx].last - pmu->cgs[cgidx].first;
+
+	if (!arm_smmu_group_event_ok(event, leader, &free_counters))
+		return false;
+
+	for_each_sibling_event(sibling, leader) {
+		if (!arm_smmu_group_event_ok(event, sibling, &free_counters))
+			return false;
+	}
+	return true;
+}
+
+static bool arm_smmu_event_valid(struct arm_smmu_pmu *pmu,
+				 struct perf_event *event)
+{
+	int cgidx, tcefcfg;
+	u32 pmcgcr;
+
+	cgidx = ARM_SMMU_CONFIG1_CGIDX(event->attr.config1);
+	if (cgidx >= pmu->num_cgs)
+		return false;
+
+	pmcgcr = ARM_SMMU_CONFIG_PMCGCR(event->attr.config);
+	tcefcfg = FIELD_GET(ARM_SMMU_PMCGCR_TCEFCFG, pmcgcr);
+	if (tcefcfg == 3)
+		return false;
+
+	return true;
+}
+
+static int arm_smmu_event_init(struct perf_event *event)
+{
+	struct arm_smmu_pmu *pmu = to_smmu_pmu(event->pmu);
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+		return -EINVAL;
+
+	event->cpu = pmu->cpu;
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	if (!arm_smmu_event_valid(pmu, event))
+		return -EINVAL;
+
+	if (!arm_smmu_event_supported(pmu, event->attr.config))
+		return -EOPNOTSUPP;
+
+	if (event->group_leader != event && !arm_smmu_group_valid(pmu, event))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int arm_smmu_find_counter(struct arm_smmu_pmu *pmu, int cgidx,
+				 struct perf_event *event)
+{
+	struct arm_smmu_pmu_cg *cg = &pmu->cgs[cgidx];
+	int i, alloc_idx = -ENOSPC;
+
+	for (i = cg->first; i <= cg->last; i++) {
+		struct perf_event *tmp = pmu->counters[i];
+
+		if (!tmp)
+			alloc_idx = i;
+		else if (!arm_smmu_event_compatible(event, tmp))
+			return -EINVAL;
+	}
+
+	return alloc_idx;
+}
+
+static int arm_smmu_event_add(struct perf_event *event, int flags)
+{
+	struct arm_smmu_pmu *pmu = to_smmu_pmu(event->pmu);
+	struct hw_perf_event *hw = &event->hw;
+	u64 config = event->attr.config;
+	u64 config1 = event->attr.config1;
+	int cgidx, ctridx;
+
+	cgidx = ARM_SMMU_CONFIG1_CGIDX(event->attr.config1);
+	ctridx = arm_smmu_find_counter(pmu, cgidx, event);
+	if (ctridx < 0)
+		return ctridx;
+
+	pmu->counters[ctridx] = event;
+	hw->idx = ctridx;
+	arm_smmu_init_ctr(event);
+
+	writel_relaxed(ARM_SMMU_CONFIG_PMCGCR(config),
+		       pmu->base + ARM_SMMU_PMCGCR(cgidx));
+	writel_relaxed(ARM_SMMU_CONFIG1_PMCGSMR(config1),
+		       pmu->base + ARM_SMMU_PMCGSMR(cgidx));
+	writel_relaxed(ARM_SMMU_CONFIG_PMEVTYPER(config),
+		       pmu->base + ARM_SMMU_PMEVTYPER(ctridx));
+
+	if (flags & PERF_EF_START)
+		arm_smmu_event_start(event, 0);
+
+	return 0;
+}
+
+static void arm_smmu_event_del(struct perf_event *event, int flags)
+{
+	struct arm_smmu_pmu *pmu = to_smmu_pmu(event->pmu);
+
+	arm_smmu_event_stop(event, PERF_EF_UPDATE);
+	pmu->counters[event->hw.idx] = NULL;
+}
+
+static irqreturn_t arm_smmu_pmu_irq(int irq, void *dev)
+{
+	struct arm_smmu_pmu_cg *cg = dev;
+	void __iomem *pmovsclr;
+	u64 set, clear, bit;
+	int i;
+
+	pmovsclr = cg->pmu->base + ARM_SMMU_PMOVSCLR(cg->first);
+	set = readl_relaxed(pmovsclr);
+	clear = 0;
+	/* At worst, the bitmap for a given group may straddle two registers */
+	if ((cg->first ^ cg->last) & 32)
+		set |= (u64)readl_relaxed(pmovsclr + 4) << 32;
+
+	bit = ARM_SMMU_EVENT_BIT(cg->first);
+	for (i = cg->first; i <= cg->last; i++, bit <<= 1) {
+		if (!(set & bit))
+			continue;
+
+		clear |= bit;
+		arm_smmu_event_read(cg->pmu->counters[i]);
+		arm_smmu_init_ctr(cg->pmu->counters[i]);
+	}
+
+	if (!clear)
+		return IRQ_NONE;
+
+	if (lower_32_bits(clear))
+		writel_relaxed(lower_32_bits(clear), pmovsclr);
+	if (upper_32_bits(clear))
+		writel_relaxed(upper_32_bits(clear), pmovsclr + 4);
+
+	return IRQ_HANDLED;
+}
+
+static void arm_smmu_pmu_reset(struct arm_smmu_pmu *pmu, bool irq_set)
+{
+	int i, pminten;
+
+	writel_relaxed(ARM_SMMU_PMCR_P, pmu->base + ARM_SMMU_PMCR);
+	for (i = 0; i < pmu->num_counters; i += 32) {
+		pminten = irq_set ? ARM_SMMU_PMINTENSET(i) :
+				    ARM_SMMU_PMINTENCLR(i);
+		writel_relaxed(~0UL, pmu->base + pminten);
+		writel_relaxed(~0UL, pmu->base + ARM_SMMU_PMCNTENCLR(i));
+		writel_relaxed(~0UL, pmu->base + ARM_SMMU_PMOVSCLR(i));
+	}
+}
+
+static int arm_smmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct arm_smmu_pmu *pmu;
+	unsigned int target;
+	int i;
+
+	pmu = hlist_entry_safe(node, struct arm_smmu_pmu, cpuhp_node);
+	if (cpu != pmu->cpu)
+		return 0;
+
+	target = cpumask_any_but(cpu_online_mask, cpu);
+	if (target >= nr_cpu_ids)
+		return 0;
+
+	perf_pmu_migrate_context(&pmu->pmu, cpu, target);
+	for (i = 0; i < pmu->num_cgs; i++)
+		irq_set_affinity(pmu->cgs[i].irq, cpumask_of(target));
+	pmu->cpu = target;
+
+	return 0;
+}
+
+static int arm_smmu_pmu_probe(struct platform_device *pdev)
+{
+	struct arm_smmu_pmu *pmu;
+	struct device *dev = &pdev->dev;
+	void __iomem *base;
+	const char *name;
+	int err, i, counter_offset;
+	u32 reg;
+
+	base = (void __iomem *)dev->platform_data;
+	if (!base)
+		return -EPROBE_DEFER;
+
+	pmu = devm_kzalloc(dev, sizeof(*pmu), GFP_KERNEL);
+	if (!pmu)
+		return -ENOMEM;
+
+	pmu->base = base;
+	pmu->events = readl_relaxed(base + ARM_SMMU_PMCEID0);
+
+	reg = readl_relaxed(base + ARM_SMMU_PMCFGR);
+	pmu->num_cgs = FIELD_GET(ARM_SMMU_PMCFGR_NCG, reg) + 1;
+	pmu->cgs = devm_kcalloc(dev, pmu->num_cgs,
+				sizeof(*pmu->cgs), GFP_KERNEL);
+	if (!pmu->cgs)
+		return -ENOMEM;
+
+	pmu->num_counters = FIELD_GET(ARM_SMMU_PMCFGR_N, reg) + 1;
+	pmu->counters = devm_kcalloc(dev, pmu->num_counters,
+				     sizeof(*pmu->counters), GFP_KERNEL);
+	if (!pmu->counters)
+		return -ENOMEM;
+
+	arm_smmu_pmu_reset(pmu, true);
+
+	counter_offset = 0;
+	for (i = 0; i < pmu->num_cgs; i++) {
+		int irq, cg_num_counters;
+
+		reg = readl_relaxed(base + ARM_SMMU_PMCGCR(i));
+		cg_num_counters = FIELD_GET(ARM_SMMU_PMCGCR_CGNC, reg);
+		dev_dbg(dev, "cg %d, %d counters, sidg %ld\n", i,
+			cg_num_counters, FIELD_GET(ARM_SMMU_PMCGCR_SIDG, reg));
+		pmu->cgs[i].first = counter_offset;
+		pmu->cgs[i].last = counter_offset + cg_num_counters - 1;
+		counter_offset += cg_num_counters;
+
+		irq = platform_get_irq(pdev, i);
+		if (irq < 0)
+			return irq;
+
+		pmu->cgs[i].pmu = pmu;
+		pmu->cgs[i].irq = irq;
+		/*
+		 * IRQF_SHARED here is relying entirely on the expectation that
+		 * at most we're only ever sharing with arm-smmu for the same
+		 * SMMU instance our PMU belongs to, and that that driver will
+		 * not touch the affinity. It's sketchy, but about the best we
+		 * can do given that there most definitely exists hardware
+		 * using a single combined IRQ for everything.
+		 */
+		err = devm_request_irq(dev, irq, arm_smmu_pmu_irq,
+				       IRQF_SHARED | IRQF_NOBALANCING,
+				       "arm-smmu pmu", &pmu->cgs[i]);
+		if (err)
+			return err;
+
+		writel_relaxed(0, base + ARM_SMMU_PMCGCR(i));
+		writel_relaxed(0, base + ARM_SMMU_PMCGSMR(i));
+	}
+	if (WARN_ON(counter_offset != pmu->num_counters))
+		return -ENODEV;
+
+	dev_info(dev, "PMU with %d counters in %d groups\n",
+		 pmu->num_counters, pmu->num_cgs);
+
+	platform_set_drvdata(pdev, pmu);
+
+	pmu->cpu = raw_smp_processor_id();
+	pmu->pmu = (struct pmu) {
+		.attr_groups = arm_smmu_attr_groups,
+		.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+		.task_ctx_nr = perf_invalid_context,
+		.pmu_enable = arm_smmu_pmu_enable,
+		.pmu_disable = arm_smmu_pmu_disable,
+		.event_init = arm_smmu_event_init,
+		.add = arm_smmu_event_add,
+		.del = arm_smmu_event_del,
+		.start = arm_smmu_event_start,
+		.stop = arm_smmu_event_stop,
+		.read = arm_smmu_event_read,
+	};
+
+	/* It's helpful if the PMU device correlates to the platform device */
+	name = devm_kasprintf(dev, GFP_KERNEL, "arm_smmu_%d", pdev->id);
+	if (!name)
+		return -ENOMEM;
+
+	err = cpuhp_state_add_instance(arm_smmu_hp_state, &pmu->cpuhp_node);
+	if (err)
+		return err;
+
+	err = perf_pmu_register(&pmu->pmu, name, -1);
+	if (err)
+		cpuhp_state_remove_instance_nocalls(arm_smmu_hp_state, &pmu->cpuhp_node);
+	return err;
+}
+
+static int arm_smmu_pmu_remove(struct platform_device *pdev)
+{
+	struct arm_smmu_pmu *pmu = platform_get_drvdata(pdev);
+
+	perf_pmu_unregister(&pmu->pmu);
+
+	/* Make sure it's turned off, and devres will do the rest */
+	arm_smmu_pmu_reset(pmu, false);
+
+	return 0;
+}
+
+static struct platform_device_id arm_smmu_pmu_id[] = {
+	{ .name = "arm-smmu-pmu" },
+	{ }
+};
+MODULE_DEVICE_TABLE(platform, arm_smmu_pmu_id);
+
+static struct platform_driver arm_smmu_pmu_driver = {
+	.driver = { .name = "arm-smmu-pmu" },
+	.id_table = arm_smmu_pmu_id,
+	.probe = arm_smmu_pmu_probe,
+	.remove = arm_smmu_pmu_remove,
+};
+
+static int __init arm_smmu_pmu_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+				      "perf/arm/smmu:online",
+				      NULL,
+				      arm_smmu_offline_cpu);
+	if (ret < 0)
+		return ret;
+
+	arm_smmu_hp_state = ret;
+
+	ret = platform_driver_register(&arm_smmu_pmu_driver);
+	if (ret)
+		cpuhp_remove_multi_state(arm_smmu_hp_state);
+	return ret;
+}
+
+static void __exit arm_smmu_pmu_exit(void)
+{
+	platform_driver_unregister(&arm_smmu_pmu_driver);
+	cpuhp_remove_multi_state(arm_smmu_hp_state);
+}
+
+module_init(arm_smmu_pmu_init);
+module_exit(arm_smmu_pmu_exit);
+
+MODULE_DESCRIPTION("PMU driver for Arm SMMU Performance Monitors");
+MODULE_AUTHOR("Robin Murphy <robin.murphy@arm.com>");
+MODULE_LICENSE("GPL v2");