[v3,4/5] cxl: CXL Performance Monitoring Unit driver

Message ID	20230327170247.6968-5-Jonathan.Cameron@huawei.com
State	Superseded
Headers	show Return-Path: <linux-cxl-owner@vger.kernel.org> From: Jonathan Cameron <Jonathan.Cameron@huawei.com> To: Liang Kan <kan.liang@linux.intel.com>, <linux-cxl@vger.kernel.org>, <peterz@infradead.org> CC: <mingo@redhat.com>, <acme@kernel.org>, <mark.rutland@arm.com>, <will@kernel.org>, <dan.j.williams@intel.com>, <linuxarm@huawei.com>, <linux-perf-users@vger.kernel.org>, <linux-kernel@vger.kernel.org>, Davidlohr Bueso <dave@stgolabs.net>, Dave Jiang <dave.jiang@intel.com> Subject: [PATCH v3 4/5] cxl: CXL Performance Monitoring Unit driver Date: Mon, 27 Mar 2023 18:02:46 +0100 Message-ID: <20230327170247.6968-5-Jonathan.Cameron@huawei.com> In-Reply-To: <20230327170247.6968-1-Jonathan.Cameron@huawei.com> References: <20230327170247.6968-1-Jonathan.Cameron@huawei.com> MIME-Version: 1.0 Content-Transfer-Encoding: 7BIT Content-Type: text/plain; charset=US-ASCII Precedence: bulk
Series	CXL 3.0 Performance Monitoring Unit support \| expand [v3,0/5] CXL 3.0 Performance Monitoring Unit support [v3,1/5] cxl: Add function to count regblocks of a given type [v3,2/5] perf: Allow a PMU to have a parent [v3,3/5] cxl/pci: Find and register CXL PMU devices [v3,4/5] cxl: CXL Performance Monitoring Unit driver [v3,5/5] docs: perf: Minimal introduction the the CXL PMU device and driver

diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index ff4e78117b31..0be514b00b8f 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -139,4 +139,17 @@ config CXL_REGION_INVALIDATION_TEST If unsure, or if this kernel is meant for production environments, say N. +config CXL_CPMU + tristate "CXL Performance Monitoring Unit" + default CXL_BUS + depends on PERF_EVENTS + help + Support performance monitoring as defined in CXL rev 3.0 + section 13.2: Performance Monitoring. CXL components may have + one or more CXL Performance Monitoring Units (CPMUs). + + Say 'y/m' to enable a driver that will attach to performance + monitoring units and provide standard perf based interfaces. + + If unsure say 'm'. endif diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile index db321f48ba52..024bb739554b 100644 --- a/drivers/cxl/Makefile +++ b/drivers/cxl/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_CXL_MEM) += cxl_mem.o obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o obj-$(CONFIG_CXL_PMEM) += cxl_pmem.o obj-$(CONFIG_CXL_PORT) += cxl_port.o +obj-$(CONFIG_CXL_CPMU) += cpmu.o cxl_mem-y := mem.o cxl_pci-y := pci.o diff --git a/drivers/cxl/cpmu.c b/drivers/cxl/cpmu.c new file mode 100644 index 000000000000..6e564649114a --- /dev/null +++ b/drivers/cxl/cpmu.c @@ -0,0 +1,939 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright(c) 2023 Huawei + * + * The CXL 3.0 specification includes a standard Performance Monitoring Unit, + * called the CXL PMU, or CPMU. In order to allow a high degree of + * implementation flexibility the specification provides a wide range of + * options all of which are self describing. + * + * Details in CXL rev 3.0 section 8.2.7 CPMU Register Interface + * + * TODO + * o Discoverability of counters. Allow perftool to provide summed counters + * and vendor defined counters. + * o Support free running counters - copy the Intel uncore PMU handling for these. + * o CPMUs which do not support freeze. + * o Add filter validation in cpmu_event_init() so problems are detected earlier. + * o Reject configurations that the hardware is ignoring + * (e.g. invert when not invertible) + * o Support CPMUs with no interrupts using an HRTIMER. + */ + +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/perf_event.h> +#include <linux/bitops.h> +#include <linux/device.h> +#include <linux/bits.h> +#include <linux/list.h> +#include <linux/bug.h> +#include <linux/pci.h> + +#include "cpmu.h" +#include "cxlpci.h" +#include "cxl.h" + +/* CXL rev 3.0 Table 13-5 Events under CXL Vendor ID */ +#define CPMU_GID_CLOCK_TICKS 0x00 +#define CPMU_GID_D2H_REQ 0x0010 +#define CPMU_GID_D2H_RSP 0x0011 +#define CPMU_GID_H2D_REQ 0x0012 +#define CPMU_GID_H2D_RSP 0x0013 +#define CPMU_GID_CACHE_DATA 0x0014 +#define CPMU_GID_M2S_REQ 0x0020 +#define CPMU_GID_M2S_RWD 0x0021 +#define CPMU_GID_M2S_BIRSP 0x0022 +#define CPMU_GID_S2M_BISNP 0x0023 +#define CPMU_GID_S2M_NDR 0x0024 +#define CPMU_GID_S2M_DRS 0x0025 +#define CPMU_GID_DDR 0x8000 + +static int cpmu_cpuhp_state_num; + +struct cpmu_event { + u16 vid; + u16 gid; + u32 msk; + union { + int counter_idx; /* fixed counters */ + int event_idx; /* configurable counters */ + }; + struct list_head node; +}; + +#define CPMU_MAX_COUNTERS 64 +struct cpmu_info { + struct pmu pmu; + void __iomem *base; + struct perf_event **hw_events; + struct list_head events_configurable; + struct list_head events_fixed; + DECLARE_BITMAP(used_counter_bm, CPMU_MAX_COUNTERS); + DECLARE_BITMAP(conf_counter_bm, CPMU_MAX_COUNTERS); + u16 counter_width; + u8 num_counters; + u8 num_event_capabilities; + int on_cpu; + struct hlist_node node; + bool freeze_for_enable; + bool filter_hdm; + int irq; +}; + +#define pmu_to_cpmu_info(_pmu) container_of(_pmu, struct cpmu_info, pmu) + +/* + * All CPMU counters are discoverable via the Event Capabilities Registers. + * Each Event Capability register contains a a VID / GroupID. + * A counter may then count any combination (by summing) of events in + * that group which are in the Supported Events Bitmask. + * However, there are some complexities to the scheme. + * - Fixed function counters refer to an Event Capabilities register. + * That event capability register is not then used for Configurable + * counters. + */ +static int cpmu_parse_caps(struct device *dev, struct cpmu_info *info) +{ + DECLARE_BITMAP(fixed_counter_event_cap_bm, 32) = {0}; + void __iomem *base = info->base; + u64 val, eval; + int i; + + val = readq(base + CPMU_CAP_REG); + info->freeze_for_enable = FIELD_GET(CPMU_CAP_WRITEABLE_WHEN_FROZEN, val) & + FIELD_GET(CPMU_CAP_FREEZE, val); + if (!info->freeze_for_enable) { + dev_err(dev, "Driver does not support CPMUs that do not support freeze for enable\n"); + return -ENODEV; + } + + info->num_counters = FIELD_GET(CPMU_CAP_NUM_COUNTERS_MSK, val) + 1; + info->counter_width = FIELD_GET(CPMU_CAP_COUNTER_WIDTH_MSK, val); + info->num_event_capabilities = FIELD_GET(CPMU_CAP_NUM_EVN_CAP_REG_SUP_MSK, val) + 1; + + info->filter_hdm = FIELD_GET(CPMU_CAP_FILTERS_SUP_MSK, val) & CPMU_FILTER_HDM; + if (FIELD_GET(CPMU_CAP_INT, val)) + info->irq = FIELD_GET(CPMU_CAP_MSI_N_MSK, val); + else + info->irq = -1; + + /* First handle fixed function counters; note if configurable counters found */ + for (i = 0; i < info->num_counters; i++) { + struct cpmu_event *cpmu_ev; + u32 events_msk; + u8 group_idx; + + val = readq(base + CPMU_COUNTER_CFG_REG(i)); + + if (FIELD_GET(CPMU_COUNTER_CFG_TYPE_MSK, val) == + CPMU_COUNTER_CFG_TYPE_CONFIGURABLE) { + set_bit(i, info->conf_counter_bm); + } + + if (FIELD_GET(CPMU_COUNTER_CFG_TYPE_MSK, val) != + CPMU_COUNTER_CFG_TYPE_FIXED_FUN) + continue; + + /* In this case we know which fields are const */ + group_idx = FIELD_GET(CPMU_COUNTER_CFG_EVENT_GRP_ID_IDX_MSK, val); + events_msk = FIELD_GET(CPMU_COUNTER_CFG_EVENTS_MSK, val); + eval = readq(base + CPMU_EVENT_CAP_REG(group_idx)); + cpmu_ev = devm_kzalloc(dev, sizeof(*cpmu_ev), GFP_KERNEL); + if (!cpmu_ev) + return -ENOMEM; + + cpmu_ev->vid = FIELD_GET(CPMU_EVENT_CAP_VENDOR_ID_MSK, eval); + cpmu_ev->gid = FIELD_GET(CPMU_EVENT_CAP_GROUP_ID_MSK, eval); + /* For a fixed purpose counter use the events mask from the counter CFG */ + cpmu_ev->msk = events_msk; + cpmu_ev->counter_idx = i; + /* This list add is never unwound as all entries deleted on remove */ + list_add(&cpmu_ev->node, &info->events_fixed); + /* + * Configurable counters must not use an Event Capability registers that + * is in use for a Fixed counter + */ + set_bit(group_idx, fixed_counter_event_cap_bm); + } + + if (!bitmap_empty(info->conf_counter_bm, CPMU_MAX_COUNTERS)) { + struct cpmu_event *cpmu_ev; + int j; + /* Walk event capabilities unused by fixed counters */ + for_each_clear_bit(j, fixed_counter_event_cap_bm, + info->num_event_capabilities) { + cpmu_ev = devm_kzalloc(dev, sizeof(*cpmu_ev), GFP_KERNEL); + if (!cpmu_ev) + return -ENOMEM; + + eval = readq(base + CPMU_EVENT_CAP_REG(j)); + cpmu_ev->vid = FIELD_GET(CPMU_EVENT_CAP_VENDOR_ID_MSK, eval); + cpmu_ev->gid = FIELD_GET(CPMU_EVENT_CAP_GROUP_ID_MSK, eval); + cpmu_ev->msk = FIELD_GET(CPMU_EVENT_CAP_SUPPORTED_EVENTS_MSK, eval); + cpmu_ev->event_idx = j; + list_add(&cpmu_ev->node, &info->events_configurable); + } + } + + return 0; +} + +static ssize_t cpmu_event_sysfs_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct perf_pmu_events_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_attr, attr); + + return sysfs_emit(buf, "config=%#llx\n", pmu_attr->id); +} + +#define CPMU_PMU_EVENT_ATTR(_name, _vid, _gid, _msk) \ + PMU_EVENT_ATTR_ID(_name, cpmu_event_sysfs_show, \ + ((u64)(_vid) << 48) | ((u64)(_gid) << 32) | (u64)(_msk)) + +/* For CXL spec defined events */ +#define CPMU_PMU_EVENT_CXL_ATTR(_name, _gid, _msk) \ + CPMU_PMU_EVENT_ATTR(_name, PCI_DVSEC_VENDOR_ID_CXL, _gid, _msk) + +static struct attribute *cpmu_event_attrs[] = { + CPMU_PMU_EVENT_CXL_ATTR(clock_ticks, CPMU_GID_CLOCK_TICKS, BIT(0)), + /* CXL rev 3.0 Table 3-17 - Device to Host Requests */ + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_rdcurr, CPMU_GID_D2H_REQ, BIT(1)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_rdown, CPMU_GID_D2H_REQ, BIT(2)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_rdshared, CPMU_GID_D2H_REQ, BIT(3)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_rdany, CPMU_GID_D2H_REQ, BIT(4)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_rdownnodata, CPMU_GID_D2H_REQ, BIT(5)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_itomwr, CPMU_GID_D2H_REQ, BIT(6)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_wrcurr, CPMU_GID_D2H_REQ, BIT(7)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_clflush, CPMU_GID_D2H_REQ, BIT(8)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_cleanevict, CPMU_GID_D2H_REQ, BIT(9)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_dirtyevict, CPMU_GID_D2H_REQ, BIT(10)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_cleanevictnodata, CPMU_GID_D2H_REQ, BIT(11)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_wowrinv, CPMU_GID_D2H_REQ, BIT(12)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_wowrinvf, CPMU_GID_D2H_REQ, BIT(13)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_wrinv, CPMU_GID_D2H_REQ, BIT(14)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_req_cacheflushed, CPMU_GID_D2H_REQ, BIT(16)), + /* CXL rev 3.0 Table 3-20 - D2H Repsonse Encodings */ + CPMU_PMU_EVENT_CXL_ATTR(d2h_rsp_rspihiti, CPMU_GID_D2H_RSP, BIT(4)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_rsp_rspvhitv, CPMU_GID_D2H_RSP, BIT(6)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_rsp_rspihitse, CPMU_GID_D2H_RSP, BIT(5)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_rsp_rspshitse, CPMU_GID_D2H_RSP, BIT(1)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_rsp_rspsfwdm, CPMU_GID_D2H_RSP, BIT(7)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_rsp_rspifwdm, CPMU_GID_D2H_RSP, BIT(15)), + CPMU_PMU_EVENT_CXL_ATTR(d2h_rsp_rspvfwdv, CPMU_GID_D2H_RSP, BIT(22)), + /* CXL rev 3.0 Table 3-21 - CXL.cache - Mapping of H2D Requests to D2H Responses */ + CPMU_PMU_EVENT_CXL_ATTR(h2d_req_snpdata, CPMU_GID_H2D_REQ, BIT(1)), + CPMU_PMU_EVENT_CXL_ATTR(h2d_req_snpinv, CPMU_GID_H2D_REQ, BIT(2)), + CPMU_PMU_EVENT_CXL_ATTR(h2d_req_snpcur, CPMU_GID_H2D_REQ, BIT(3)), + /* CXL rev 3.0 Table 3-22 - H2D Response Opcode Encodings */ + CPMU_PMU_EVENT_CXL_ATTR(h2d_rsp_writepull, CPMU_GID_H2D_RSP, BIT(1)), + CPMU_PMU_EVENT_CXL_ATTR(h2d_rsp_go, CPMU_GID_H2D_RSP, BIT(4)), + CPMU_PMU_EVENT_CXL_ATTR(h2d_rsp_gowritepull, CPMU_GID_H2D_RSP, BIT(5)), + CPMU_PMU_EVENT_CXL_ATTR(h2d_rsp_extcmp, CPMU_GID_H2D_RSP, BIT(6)), + CPMU_PMU_EVENT_CXL_ATTR(h2d_rsp_gowritepulldrop, CPMU_GID_H2D_RSP, BIT(8)), + CPMU_PMU_EVENT_CXL_ATTR(h2d_rsp_fastgowritepull, CPMU_GID_H2D_RSP, BIT(13)), + CPMU_PMU_EVENT_CXL_ATTR(h2d_rsp_goerrwritepull, CPMU_GID_H2D_RSP, BIT(15)), + /* CXL rev 3.0 Table 13-5 directly lists these */ + CPMU_PMU_EVENT_CXL_ATTR(cachedata_d2h_data, CPMU_GID_CACHE_DATA, BIT(0)), + CPMU_PMU_EVENT_CXL_ATTR(cachedata_h2d_data, CPMU_GID_CACHE_DATA, BIT(1)), + /* CXL rev 3.0 Table 3-29 M2S Req Memory Opcodes */ + CPMU_PMU_EVENT_CXL_ATTR(m2s_req_meminv, CPMU_GID_M2S_REQ, BIT(0)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_req_memrd, CPMU_GID_M2S_REQ, BIT(1)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_req_memrddata, CPMU_GID_M2S_REQ, BIT(2)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_req_memrdfwd, CPMU_GID_M2S_REQ, BIT(3)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_req_memwrfwd, CPMU_GID_M2S_REQ, BIT(4)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_req_memspecrd, CPMU_GID_M2S_REQ, BIT(8)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_req_meminvnt, CPMU_GID_M2S_REQ, BIT(9)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_req_memcleanevict, CPMU_GID_M2S_REQ, BIT(10)), + /* CXL rev 3.0 Table 3-35 M2S RwD Memory Opcodes */ + CPMU_PMU_EVENT_CXL_ATTR(m2s_rwd_memwr, CPMU_GID_M2S_RWD, BIT(1)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_rwd_memwrptl, CPMU_GID_M2S_RWD, BIT(2)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_rwd_biconflict, CPMU_GID_M2S_RWD, BIT(4)), + /* CXL rev 3.0 Table 3-38 M2S BIRsp Memory Opcodes */ + CPMU_PMU_EVENT_CXL_ATTR(m2s_birsp_i, CPMU_GID_M2S_BIRSP, BIT(0)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_birsp_s, CPMU_GID_M2S_BIRSP, BIT(1)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_birsp_e, CPMU_GID_M2S_BIRSP, BIT(2)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_birsp_iblk, CPMU_GID_M2S_BIRSP, BIT(4)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_birsp_sblk, CPMU_GID_M2S_BIRSP, BIT(5)), + CPMU_PMU_EVENT_CXL_ATTR(m2s_birsp_eblk, CPMU_GID_M2S_BIRSP, BIT(6)), + /* CXL rev 3.0 Table 3-40 S2M BISnp Opcodes */ + CPMU_PMU_EVENT_CXL_ATTR(s2m_bisnp_cur, CPMU_GID_S2M_BISNP, BIT(0)), + CPMU_PMU_EVENT_CXL_ATTR(s2m_bisnp_data, CPMU_GID_S2M_BISNP, BIT(1)), + CPMU_PMU_EVENT_CXL_ATTR(s2m_bisnp_inv, CPMU_GID_S2M_BISNP, BIT(2)), + CPMU_PMU_EVENT_CXL_ATTR(s2m_bisnp_curblk, CPMU_GID_S2M_BISNP, BIT(4)), + CPMU_PMU_EVENT_CXL_ATTR(s2m_bisnp_datblk, CPMU_GID_S2M_BISNP, BIT(5)), + CPMU_PMU_EVENT_CXL_ATTR(s2m_bisnp_invblk, CPMU_GID_S2M_BISNP, BIT(6)), + /* CXL rev 3.0 Table 3-43 S2M NDR Opcopdes */ + CPMU_PMU_EVENT_CXL_ATTR(s2m_ndr_cmp, CPMU_GID_S2M_NDR, BIT(0)), + CPMU_PMU_EVENT_CXL_ATTR(s2m_ndr_cmps, CPMU_GID_S2M_NDR, BIT(1)), + CPMU_PMU_EVENT_CXL_ATTR(s2m_ndr_cmpe, CPMU_GID_S2M_NDR, BIT(2)), + CPMU_PMU_EVENT_CXL_ATTR(s2m_ndr_biconflictack, CPMU_GID_S2M_NDR, BIT(3)), + /* CXL rev 3.0 Table 3-46 S2M DRS opcodes */ + CPMU_PMU_EVENT_CXL_ATTR(s2m_drs_memdata, CPMU_GID_S2M_DRS, BIT(0)), + CPMU_PMU_EVENT_CXL_ATTR(s2m_drs_memdatanxm, CPMU_GID_S2M_DRS, BIT(1)), + /* CXL rev 3.0 Table 13-5 directly lists these */ + CPMU_PMU_EVENT_CXL_ATTR(ddr_act, CPMU_GID_DDR, BIT(0)), + CPMU_PMU_EVENT_CXL_ATTR(ddr_pre, CPMU_GID_DDR, BIT(1)), + CPMU_PMU_EVENT_CXL_ATTR(ddr_casrd, CPMU_GID_DDR, BIT(2)), + CPMU_PMU_EVENT_CXL_ATTR(ddr_caswr, CPMU_GID_DDR, BIT(3)), + CPMU_PMU_EVENT_CXL_ATTR(ddr_refresh, CPMU_GID_DDR, BIT(4)), + CPMU_PMU_EVENT_CXL_ATTR(ddr_selfrefreshent, CPMU_GID_DDR, BIT(5)), + CPMU_PMU_EVENT_CXL_ATTR(ddr_rfm, CPMU_GID_DDR, BIT(6)), + NULL +}; + +static struct cpmu_event *cpmu_find_fixed_counter_event(struct cpmu_info *info, + int vid, int gid, int msk) +{ + struct cpmu_event *cpmu_ev; + + list_for_each_entry(cpmu_ev, &info->events_fixed, node) { + if (vid != cpmu_ev->vid || gid != cpmu_ev->gid) + continue; + + /* Precise match for fixed counter */ + if (msk == cpmu_ev->msk) + return cpmu_ev; + } + + return ERR_PTR(-EINVAL); +} + +static struct cpmu_event *cpmu_find_config_counter_event(struct cpmu_info *info, + int vid, int gid, int msk) +{ + struct cpmu_event *cpmu_ev; + + list_for_each_entry(cpmu_ev, &info->events_configurable, node) { + if (vid != cpmu_ev->vid || gid != cpmu_ev->gid) + continue; + + /* Request mask must be subset of supported */ + if (msk & ~cpmu_ev->msk) + continue; + + return cpmu_ev; + } + + return ERR_PTR(-EINVAL); +} + +static umode_t cpmu_event_is_visible(struct kobject *kobj, struct attribute *attr, int a) +{ + struct device_attribute *dev_attr = container_of(attr, struct device_attribute, attr); + struct perf_pmu_events_attr *pmu_attr = + container_of(dev_attr, struct perf_pmu_events_attr, attr); + struct device *dev = kobj_to_dev(kobj); + struct cpmu_info *info = dev_get_drvdata(dev); + int vid = FIELD_GET(GENMASK_ULL(63, 48), pmu_attr->id); + int gid = FIELD_GET(GENMASK_ULL(47, 32), pmu_attr->id); + int msk = FIELD_GET(GENMASK_ULL(31, 0), pmu_attr->id); + + if (!IS_ERR(cpmu_find_fixed_counter_event(info, vid, gid, msk))) + return attr->mode; + + if (!IS_ERR(cpmu_find_config_counter_event(info, vid, gid, msk))) + return attr->mode; + + return 0; +} + +static const struct attribute_group cpmu_events = { + .name = "events", + .attrs = cpmu_event_attrs, + .is_visible = cpmu_event_is_visible, +}; + +static ssize_t cpmu_format_sysfs_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_ext_attribute *eattr; + + eattr = container_of(attr, struct dev_ext_attribute, attr); + + return sysfs_emit(buf, "%s\n", (char *)eattr->var); +} + +#define CPMU_FORMAT_ATTR(_name, _format)\ + (&((struct dev_ext_attribute[]) { \ + { \ + .attr = __ATTR(_name, 0444, \ + cpmu_format_sysfs_show, NULL), \ + .var = (void *)_format \ + } \ + })[0].attr.attr) + +enum { + cpmu_mask_attr, + cpmu_gid_attr, + cpmu_vid_attr, + cpmu_threshold_attr, + cpmu_invert_attr, + cpmu_edge_attr, + cpmu_hdm_filter_en_attr, + cpmu_hdm_attr, +}; + +static struct attribute *cpmu_format_attr[] = { + [cpmu_mask_attr] = CPMU_FORMAT_ATTR(mask, "config:0-31"), + [cpmu_gid_attr] = CPMU_FORMAT_ATTR(gid, "config:32-47"), + [cpmu_vid_attr] = CPMU_FORMAT_ATTR(vid, "config:48-63"), + [cpmu_threshold_attr] = CPMU_FORMAT_ATTR(threshold, "config1:0-15"), + [cpmu_invert_attr] = CPMU_FORMAT_ATTR(invert, "config1:16"), + [cpmu_edge_attr] = CPMU_FORMAT_ATTR(edge, "config1:17"), + [cpmu_hdm_filter_en_attr] = CPMU_FORMAT_ATTR(hdm_filter_en, "config1:18"), + [cpmu_hdm_attr] = CPMU_FORMAT_ATTR(hdm, "config2:0-15"), + NULL +}; + +static umode_t cpmu_format_is_visible(struct kobject *kobj, struct attribute *attr, int a) +{ + struct device *dev = kobj_to_dev(kobj); + struct cpmu_info *info = dev_get_drvdata(dev); + + /* + * Filter capability at the CPMU level, so hide the attributes if the particular + * filter is not supported. + */ + if (attr == cpmu_format_attr[cpmu_hdm_filter_en_attr] || + attr == cpmu_format_attr[cpmu_hdm_attr]) { + if (info->filter_hdm) + return 0444; + else + return 0; + } else { + return 0444; + } +} + +static const struct attribute_group cpmu_format_group = { + .name = "format", + .attrs = cpmu_format_attr, + .is_visible = cpmu_format_is_visible, +}; + +static u32 cpmu_config_get_mask(struct perf_event *event) +{ + return FIELD_GET(GENMASK_ULL(31, 0), event->attr.config); +} + +static u16 cpmu_config_get_gid(struct perf_event *event) +{ + return FIELD_GET(GENMASK_ULL(47, 32), event->attr.config); +} + +static u16 cpmu_config_get_vid(struct perf_event *event) +{ + return FIELD_GET(GENMASK_ULL(63, 48), event->attr.config); +} + +static u8 cpmu_config1_get_threshold(struct perf_event *event) +{ + return FIELD_GET(GENMASK_ULL(15, 0), event->attr.config1); +} + +static bool cpmu_config1_get_invert(struct perf_event *event) +{ + return FIELD_GET(BIT(16), event->attr.config1); +} + +static bool cpmu_config1_get_edge(struct perf_event *event) +{ + return FIELD_GET(BIT(17), event->attr.config1); +} + +/* + * CPMU specification allows for 8 filters, each with a 16 bit value... + * So we need to find 8x16bits to store it in. + * As the value used for disable is 0xffff, a separate enable switch + * is needed. + */ + +static bool cpmu_config1_hdm_filter_en(struct perf_event *event) +{ + return FIELD_GET(BIT(14), event->attr.config1); +} + +static u16 cpmu_config2_get_hdm_decoder(struct perf_event *event) +{ + return FIELD_GET(GENMASK(15, 0), event->attr.config2); +} + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cpmu_info *info = dev_get_drvdata(dev); + + return cpumap_print_to_pagebuf(true, buf, cpumask_of(info->on_cpu)); +} +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *cpmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static const struct attribute_group cpmu_cpumask_group = { + .attrs = cpmu_cpumask_attrs, +}; + +static const struct attribute_group *cpmu_attr_groups[] = { + &cpmu_events, + &cpmu_format_group, + &cpmu_cpumask_group, + NULL +}; + +/* If counter_idx == NULL, don't try to allocate a counter. */ +static int cpmu_get_event_idx(struct perf_event *event, int *counter_idx, int *event_idx) +{ + struct cpmu_info *info = pmu_to_cpmu_info(event->pmu); + DECLARE_BITMAP(configurable_and_free, CPMU_MAX_COUNTERS); + struct cpmu_event *cpmu_ev; + u32 mask; + u16 gid, vid; + int i; + + vid = cpmu_config_get_vid(event); + gid = cpmu_config_get_gid(event); + mask = cpmu_config_get_mask(event); + + cpmu_ev = cpmu_find_fixed_counter_event(info, vid, gid, mask); + if (!IS_ERR(cpmu_ev)) { + if (!counter_idx) + return 0; + if (!info->hw_events[cpmu_ev->counter_idx]) { + *counter_idx = cpmu_ev->counter_idx; + return 0; + } + /* Fixed counter is in use, but maybe a configurable one? */ + } + + cpmu_ev = cpmu_find_config_counter_event(info, vid, gid, mask); + if (!IS_ERR(cpmu_ev)) { + if (!counter_idx) + return 0; + + bitmap_andnot(configurable_and_free, info->conf_counter_bm, + info->used_counter_bm, CPMU_MAX_COUNTERS); + + i = find_first_bit(configurable_and_free, CPMU_MAX_COUNTERS); + if (i == CPMU_MAX_COUNTERS) + return -EINVAL; + + *counter_idx = i; + return 0; + } + + return -EINVAL; +} + +static int cpmu_event_init(struct perf_event *event) +{ + struct cpmu_info *info = pmu_to_cpmu_info(event->pmu); + + event->cpu = info->on_cpu; + /* Top level type sanity check - is this a Hardware Event being requested */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EOPNOTSUPP; + /* TODO: Validation of any filter */ + + /* + * Verify that it is possible to count what was requested. Either must + * be a fixed counter that is a precise match or a configurable counter + * where this is a subset. + */ + return cpmu_get_event_idx(event, NULL, NULL); +} + +static void cpmu_pmu_enable(struct pmu *pmu) +{ + struct cpmu_info *info = pmu_to_cpmu_info(pmu); + void __iomem *base = info->base; + + /* We don't have a global enable, but we 'might' have a global freeze which we can use */ + if (info->freeze_for_enable) { + /* Can assume frozen at this stage */ + writeq(0, base + CPMU_FREEZE_REG); + + return; + } +} + +static void cpmu_pmu_disable(struct pmu *pmu) +{ + struct cpmu_info *info = pmu_to_cpmu_info(pmu); + void __iomem *base = info->base; + + if (info->freeze_for_enable) { + /* + * Whilst bits above number of counters are RsvdZ + * they are unlikely to be repurposed given + * number of counters is allowed to be 64 leaving + * no reserved bits. Hence this is only slightly + * naughty. + */ + writeq(GENMASK_ULL(63, 0), base + CPMU_FREEZE_REG); + return; + } +} + +static void cpmu_event_start(struct perf_event *event, int flags) +{ + struct cpmu_info *info = pmu_to_cpmu_info(event->pmu); + struct hw_perf_event *hwc = &event->hw; + void __iomem *base = info->base; + u64 cfg, prev_cnt; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + /* + * Currently only hdm filter control is implemnted, this code will + * want generalizing when more filters are added. + */ + if (info->filter_hdm) { + if (cpmu_config1_hdm_filter_en(event)) + cfg = cpmu_config2_get_hdm_decoder(event); + else + cfg = GENMASK(15, 0); + writeq(cfg, base + CPMU_FILTER_CFG_REG(hwc->idx, 0)); + } + + cfg = readq(base + CPMU_COUNTER_CFG_REG(hwc->idx)); + cfg |= FIELD_PREP(CPMU_COUNTER_CFG_INT_ON_OVRFLW, 1); + cfg |= FIELD_PREP(CPMU_COUNTER_CFG_ENABLE, 1); + cfg |= FIELD_PREP(CPMU_COUNTER_CFG_EDGE, cpmu_config1_get_edge(event) ? 1 : 0); + cfg |= FIELD_PREP(CPMU_COUNTER_CFG_INVERT, cpmu_config1_get_invert(event) ? 1 : 0); + + /* Fixed purpose counters have next two fields RO */ + if (test_bit(hwc->idx, info->conf_counter_bm)) { + cfg |= FIELD_PREP(CPMU_COUNTER_CFG_EVENT_GRP_ID_IDX_MSK, hwc->event_base); + cfg |= FIELD_PREP(CPMU_COUNTER_CFG_EVENTS_MSK, cpmu_config_get_mask(event)); + } + cfg &= ~CPMU_COUNTER_CFG_THRESHOLD_MSK; + /* + * For events that generate only 1 count per clock the CXL 3.0 spec + * states the threshold shall be set to 1 but if set to 0 it will + * count the raw value anwyay? + * There is no definition of what events will count multiple per cycle + * and hence to which non 1 values of threshold can apply. + * (CXL 3.0 8.2.7.2.1 Counter Configuration - threshold field definition) + */ + cfg |= FIELD_PREP(CPMU_COUNTER_CFG_THRESHOLD_MSK, + cpmu_config1_get_threshold(event)); + writeq(cfg, base + CPMU_COUNTER_CFG_REG(hwc->idx)); + + local64_set(&hwc->prev_count, 0); + writeq(0, base + CPMU_COUNTER_REG(hwc->idx)); + + if (flags & PERF_EF_RELOAD) { + prev_cnt = local64_read(&hwc->prev_count); + writeq(prev_cnt, base + CPMU_COUNTER_REG(hwc->idx)); + } + + perf_event_update_userpage(event); +} + +static u64 cpmu_read_counter(struct perf_event *event) +{ + struct cpmu_info *info = pmu_to_cpmu_info(event->pmu); + void __iomem *base = info->base; + + return readq(base + CPMU_COUNTER_REG(event->hw.idx)); +} + +static void __cpmu_read(struct perf_event *event, bool overflow) +{ + struct cpmu_info *info = pmu_to_cpmu_info(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u64 new_cnt, prev_cnt, delta; + + do { + prev_cnt = local64_read(&hwc->prev_count); + new_cnt = cpmu_read_counter(event); + } while (local64_cmpxchg(&hwc->prev_count, prev_cnt, new_cnt) != prev_cnt); + + /* + * If we know an overflow occur then take that into account. + * Note counter is not reset as that would lose events + */ + delta = (new_cnt - prev_cnt) & GENMASK_ULL(info->counter_width - 1, 0); + if (overflow && delta < GENMASK_ULL(info->counter_width - 1, 0)) + delta += (1UL << info->counter_width); + + local64_add(delta, &event->count); +} + +static void cpmu_read(struct perf_event *event) +{ + __cpmu_read(event, false); +} + +static void cpmu_event_stop(struct perf_event *event, int flags) +{ + struct cpmu_info *info = pmu_to_cpmu_info(event->pmu); + void __iomem *base = info->base; + struct hw_perf_event *hwc = &event->hw; + u64 cfg; + + cpmu_read(event); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + cfg = readq(base + CPMU_COUNTER_CFG_REG(hwc->idx)); + cfg &= ~(FIELD_PREP(CPMU_COUNTER_CFG_INT_ON_OVRFLW, 1) | + FIELD_PREP(CPMU_COUNTER_CFG_ENABLE, 1)); + writeq(cfg, base + CPMU_COUNTER_CFG_REG(hwc->idx)); + + if (hwc->state & PERF_HES_UPTODATE) + return; + + hwc->state |= PERF_HES_UPTODATE; +} + +/* + * Reset ensures no possibility of any information leaking to wrong + * counter. Note that all fields written during start() + */ +static void cpmu_reset_counter(struct cpmu_info *info, int idx) +{ + void __iomem *base = info->base; + + /* Much of this register is read only */ + writeq(0, base + CPMU_EVENT_CAP_REG(idx)); + /* Filters are not per counter, so no reset here */ + writeq(0, base + CPMU_COUNTER_REG(idx)); +} + +static int cpmu_event_add(struct perf_event *event, int flags) +{ + struct cpmu_info *info = pmu_to_cpmu_info(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx, rc; + int event_idx = 0; + + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + + rc = cpmu_get_event_idx(event, &idx, &event_idx); + if (rc < 0) + return rc; + + hwc->idx = idx; + + /* Only set for configurable counters */ + hwc->event_base = event_idx; + info->hw_events[idx] = event; + set_bit(idx, info->used_counter_bm); + + cpmu_reset_counter(info, idx); + + if (flags & PERF_EF_START) + cpmu_event_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void cpmu_event_del(struct perf_event *event, int flags) +{ + struct cpmu_info *info = pmu_to_cpmu_info(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + cpmu_event_stop(event, PERF_EF_UPDATE); + clear_bit(hwc->idx, info->used_counter_bm); + info->hw_events[hwc->idx] = NULL; + perf_event_update_userpage(event); +} + +static irqreturn_t cpmu_irq(int irq, void *data) +{ + struct cpmu_info *info = data; + void __iomem *base = info->base; + u64 overflowed; + DECLARE_BITMAP(overflowedbm, 64); + int i; + + overflowed = readq(base + CPMU_OVERFLOW_REG); + + /* Interrupt may be shared, so maybe it isn't ours */ + if (!overflowed) + return IRQ_NONE; + + bitmap_from_arr64(overflowedbm, &overflowed, 64); + for_each_set_bit(i, overflowedbm, info->num_counters) { + struct perf_event *event = info->hw_events[i]; + + if (!event) { + dev_dbg(info->pmu.dev, + "overflow but on non enabled counter %d\n", i); + continue; + } + + __cpmu_read(event, true); + } + + writeq(overflowed, base + CPMU_OVERFLOW_REG); + + return IRQ_HANDLED; +} + +static int cxl_cpmu_probe(struct device *dev) +{ + struct cxl_cpmu *cpmu = to_cxl_cpmu(dev); + struct pci_dev *pdev = to_pci_dev(dev->parent); + struct cpmu_info *info; + char *irq_name; + int rc, irq; + + info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + INIT_LIST_HEAD(&info->events_fixed); + INIT_LIST_HEAD(&info->events_configurable); + + info->base = cpmu->base; + + info->on_cpu = -1; + rc = cpmu_parse_caps(dev, info); + if (rc) + return rc; + + info->hw_events = devm_kcalloc(dev, sizeof(*info->hw_events), + info->num_counters, GFP_KERNEL); + if (!info->hw_events) + return -ENOMEM; + + info->pmu = (struct pmu) { + .name = dev_name(dev), + .parent = dev, + .module = THIS_MODULE, + .event_init = cpmu_event_init, + .pmu_enable = cpmu_pmu_enable, + .pmu_disable = cpmu_pmu_disable, + .add = cpmu_event_add, + .del = cpmu_event_del, + .start = cpmu_event_start, + .stop = cpmu_event_stop, + .read = cpmu_read, + .task_ctx_nr = perf_invalid_context, + .attr_groups = cpmu_attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + }; + + if (info->irq <= 0) + return -EINVAL; + + rc = pci_irq_vector(pdev, info->irq); + if (rc < 0) + return rc; + irq = rc; + + irq_name = devm_kasprintf(dev, GFP_KERNEL, "%s_overflow\n", dev_name(dev)); + if (!irq_name) + return -ENOMEM; + + rc = devm_request_irq(dev, irq, cpmu_irq, IRQF_SHARED, irq_name, info); + if (rc) + return rc; + info->irq = irq; + + rc = cpuhp_state_add_instance(cpmu_cpuhp_state_num, &info->node); + if (rc) + return rc; + + rc = perf_pmu_register(&info->pmu, info->pmu.name, -1); + if (rc) + return rc; + + dev_set_drvdata(dev, info); + + return 0; +} + +static void cxl_cpmu_remove(struct device *dev) +{ + struct cpmu_info *info = dev_get_drvdata(dev); + + perf_pmu_unregister(&info->pmu); + cpuhp_state_remove_instance_nocalls(cpmu_cpuhp_state_num, &info->node); +} + +static struct cxl_driver cxl_cpmu_driver = { + .name = "cxl_cpmu", + .probe = cxl_cpmu_probe, + .remove = cxl_cpmu_remove, + .id = CXL_DEVICE_CPMU, +}; + +static int cpmu_online_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct cpmu_info *info = hlist_entry_safe(node, struct cpmu_info, node); + + if (info->on_cpu != -1) + return 0; + + info->on_cpu = cpu; + WARN_ON(irq_set_affinity(info->irq, cpumask_of(cpu))); + + return 0; +} + +static int cpmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct cpmu_info *info = hlist_entry_safe(node, struct cpmu_info, node); + unsigned int target; + + if (info->on_cpu != cpu) + return 0; + + info->on_cpu = -1; + target = cpumask_first(cpu_online_mask); + if (target >= nr_cpu_ids) { + dev_err(info->pmu.dev, "Unable to find a suitable CPU\n"); + return 0; + } + + perf_pmu_migrate_context(&info->pmu, cpu, target); + info->on_cpu = target; + WARN_ON(irq_set_affinity(info->irq, cpumask_of(target))); + + return 0; +} + +static __init int cxl_cpmu_init(void) +{ + int rc; + + rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "AP_PERF_CPMU_ONLINE", + cpmu_online_cpu, cpmu_offline_cpu); + if (rc < 0) + return rc; + cpmu_cpuhp_state_num = rc; + + rc = cxl_driver_register(&cxl_cpmu_driver); + if (rc) + cpuhp_remove_multi_state(cpmu_cpuhp_state_num); + + return rc; +} + +static __exit void cxl_cpmu_exit(void) +{ + cxl_driver_unregister(&cxl_cpmu_driver); + cpuhp_remove_multi_state(cpmu_cpuhp_state_num); +} + +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(CXL); +module_init(cxl_cpmu_init); +module_exit(cxl_cpmu_exit); +MODULE_ALIAS_CXL(CXL_DEVICE_CPMU);

[v3,4/5] cxl: CXL Performance Monitoring Unit driver

Commit Message

Patch