@@ -14,6 +14,7 @@ vc4-y := \
vc4_vec.o \
vc4_hvs.o \
vc4_irq.o \
+ vc4_perfmon.o \
vc4_plane.o \
vc4_render_cl.o \
vc4_trace_points.o \
@@ -102,6 +102,7 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
case DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER:
case DRM_VC4_PARAM_SUPPORTS_MADVISE:
case DRM_VC4_PARAM_SUPPORTS_EXTENDED_CL:
+ case DRM_VC4_PARAM_SUPPORTS_PERFMON:
args->value = true;
break;
default:
@@ -119,6 +120,26 @@ static void vc4_lastclose(struct drm_device *dev)
drm_fbdev_cma_restore_mode(vc4->fbdev);
}
+static int vc4_open(struct drm_device *dev, struct drm_file *file)
+{
+ struct vc4_file *vc4file;
+
+ vc4file = kzalloc(sizeof(*vc4file), GFP_KERNEL);
+ if (!vc4file)
+ return -ENOMEM;
+
+ vc4_perfmon_open_file(vc4file);
+ file->driver_priv = vc4file;
+ return 0;
+}
+
+static void vc4_close(struct drm_device *dev, struct drm_file *file)
+{
+ struct vc4_file *vc4file = file->driver_priv;
+
+ vc4_perfmon_close_file(vc4file);
+ kfree(vc4file);
+}
+
static const struct vm_operations_struct vc4_vm_ops = {
.fault = vc4_fault,
.open = drm_gem_vm_open,
@@ -151,6 +172,9 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
DRM_IOCTL_DEF_DRV(VC4_GET_TILING, vc4_get_tiling_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(VC4_LABEL_BO, vc4_label_bo_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(VC4_GEM_MADVISE, vc4_gem_madvise_ioctl, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(VC4_PERFMON_CREATE, vc4_perfmon_create_ioctl, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(VC4_PERFMON_DESTROY, vc4_perfmon_destroy_ioctl, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(VC4_PERFMON_GET_VALUES, vc4_perfmon_get_values_ioctl, DRM_RENDER_ALLOW),
};
static struct drm_driver vc4_drm_driver = {
@@ -161,6 +185,8 @@ static struct drm_driver vc4_drm_driver = {
DRIVER_RENDER |
DRIVER_PRIME),
.lastclose = vc4_lastclose,
+ .open = vc4_open,
+ .postclose = vc4_close,
.irq_handler = vc4_irq,
.irq_preinstall = vc4_irq_preinstall,
.irq_postinstall = vc4_irq_postinstall,
@@ -11,6 +11,8 @@
#include <drm/drm_encoder.h>
#include <drm/drm_gem_cma_helper.h>
+#include "uapi/drm/vc4_drm.h"
+
/* Don't forget to update vc4_bo.c: bo_type_names[] when adding to
* this.
*/
@@ -29,6 +31,13 @@ enum vc4_kernel_bo_type {
VC4_BO_TYPE_COUNT
};
+struct vc4_perfmon {
+ refcount_t refcnt;
+ u8 ncounters;
+ u8 events[DRM_VC4_MAX_PERF_COUNTERS];
+ u64 counters[0];
+};
+
struct vc4_dev {
struct drm_device *dev;
@@ -158,6 +167,8 @@ struct vc4_dev {
} hangcheck;
struct semaphore async_modeset;
+
+ bool perfmon_active;
};
static inline struct vc4_dev *
@@ -410,6 +421,22 @@ struct vc4_exec_info {
void *uniforms_v;
uint32_t uniforms_p;
uint32_t uniforms_size;
+
+ /* Pointer to a performance monitor object if the user requested it,
+ * NULL otherwise.
+ */
+ struct vc4_perfmon *perfmon;
+};
+
+/*
+ * Per-open file private data. Any driver-specific resource that has to be
+ * released when the DRM file is closed should be placed here.
+ */
+struct vc4_file {
+ struct {
+ struct idr idr;
+ struct mutex lock;
+ } perfmon;
};
static inline struct vc4_exec_info *
@@ -650,3 +677,19 @@ bool vc4_check_tex_size(struct vc4_exec_info *exec,
/* vc4_validate_shader.c */
struct vc4_validated_shader_info *
vc4_validate_shader(struct drm_gem_cma_object *shader_obj);
+
+/* vc4_perfmon.c */
+void vc4_perfmon_get(struct vc4_perfmon *perfmon);
+void vc4_perfmon_put(struct vc4_perfmon *perfmon);
+void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon);
+void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon,
+ bool capture);
+struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id);
+void vc4_perfmon_open_file(struct vc4_file *vc4file);
+void vc4_perfmon_close_file(struct vc4_file *vc4file);
+int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
@@ -454,6 +454,8 @@ vc4_submit_next_bin_job(struct drm_device *dev)
vc4_flush_caches(dev);
+ vc4_perfmon_start(vc4, exec->perfmon);
+
/* Either put the job in the binner if it uses the binner, or
* immediately move it to the to-be-rendered queue.
*/
@@ -646,11 +648,11 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
list_add_tail(&exec->head, &vc4->bin_job_list);
- /* If no job was executing, kick ours off. Otherwise, it'll
- * get started when the previous job's flush done interrupt
- * occurs.
+ /* If no job was executing and the previous job did not activate
+ * the performance monitor, kick ours off. Otherwise, it'll get
+ * started when the previous job's flush/render done interrupt occurs.
*/
- if (vc4_first_bin_job(vc4) == exec) {
+ if (vc4_first_bin_job(vc4) == exec && !vc4->perfmon_active) {
vc4_submit_next_bin_job(dev);
vc4_queue_hangcheck(dev);
}
@@ -913,6 +915,9 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
vc4->bin_alloc_used &= ~exec->bin_slots;
spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+ /* Release the reference we had on the perf monitor. */
+ vc4_perfmon_put(exec->perfmon);
+
mutex_lock(&vc4->power_lock);
if (--vc4->power_refcount == 0) {
pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
@@ -1050,7 +1055,8 @@ vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
}
static int
-vc4_parse_cl_chunk(struct vc4_dev *vc4, struct vc4_exec_info *exec,
+vc4_parse_cl_chunk(struct vc4_dev *vc4, struct vc4_file *vc4file,
+ struct vc4_exec_info *exec,
const union drm_vc4_submit_cl_chunk *chunk)
{
switch(chunk->dummy.type) {
@@ -1063,6 +1069,20 @@ vc4_parse_cl_chunk(struct vc4_dev *vc4, struct vc4_exec_info *exec,
exec->args->bin_cl_size = chunk->bin.size;
break;
+ case VC4_PERFMON_CHUNK:
+ if (chunk->perfmon.pad)
+ return -EINVAL;
+
+ if (exec->perfmon)
+ return -EINVAL;
+
+ exec->perfmon = vc4_perfmon_find(vc4file,
+ chunk->perfmon.id);
+ if (!exec->perfmon)
+ return -EINVAL;
+
+ break;
+
default:
return -EINVAL;
}
@@ -1087,6 +1107,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv)
{
struct vc4_dev *vc4 = to_vc4_dev(dev);
+ struct vc4_file *vc4file = file_priv->driver_priv;
struct drm_vc4_submit_cl *args = data;
struct vc4_exec_info *exec;
struct ww_acquire_ctx acquire_ctx;
@@ -1154,7 +1175,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
exec->args->bin_cl = 0;
exec->args->bin_cl_size = 0;
for (i = 0; i < exec->nchunks; i++) {
- ret = vc4_parse_cl_chunk(vc4, exec,
+ ret = vc4_parse_cl_chunk(vc4, vc4file, exec,
&exec->chunks[i]);
if (ret)
goto fail;
@@ -110,7 +110,9 @@ vc4_irq_finish_bin_job(struct drm_device *dev)
return;
vc4_move_job_to_render(dev, exec);
- vc4_submit_next_bin_job(dev);
+
+ if (!exec->perfmon)
+ vc4_submit_next_bin_job(dev);
}
static void
@@ -122,6 +124,7 @@ vc4_cancel_bin_job(struct drm_device *dev)
if (!exec)
return;
+ vc4_perfmon_stop(vc4, exec->perfmon, false);
list_move_tail(&exec->head, &vc4->bin_job_list);
vc4_submit_next_bin_job(dev);
}
@@ -135,6 +138,14 @@ vc4_irq_finish_render_job(struct drm_device *dev)
if (!exec)
return;
+ vc4_perfmon_stop(vc4, exec->perfmon, true);
+
+ /* perfmon may have stalled the binner, re-arm the dequeuing
+ * logic.
+ */
+ if (exec->perfmon)
+ vc4_submit_next_bin_job(dev);
+
vc4->finished_seqno++;
list_move_tail(&exec->head, &vc4->job_done_list);
if (exec->fence) {
new file mode 100644
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2017 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * DOC: VC4 V3D performance monitor module
+ *
+ * The V3D block provides 16 hardware counters which can count various events.
+ */
+
+#include "vc4_drv.h"
+#include "vc4_regs.h"
+
+void vc4_perfmon_get(struct vc4_perfmon *perfmon)
+{
+ if (perfmon)
+ refcount_inc(&perfmon->refcnt);
+}
+
+void vc4_perfmon_put(struct vc4_perfmon *perfmon)
+{
+ if (perfmon && refcount_dec_and_test(&perfmon->refcnt))
+ kfree(perfmon);
+}
+
+void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon)
+{
+ unsigned int i;
+ u32 mask;
+
+ if (!perfmon || WARN_ON(vc4->perfmon_active))
+ return;
+
+ for (i = 0; i < perfmon->ncounters; i++)
+ V3D_WRITE(V3D_PCTRS(i), perfmon->events[i]);
+
+ mask = GENMASK(perfmon->ncounters - 1, 0);
+ V3D_WRITE(V3D_PCTRE, V3D_PCTRE_EN | mask);
+ V3D_WRITE(V3D_PCTRC, mask);
+ vc4->perfmon_active = true;
+}
+
+void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon,
+ bool capture)
+{
+ unsigned int i;
+
+ if (!perfmon || WARN_ON(!vc4->perfmon_active))
+ return;
+
+ if (capture) {
+ for (i = 0; i < perfmon->ncounters; i++)
+ perfmon->counters[i] += V3D_READ(V3D_PCTR(i));
+ }
+
+ V3D_WRITE(V3D_PCTRE, 0);
+ vc4->perfmon_active = false;
+}
+
+struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id)
+{
+ struct vc4_perfmon *perfmon;
+
+ mutex_lock(&vc4file->perfmon.lock);
+ perfmon = idr_find(&vc4file->perfmon.idr, id);
+ vc4_perfmon_get(perfmon);
+ mutex_unlock(&vc4file->perfmon.lock);
+
+ return perfmon;
+}
+
+void vc4_perfmon_open_file(struct vc4_file *vc4file)
+{
+ mutex_init(&vc4file->perfmon.lock);
+ idr_init(&vc4file->perfmon.idr);
+}
+
+static int vc4_perfmon_idr_del(int id, void *elem, void *data)
+{
+ struct vc4_perfmon *perfmon = elem;
+
+ vc4_perfmon_put(perfmon);
+
+ return 0;
+}
+
+void vc4_perfmon_close_file(struct vc4_file *vc4file)
+{
+ mutex_lock(&vc4file->perfmon.lock);
+ idr_for_each(&vc4file->perfmon.idr, vc4_perfmon_idr_del, NULL);
+ idr_destroy(&vc4file->perfmon.idr);
+ mutex_unlock(&vc4file->perfmon.lock);
+}
+
+int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct vc4_file *vc4file = file_priv->driver_priv;
+ struct drm_vc4_perfmon_create *req = data;
+ struct vc4_perfmon *perfmon;
+ unsigned int i;
+ int ret;
+
+ /* Number of monitored counters cannot exceed HW limits. */
+ if (req->ncounters > DRM_VC4_MAX_PERF_COUNTERS ||
+ !req->ncounters)
+ return -EINVAL;
+
+ /* Make sure all events are valid. */
+ for (i = 0; i < req->ncounters; i++) {
+ if (req->events[i] >= VC4_PERFCNT_NUM_EVENTS)
+ return -EINVAL;
+ }
+
+ perfmon = kzalloc(sizeof(*perfmon) + (req->ncounters * sizeof(u64)),
+ GFP_KERNEL);
+ if (!perfmon)
+ return -ENOMEM;
+
+ for (i = 0; i < req->ncounters; i++)
+ perfmon->events[i] = req->events[i];
+
+ perfmon->ncounters = req->ncounters;
+
+ refcount_set(&perfmon->refcnt, 1);
+
+ mutex_lock(&vc4file->perfmon.lock);
+ ret = idr_alloc(&vc4file->perfmon.idr, perfmon, 0, INT_MAX,
+ GFP_KERNEL);
+ mutex_unlock(&vc4file->perfmon.lock);
+
+ if (ret < 0) {
+ kfree(perfmon);
+ return ret;
+ }
+
+ req->id = ret;
+ return 0;
+}
+
+int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct vc4_file *vc4file = file_priv->driver_priv;
+ struct drm_vc4_perfmon_destroy *req = data;
+ struct vc4_perfmon *perfmon;
+
+ mutex_lock(&vc4file->perfmon.lock);
+ perfmon = idr_remove(&vc4file->perfmon.idr, req->id);
+ mutex_unlock(&vc4file->perfmon.lock);
+
+ if (!perfmon)
+ return -EINVAL;
+
+ vc4_perfmon_put(perfmon);
+ return 0;
+}
+
+int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct vc4_file *vc4file = file_priv->driver_priv;
+ struct drm_vc4_perfmon_get_values *req = data;
+ struct vc4_perfmon *perfmon;
+ int ret;
+
+ mutex_lock(&vc4file->perfmon.lock);
+ perfmon = idr_find(&vc4file->perfmon.idr, req->id);
+ vc4_perfmon_get(perfmon);
+ mutex_unlock(&vc4file->perfmon.lock);
+
+ if (!perfmon)
+ return -EINVAL;
+
+ if (copy_to_user(u64_to_user_ptr(req->values_ptr), perfmon->counters,
+ perfmon->ncounters * sizeof(u64)))
+ ret = -EFAULT;
+ else
+ ret = 0;
+
+ vc4_perfmon_put(perfmon);
+ return ret;
+}
@@ -122,38 +122,9 @@
#define V3D_VPMBASE 0x00504
#define V3D_PCTRC 0x00670
#define V3D_PCTRE 0x00674
-#define V3D_PCTR0 0x00680
-#define V3D_PCTRS0 0x00684
-#define V3D_PCTR1 0x00688
-#define V3D_PCTRS1 0x0068c
-#define V3D_PCTR2 0x00690
-#define V3D_PCTRS2 0x00694
-#define V3D_PCTR3 0x00698
-#define V3D_PCTRS3 0x0069c
-#define V3D_PCTR4 0x006a0
-#define V3D_PCTRS4 0x006a4
-#define V3D_PCTR5 0x006a8
-#define V3D_PCTRS5 0x006ac
-#define V3D_PCTR6 0x006b0
-#define V3D_PCTRS6 0x006b4
-#define V3D_PCTR7 0x006b8
-#define V3D_PCTRS7 0x006bc
-#define V3D_PCTR8 0x006c0
-#define V3D_PCTRS8 0x006c4
-#define V3D_PCTR9 0x006c8
-#define V3D_PCTRS9 0x006cc
-#define V3D_PCTR10 0x006d0
-#define V3D_PCTRS10 0x006d4
-#define V3D_PCTR11 0x006d8
-#define V3D_PCTRS11 0x006dc
-#define V3D_PCTR12 0x006e0
-#define V3D_PCTRS12 0x006e4
-#define V3D_PCTR13 0x006e8
-#define V3D_PCTRS13 0x006ec
-#define V3D_PCTR14 0x006f0
-#define V3D_PCTRS14 0x006f4
-#define V3D_PCTR15 0x006f8
-#define V3D_PCTRS15 0x006fc
+# define V3D_PCTRE_EN BIT(31)
+#define V3D_PCTR(x) (0x00680 + ((x) * 8))
+#define V3D_PCTRS(x) (0x00684 + ((x) * 8))
#define V3D_DBGE 0x00f00
#define V3D_FDBGO 0x00f04
#define V3D_FDBGB 0x00f08
@@ -68,38 +68,38 @@ static const struct {
REGDEF(V3D_VPMBASE),
REGDEF(V3D_PCTRC),
REGDEF(V3D_PCTRE),
- REGDEF(V3D_PCTR0),
- REGDEF(V3D_PCTRS0),
- REGDEF(V3D_PCTR1),
- REGDEF(V3D_PCTRS1),
- REGDEF(V3D_PCTR2),
- REGDEF(V3D_PCTRS2),
- REGDEF(V3D_PCTR3),
- REGDEF(V3D_PCTRS3),
- REGDEF(V3D_PCTR4),
- REGDEF(V3D_PCTRS4),
- REGDEF(V3D_PCTR5),
- REGDEF(V3D_PCTRS5),
- REGDEF(V3D_PCTR6),
- REGDEF(V3D_PCTRS6),
- REGDEF(V3D_PCTR7),
- REGDEF(V3D_PCTRS7),
- REGDEF(V3D_PCTR8),
- REGDEF(V3D_PCTRS8),
- REGDEF(V3D_PCTR9),
- REGDEF(V3D_PCTRS9),
- REGDEF(V3D_PCTR10),
- REGDEF(V3D_PCTRS10),
- REGDEF(V3D_PCTR11),
- REGDEF(V3D_PCTRS11),
- REGDEF(V3D_PCTR12),
- REGDEF(V3D_PCTRS12),
- REGDEF(V3D_PCTR13),
- REGDEF(V3D_PCTRS13),
- REGDEF(V3D_PCTR14),
- REGDEF(V3D_PCTRS14),
- REGDEF(V3D_PCTR15),
- REGDEF(V3D_PCTRS15),
+ REGDEF(V3D_PCTR(0)),
+ REGDEF(V3D_PCTRS(0)),
+ REGDEF(V3D_PCTR(1)),
+ REGDEF(V3D_PCTRS(1)),
+ REGDEF(V3D_PCTR(2)),
+ REGDEF(V3D_PCTRS(2)),
+ REGDEF(V3D_PCTR(3)),
+ REGDEF(V3D_PCTRS(3)),
+ REGDEF(V3D_PCTR(4)),
+ REGDEF(V3D_PCTRS(4)),
+ REGDEF(V3D_PCTR(5)),
+ REGDEF(V3D_PCTRS(5)),
+ REGDEF(V3D_PCTR(6)),
+ REGDEF(V3D_PCTRS(6)),
+ REGDEF(V3D_PCTR(7)),
+ REGDEF(V3D_PCTRS(7)),
+ REGDEF(V3D_PCTR(8)),
+ REGDEF(V3D_PCTRS(8)),
+ REGDEF(V3D_PCTR(9)),
+ REGDEF(V3D_PCTRS(9)),
+ REGDEF(V3D_PCTR(10)),
+ REGDEF(V3D_PCTRS(10)),
+ REGDEF(V3D_PCTR(11)),
+ REGDEF(V3D_PCTRS(11)),
+ REGDEF(V3D_PCTR(12)),
+ REGDEF(V3D_PCTRS(12)),
+ REGDEF(V3D_PCTR(13)),
+ REGDEF(V3D_PCTRS(13)),
+ REGDEF(V3D_PCTR(14)),
+ REGDEF(V3D_PCTRS(14)),
+ REGDEF(V3D_PCTR(15)),
+ REGDEF(V3D_PCTRS(15)),
REGDEF(V3D_DBGE),
REGDEF(V3D_FDBGO),
REGDEF(V3D_FDBGB),
@@ -42,6 +42,9 @@ extern "C" {
#define DRM_VC4_GET_TILING 0x09
#define DRM_VC4_LABEL_BO 0x0a
#define DRM_VC4_GEM_MADVISE 0x0b
+#define DRM_VC4_PERFMON_CREATE 0x0c
+#define DRM_VC4_PERFMON_DESTROY 0x0d
+#define DRM_VC4_PERFMON_GET_VALUES 0x0e
#define DRM_IOCTL_VC4_SUBMIT_CL DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
#define DRM_IOCTL_VC4_WAIT_SEQNO DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
@@ -55,6 +58,9 @@ extern "C" {
#define DRM_IOCTL_VC4_GET_TILING DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_TILING, struct drm_vc4_get_tiling)
#define DRM_IOCTL_VC4_LABEL_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_LABEL_BO, struct drm_vc4_label_bo)
#define DRM_IOCTL_VC4_GEM_MADVISE DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GEM_MADVISE, struct drm_vc4_gem_madvise)
+#define DRM_IOCTL_VC4_PERFMON_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_PERFMON_CREATE, struct drm_vc4_perfmon_create)
+#define DRM_IOCTL_VC4_PERFMON_DESTROY DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_PERFMON_DESTROY, struct drm_vc4_perfmon_destroy)
+#define DRM_IOCTL_VC4_PERFMON_GET_VALUES DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_PERFMON_GET_VALUES, struct drm_vc4_perfmon_get_values)
struct drm_vc4_submit_rcl_surface {
__u32 hindex; /* Handle index, or ~0 if not present. */
@@ -71,9 +77,11 @@ struct drm_vc4_submit_rcl_surface {
/**
* @VC4_BIN_CL_CHUNK: binner CL chunk
+ * @VC4_PERFMON_CHUNK: performance monitor chunk
*/
enum {
VC4_BIN_CL_CHUNK,
+ VC4_PERFMON_CHUNK,
};
/**
@@ -102,6 +110,20 @@ struct drm_vc4_submit_cl_bin_chunk {
};
/**
+ * struct drm_vc4_submit_cl_perfmon_chunk - performance monitor extension
+ *
+ * @type: extension type, should be set to %VC4_PERFMON_CHUNK
+ * @id: ID of the performance monitor previously allocated with
+ * %DRM_IOCTL_VC4_PERFMON_CREATE
+ * @pad: unused, should be set to zero
+ */
+struct drm_vc4_submit_cl_perfmon_chunk {
+ __u32 type;
+ __u32 id;
+ __u64 pad;
+};
+
+/**
* union drm_vc4_submit_cl_chunk - CL chunk
*
* CL chunks allow us to easily extend the set of arguments one can pass
@@ -111,6 +133,7 @@ struct drm_vc4_submit_cl_bin_chunk {
union drm_vc4_submit_cl_chunk {
struct drm_vc4_submit_cl_dummy_chunk dummy;
struct drm_vc4_submit_cl_bin_chunk bin;
+ struct drm_vc4_submit_cl_perfmon_chunk perfmon;
};
/**
@@ -369,6 +392,7 @@ struct drm_vc4_get_hang_state {
#define DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER 6
#define DRM_VC4_PARAM_SUPPORTS_MADVISE 7
#define DRM_VC4_PARAM_SUPPORTS_EXTENDED_CL 8
+#define DRM_VC4_PARAM_SUPPORTS_PERFMON 9
struct drm_vc4_get_param {
__u32 param;
@@ -413,6 +437,57 @@ struct drm_vc4_gem_madvise {
__u32 pad;
};
+enum {
+ VC4_PERFCNT_FEP_VALID_PRIMS_NO_RENDER,
+ VC4_PERFCNT_FEP_VALID_PRIMS_RENDER,
+ VC4_PERFCNT_FEP_CLIPPED_QUADS,
+ VC4_PERFCNT_FEP_VALID_QUADS,
+ VC4_PERFCNT_TLB_QUADS_NOT_PASSING_STENCIL,
+ VC4_PERFCNT_TLB_QUADS_NOT_PASSING_Z_AND_STENCIL,
+ VC4_PERFCNT_TLB_QUADS_PASSING_Z_AND_STENCIL,
+ VC4_PERFCNT_TLB_QUADS_ZERO_COVERAGE,
+ VC4_PERFCNT_TLB_QUADS_NON_ZERO_COVERAGE,
+ VC4_PERFCNT_TLB_QUADS_WRITTEN_TO_COLOR_BUF,
+ VC4_PERFCNT_PLB_PRIMS_OUTSIDE_VIEWPORT,
+ VC4_PERFCNT_PLB_PRIMS_NEED_CLIPPING,
+ VC4_PERFCNT_PSE_PRIMS_REVERSED,
+ VC4_PERFCNT_QPU_TOTAL_IDLE_CYCLES,
+ VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_VERTEX_COORD_SHADING,
+ VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_FRAGMENT_SHADING,
+ VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_EXEC_VALID_INST,
+ VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_TMUS,
+ VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_SCOREBOARD,
+ VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_VARYINGS,
+ VC4_PERFCNT_QPU_TOTAL_INST_CACHE_HIT,
+ VC4_PERFCNT_QPU_TOTAL_INST_CACHE_MISS,
+ VC4_PERFCNT_QPU_TOTAL_UNIFORM_CACHE_HIT,
+ VC4_PERFCNT_QPU_TOTAL_UNIFORM_CACHE_MISS,
+ VC4_PERFCNT_TMU_TOTAL_TEXT_QUADS_PROCESSED,
+ VC4_PERFCNT_TMU_TOTAL_TEXT_CACHE_MISS,
+ VC4_PERFCNT_VPM_TOTAL_CLK_CYCLES_VDW_STALLED,
+ VC4_PERFCNT_VPM_TOTAL_CLK_CYCLES_VCD_STALLED,
+ VC4_PERFCNT_L2C_TOTAL_L2_CACHE_HIT,
+ VC4_PERFCNT_L2C_TOTAL_L2_CACHE_MISS,
+ VC4_PERFCNT_NUM_EVENTS,
+};
+
+#define DRM_VC4_MAX_PERF_COUNTERS 16
+
+struct drm_vc4_perfmon_create {
+ __u32 id;
+ __u32 ncounters;
+ __u8 events[DRM_VC4_MAX_PERF_COUNTERS];
+};
+
+struct drm_vc4_perfmon_destroy {
+ __u32 id;
+};
+
+struct drm_vc4_perfmon_get_values {
+ __u32 id;
+ __u64 values_ptr;
+};
+
#if defined(__cplusplus)
}
#endif
The V3D engine has various hardware counters which might be interesting
to userspace performance analysis tools. Expose new ioctls to
create/destroy a performance monitor object and to query the counter
values of such a performance monitor.

Note that a performance monitor is given an ID that is only valid on the
file descriptor it has been allocated from. A performance monitor can be
attached to a CL submission; the driver then enables the HW counters for
this request and updates the performance monitor values at the end of
the job.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/gpu/drm/vc4/Makefile      |   1 +
 drivers/gpu/drm/vc4/vc4_drv.c     |  26 +++++
 drivers/gpu/drm/vc4/vc4_drv.h     |  43 +++++++++
 drivers/gpu/drm/vc4/vc4_gem.c     |  33 +++++--
 drivers/gpu/drm/vc4/vc4_irq.c     |  13 ++-
 drivers/gpu/drm/vc4/vc4_perfmon.c | 195 ++++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/vc4/vc4_regs.h    |  35 +------
 drivers/gpu/drm/vc4/vc4_v3d.c     |  64 ++++++-------
 include/uapi/drm/vc4_drm.h        |  75 +++++++++++++++
 9 files changed, 414 insertions(+), 71 deletions(-)
 create mode 100644 drivers/gpu/drm/vc4/vc4_perfmon.c
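
For reviewers, here is a minimal userspace sketch of how the new interface
could be exercised. It is illustrative only and not part of the patch: it
assumes the updated uapi header is installed as <drm/vc4_drm.h> and that the
perfmon chunk is placed into the extended-CL chunk array introduced by the
earlier EXTENDED_CL work (not shown here); the helper names and the two
selected events are arbitrary.

/* Illustrative userspace sketch, not part of the patch. */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/vc4_drm.h>

/* Create a monitor counting two QPU-related events. */
static int perfmon_create(int fd, uint32_t *id)
{
	struct drm_vc4_perfmon_create create;

	memset(&create, 0, sizeof(create));
	create.ncounters = 2;
	create.events[0] = VC4_PERFCNT_QPU_TOTAL_IDLE_CYCLES;
	create.events[1] = VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_FRAGMENT_SHADING;

	if (ioctl(fd, DRM_IOCTL_VC4_PERFMON_CREATE, &create))
		return -1;

	*id = create.id;
	return 0;
}

/* Fill the chunk that attaches the monitor to a CL submission; the caller
 * places it in the extended-CL chunk array of drm_vc4_submit_cl.
 */
static void perfmon_fill_chunk(union drm_vc4_submit_cl_chunk *chunk,
			       uint32_t id)
{
	memset(chunk, 0, sizeof(*chunk));
	chunk->perfmon.type = VC4_PERFMON_CHUNK;
	chunk->perfmon.id = id;
}

/* After the job has completed, read back the accumulated values and release
 * the monitor.
 */
static int perfmon_collect(int fd, uint32_t id, uint64_t values[2])
{
	struct drm_vc4_perfmon_get_values get;
	struct drm_vc4_perfmon_destroy destroy;
	int ret;

	memset(&get, 0, sizeof(get));
	get.id = id;
	get.values_ptr = (uintptr_t)values;
	ret = ioctl(fd, DRM_IOCTL_VC4_PERFMON_GET_VALUES, &get);

	memset(&destroy, 0, sizeof(destroy));
	destroy.id = id;
	ioctl(fd, DRM_IOCTL_VC4_PERFMON_DESTROY, &destroy);

	return ret;
}

Since the kernel holds its own reference on the monitor for the lifetime of
the job and accumulates the HW counters into it at render-done time, the
values read back after the job completes include that submission's counts.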