From patchwork Sun Aug  4 17:23:43 2013
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Rob Clark <robdclark@gmail.com>
X-Patchwork-Id: 2838449
Return-Path: <linux-arm-msm-owner@kernel.org>
X-Original-To: patchwork-linux-arm-msm@patchwork.kernel.org
Delivered-To: patchwork-parsemail@patchwork2.web.kernel.org
Received: from mail.kernel.org (mail.kernel.org [198.145.19.201])
	by patchwork2.web.kernel.org (Postfix) with ESMTP id 6DAB9BF535
	for <patchwork-linux-arm-msm@patchwork.kernel.org>;
	Sun,  4 Aug 2013 17:24:54 +0000 (UTC)
Received: from mail.kernel.org (localhost [127.0.0.1])
	by mail.kernel.org (Postfix) with ESMTP id B61D620160
	for <patchwork-linux-arm-msm@patchwork.kernel.org>;
	Sun,  4 Aug 2013 17:24:52 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [209.132.180.67])
	by mail.kernel.org (Postfix) with ESMTP id C512820176
	for <patchwork-linux-arm-msm@patchwork.kernel.org>;
	Sun,  4 Aug 2013 17:24:49 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1752833Ab3HDRYr (ORCPT
	<rfc822;patchwork-linux-arm-msm@patchwork.kernel.org>);
	Sun, 4 Aug 2013 13:24:47 -0400
Received: from mail-qe0-f41.google.com ([209.85.128.41]:58994 "EHLO
	mail-qe0-f41.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1751980Ab3HDRYr (ORCPT
	<rfc822;linux-arm-msm@vger.kernel.org>);
	Sun, 4 Aug 2013 13:24:47 -0400
Received: by mail-qe0-f41.google.com with SMTP id a11so1314704qen.0
	for <linux-arm-msm@vger.kernel.org>;
	Sun, 04 Aug 2013 10:24:46 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
	d=gmail.com; s=20120113;
	h=from:to:cc:subject:date:message-id:x-mailer:in-reply-to:references;
	bh=nL7WhHu34mLmPAaXUNGPIjP7gsViJq/WQZeBm5bSHUU=;
	b=gnHE/koMDvM4icOPfR5Us8R0K2ZB3GBM5IpksYHbNXDt/RBSd0zzd/eT0aWui8w6QH
	TCbOrg1L5elqX1JTaC9tHFDJ7ytU3jvf14u/aZpqDAXYMwzSL8FL6s0D3HCX7YUDOt8B
	DausekCp06FWlo6KN0bksDIfhlZoK9n+FuyTJUp3X62Ve7LpumZxmwlCc5qFLRMUV8mD
	JcX9Zi4pcNz2YdIWQS+1cDdTFK6RDoeyeeBXwEepYLfT6RKx2cFCQKz/ZLjKYQknI3Jk
	bPIkWfW5fC010IVQeeeZZrbRyWxOofRJePKIcaNEGz2Q6KF7jQmsCPaGNIlZqpOuLn6D
	STAQ==
X-Received: by 10.49.35.108 with SMTP id g12mr22032424qej.86.1375637086267;
	Sun, 04 Aug 2013 10:24:46 -0700 (PDT)
Received: from localhost (pool-108-20-246-35.bstnma.east.verizon.net.
	[108.20.246.35])
	by mx.google.com with ESMTPSA id a2sm1034534qek.7.2013.08.04.10.24.44
	for <multiple recipients>
	(version=TLSv1.2 cipher=RC4-SHA bits=128/128);
	Sun, 04 Aug 2013 10:24:45 -0700 (PDT)
From: Rob Clark <robdclark@gmail.com>
To: dri-devel@lists.freedesktop.org
Cc: linux-arm-msm@vger.kernel.org, Rob Clark <robdclark@gmail.com>
Subject: [PATCH 6/6] RFC: drm/msm: add perf logging debugfs
Date: Sun,  4 Aug 2013 13:23:43 -0400
Message-Id: <1375637023-6042-7-git-send-email-robdclark@gmail.com>
X-Mailer: git-send-email 1.8.3.1
In-Reply-To: <1375637023-6042-1-git-send-email-robdclark@gmail.com>
References: <1375637023-6042-1-git-send-email-robdclark@gmail.com>
Sender: linux-arm-msm-owner@vger.kernel.org
Precedence: bulk
List-ID: <linux-arm-msm.vger.kernel.org>
X-Mailing-List: linux-arm-msm@vger.kernel.org
X-Spam-Status: No, score=-6.8 required=5.0 tests=BAYES_00,
	DKIM_ADSP_CUSTOM_MED,
	DKIM_SIGNED, FREEMAIL_FROM, RCVD_IN_DNSWL_HI, RP_MATCHES_RCVD,
	T_DKIM_INVALID, UNPARSEABLE_RELAY autolearn=ham version=3.3.1
X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on mail.kernel.org
X-Virus-Scanned: ClamAV using ClamSMTP

---
 drivers/gpu/drm/msm/Makefile            |   1 +
 drivers/gpu/drm/msm/adreno/a3xx_gpu.c   |  20 ++-
 drivers/gpu/drm/msm/adreno/adreno_gpu.c |   4 +-
 drivers/gpu/drm/msm/adreno/adreno_gpu.h |   2 -
 drivers/gpu/drm/msm/msm_drv.c           |  15 +-
 drivers/gpu/drm/msm/msm_drv.h           |   4 +
 drivers/gpu/drm/msm/msm_gpu.c           | 110 +++++++++++++
 drivers/gpu/drm/msm/msm_gpu.h           |  33 ++++
 drivers/gpu/drm/msm/msm_perf.c          | 270 ++++++++++++++++++++++++++++++++
 9 files changed, 448 insertions(+), 11 deletions(-)
 create mode 100644 drivers/gpu/drm/msm/msm_perf.c
diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile
index f945179..a19c731 100644
--- a/drivers/gpu/drm/msm/Makefile
+++ b/drivers/gpu/drm/msm/Makefile
@@ -23,6 +23,7 @@ msm-y := \
 	msm_gem.o \
 	msm_gem_submit.o \
 	msm_gpu.o \
+	msm_perf.o \
 	msm_rd.o \
 	msm_ringbuffer.o
 
diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
index 13d61bb..2e2950f 100644
--- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
@@ -174,11 +174,11 @@ static int a3xx_hw_init(struct msm_gpu *gpu)
 	/* Turn on performance counters: */
 	gpu_write(gpu, REG_A3XX_RBBM_PERFCTR_CTL, 0x01);
 
-	/* Set SP perfcounter 7 to count SP_FS_FULL_ALU_INSTRUCTIONS
-	 * we will use this to augment our hang detection:
-	 */
-	gpu_write(gpu, REG_A3XX_SP_PERFCOUNTER7_SELECT,
-			SP_FS_FULL_ALU_INSTRUCTIONS);
+	/* Enable the perfcntrs that we use.. */
+	for (i = 0; i < gpu->num_perfcntrs; i++) {
+		const struct msm_gpu_perfcntr *perfcntr = &gpu->perfcntrs[i];
+		gpu_write(gpu, perfcntr->select_reg, perfcntr->select_val);
+	}
 
 	gpu_write(gpu, REG_A3XX_RBBM_INT_0_MASK, A3XX_INT0_MASK);
 
@@ -383,6 +383,13 @@ static const struct adreno_gpu_funcs funcs = {
 	},
 };
 
+static const struct msm_gpu_perfcntr perfcntrs[] = {
+	{ REG_A3XX_SP_PERFCOUNTER6_SELECT, REG_A3XX_RBBM_PERFCTR_SP_6_LO,
+			SP_ALU_ACTIVE_CYCLES, "ALUACTIVE" },
+	{ REG_A3XX_SP_PERFCOUNTER7_SELECT, REG_A3XX_RBBM_PERFCTR_SP_7_LO,
+			SP_FS_FULL_ALU_INSTRUCTIONS, "ALUFULL" },
+};
+
 struct msm_gpu *a3xx_gpu_init(struct drm_device *dev)
 {
 	struct a3xx_gpu *a3xx_gpu = NULL;
@@ -417,6 +424,9 @@ struct msm_gpu *a3xx_gpu_init(struct drm_device *dev)
 	DBG("fast_rate=%u, slow_rate=%u, bus_freq=%u",
 			gpu->fast_rate, gpu->slow_rate, gpu->bus_freq);
 
+	gpu->perfcntrs = perfcntrs;
+	gpu->num_perfcntrs = ARRAY_SIZE(perfcntrs);
+
 	ret = adreno_gpu_init(dev, pdev, &a3xx_gpu->base,
 			&funcs, config->rev);
 	if (ret)
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index 282163e..39a65af 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -119,8 +119,6 @@ int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 	struct msm_ringbuffer *ring = gpu->rb;
 	unsigned i, ibs = 0;
 
-	adreno_gpu->last_fence = submit->fence;
-
 	for (i = 0; i < submit->nr_cmds; i++) {
 		switch (submit->cmd[i].type) {
 		case MSM_SUBMIT_CMD_IB_TARGET_BUF:
@@ -225,7 +223,7 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m)
 			adreno_gpu->rev.patchid);
 
 	seq_printf(m, "fence:    %d/%d\n", adreno_gpu->memptrs->fence,
-			adreno_gpu->last_fence);
+			gpu->submitted_fence);
 	seq_printf(m, "rptr:     %d\n", adreno_gpu->memptrs->rptr);
 	seq_printf(m, "wptr:     %d\n", adreno_gpu->memptrs->wptr);
 	seq_printf(m, "rb wptr:  %d\n", get_wptr(gpu->rb));
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
index 6b49c4f..1190a70 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
@@ -54,8 +54,6 @@ struct adreno_gpu {
 	uint32_t revn;  /* numeric revision name */
 	const struct adreno_gpu_funcs *funcs;
 
-	uint32_t last_fence;
-
 	/* firmware: */
 	const struct firmware *pm4, *pfp;
 
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index adcc68b..58f47a5 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -459,12 +459,25 @@ static int msm_debugfs_init(struct drm_minor *minor)
 		return ret;
 	}
 
-	return msm_rd_debugfs_init(minor);
+	ret = msm_rd_debugfs_init(minor);
+	if (ret) {
+		dev_err(dev->dev, "could not install rd debugfs\n");
+		return ret;
+	}
+
+	ret = msm_perf_debugfs_init(minor);
+	if (ret) {
+		dev_err(dev->dev, "could not install perf debugfs\n");
+		return ret;
+	}
+
+	return 0;
 }
 
 static void msm_debugfs_cleanup(struct drm_minor *minor)
 {
 	msm_rd_debugfs_cleanup(minor);
+	msm_perf_debugfs_cleanup(minor);
 	drm_debugfs_remove_files(msm_debugfs_list,
 			ARRAY_SIZE(msm_debugfs_list), minor);
 }
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index ea6f1c7..af91cab 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -45,6 +45,7 @@
 struct msm_kms;
 struct msm_gpu;
 struct msm_rd_state;
+struct msm_perf_state;
 struct msm_gem_submit;
 
 #define NUM_DOMAINS 2    /* one for KMS, then one per gpu core (?) */
@@ -71,6 +72,7 @@ struct msm_drm_private {
 	wait_queue_head_t fence_event;
 
 	struct msm_rd_state *rd;
+	struct msm_perf_state *perf;
 
 	/* list of GEM objects: */
 	struct list_head inactive_list;
@@ -186,6 +188,8 @@ void msm_framebuffer_describe(struct drm_framebuffer *fb, struct seq_file *m);
 int msm_rd_debugfs_init(struct drm_minor *minor);
 void msm_rd_debugfs_cleanup(struct drm_minor *minor);
 void msm_rd_dump_submit(struct msm_gem_submit *submit);
+int msm_perf_debugfs_init(struct drm_minor *minor);
+void msm_perf_debugfs_cleanup(struct drm_minor *minor);
 #else
 static inline void msm_rd_dump_submit(struct msm_gem_submit *submit) {}
 #endif
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 4775415..735d6d9 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -203,6 +203,107 @@ int msm_gpu_pm_suspend(struct msm_gpu *gpu)
 }
 
 /*
+ * Performance Counters:
+ */
+
+static bool msm_gpu_active(struct msm_gpu *gpu)
+{
+	return gpu->submitted_fence > gpu->funcs->last_fence(gpu);
+}
+
+/* called under perf_lock */
+static int update_hw_cntrs(struct msm_gpu *gpu, uint32_t ncntrs, uint32_t *cntrs)
+{
+	uint32_t current_cntrs[ARRAY_SIZE(gpu->last_cntrs)];
+	int i, n = min(ncntrs, gpu->num_perfcntrs);
+
+	/* read current values: */
+	for (i = 0; i < gpu->num_perfcntrs; i++)
+		current_cntrs[i] = gpu_read(gpu, gpu->perfcntrs[i].sample_reg);
+
+	/* update cntrs: */
+	for (i = 0; i < n; i++)
+		cntrs[i] = current_cntrs[i] - gpu->last_cntrs[i];
+
+	/* save current values: */
+	for (i = 0; i < gpu->num_perfcntrs; i++)
+		gpu->last_cntrs[i] = current_cntrs[i];
+
+	return n;
+}
+
+/* called under perf_lock */
+static void update_sw_cntrs(struct msm_gpu *gpu)
+{
+	ktime_t time;
+	uint32_t elapsed;
+	unsigned long flags;
+
+	spin_lock_irqsave(&gpu->perf_lock, flags);
+	if (!gpu->perfcntr_active)
+		goto out;
+
+	time = ktime_get();
+	elapsed = ktime_to_us(ktime_sub(time, gpu->last_sample.time));
+
+	gpu->totaltime += elapsed;
+	if (gpu->last_sample.active)
+		gpu->activetime += elapsed;
+
+	gpu->last_sample.active = msm_gpu_active(gpu);
+	gpu->last_sample.time = time;
+
+out:
+	spin_unlock_irqrestore(&gpu->perf_lock, flags);
+}
+
+void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&gpu->perf_lock, flags);
+	/* we could dynamically enable/disable perfcntr registers too.. */
+	gpu->last_sample.active = msm_gpu_active(gpu);
+	gpu->last_sample.time = ktime_get();
+	gpu->activetime = gpu->totaltime = 0;
+	gpu->perfcntr_active = true;
+	update_hw_cntrs(gpu, 0, NULL);
+	spin_unlock_irqrestore(&gpu->perf_lock, flags);
+}
+
+void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
+{
+	gpu->perfcntr_active = false;
+}
+
+/* returns -errno or # of cntrs sampled */
+int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
+		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&gpu->perf_lock, flags);
+
+	if (!gpu->perfcntr_active) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	*activetime = gpu->activetime;
+	*totaltime = gpu->totaltime;
+
+	gpu->activetime = gpu->totaltime = 0;
+
+	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
+
+out:
+	spin_unlock_irqrestore(&gpu->perf_lock, flags);
+
+	return ret;
+}
+
+/*
  * Cmdstream submission/retirement:
  */
 
@@ -240,6 +341,7 @@ void msm_gpu_retire(struct msm_gpu *gpu)
 {
 	struct msm_drm_private *priv = gpu->dev->dev_private;
 	queue_work(priv->wq, &gpu->retire_work);
+	update_sw_cntrs(gpu);
 }
 
 /* add bo's to gpu's ring, and kick gpu: */
@@ -256,6 +358,10 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 
 	msm_rd_dump_submit(submit);
 
+	gpu->submitted_fence = submit->fence;
+
+	update_sw_cntrs(gpu);
+
 	ret = gpu->funcs->submit(gpu, submit, ctx);
 	priv->lastctx = ctx;
 
@@ -304,12 +410,16 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 	struct resource *res;
 	int i, ret;
 
+	if (WARN_ON(gpu->num_perfcntrs > ARRAY_SIZE(gpu->last_cntrs)))
+		gpu->num_perfcntrs = ARRAY_SIZE(gpu->last_cntrs);
+
 	gpu->dev = drm;
 	gpu->funcs = funcs;
 	gpu->name = name;
 
 	INIT_LIST_HEAD(&gpu->active_list);
 	INIT_WORK(&gpu->retire_work, retire_worker);
+	spin_lock_init(&gpu->perf_lock);
 
 	BUG_ON(ARRAY_SIZE(clk_names) != ARRAY_SIZE(gpu->grp_clks));
 
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 8d2cd6c..b537f7c 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -25,6 +25,7 @@
 #include "msm_ringbuffer.h"
 
 struct msm_gem_submit;
+struct msm_gpu_perfcntr;
 
 /* So far, with hardware that I've seen to date, we can have:
  *  + zero, one, or two z180 2d cores
@@ -63,12 +64,26 @@ struct msm_gpu {
 	struct drm_device *dev;
 	const struct msm_gpu_funcs *funcs;
 
+	/* performance counters (hw & sw): */
+	spinlock_t perf_lock;
+	bool perfcntr_active;
+	struct {
+		bool active;
+		ktime_t time;
+	} last_sample;
+	uint32_t totaltime, activetime;    /* sw counters */
+	uint32_t last_cntrs[5];            /* hw counters */
+	const struct msm_gpu_perfcntr *perfcntrs;
+	uint32_t num_perfcntrs;
+
 	struct msm_ringbuffer *rb;
 	uint32_t rb_iova;
 
 	/* list of GEM active objects: */
 	struct list_head active_list;
 
+	uint32_t submitted_fence;
+
 	/* worker for handling active-list retiring: */
 	struct work_struct retire_work;
 
@@ -85,6 +100,19 @@ struct msm_gpu {
 	uint32_t bsc;
 };
 
+/* Perf-Counters:
+ * The select_reg and select_val are just there for the benefit of the child
+ * class that actually enables the perf counter..  but msm_gpu base class
+ * will handle sampling/displaying the counters.
+ */
+
+struct msm_gpu_perfcntr {
+	uint32_t select_reg;
+	uint32_t sample_reg;
+	uint32_t select_val;
+	const char *name;
+};
+
 static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
 {
 	msm_writel(data, gpu->mmio + (reg << 2));
@@ -98,6 +126,11 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
 int msm_gpu_pm_suspend(struct msm_gpu *gpu);
 int msm_gpu_pm_resume(struct msm_gpu *gpu);
 
+void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
+void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
+int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
+		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
+
 void msm_gpu_retire(struct msm_gpu *gpu);
 int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 		struct msm_file_private *ctx);
diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
new file mode 100644
index 0000000..1937bd2
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_perf.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* For profiling, userspace can:
+ *
+ *   tail -f /sys/kernel/debug/dri/<minor>/gpu
+ *
+ * This will enable performance counters/profiling to track the busy time
+ * and any gpu specific performance counters that are supported.
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/debugfs.h>
+
+#include "msm_drv.h"
+#include "msm_gpu.h"
+
+struct msm_perf_state {
+	struct drm_device *dev;
+
+	bool open;
+	int cnt;
+	struct mutex read_lock;
+
+	char buf[256];
+	int buftot, bufpos;
+
+	unsigned long next_jiffies;
+
+	struct dentry *ent;
+	struct drm_info_node *node;
+};
+
+#define SAMPLE_TIME (HZ/4)
+
+/* wait for next sample time: */
+static int wait_sample(struct msm_perf_state *perf)
+{
+	unsigned long start_jiffies = jiffies;
+
+	if (time_after(perf->next_jiffies, start_jiffies)) {
+		unsigned long remaining_jiffies =
+			perf->next_jiffies - start_jiffies;
+		int ret = schedule_timeout_interruptible(remaining_jiffies);
+		if (ret > 0) {
+			/* interrupted */
+			return -ERESTARTSYS;
+		}
+	}
+	perf->next_jiffies += SAMPLE_TIME;
+	return 0;
+}
+
+static int refill_buf(struct msm_perf_state *perf)
+{
+	struct msm_drm_private *priv = perf->dev->dev_private;
+	struct msm_gpu *gpu = priv->gpu;
+	char *ptr = perf->buf;
+	int rem = sizeof(perf->buf);
+	int i, n;
+
+	if ((perf->cnt++ % 32) == 0) {
+		/* Header line: */
+		n = snprintf(ptr, rem, "%%BUSY");
+		ptr += n;
+		rem -= n;
+
+		for (i = 0; i < gpu->num_perfcntrs; i++) {
+			const struct msm_gpu_perfcntr *perfcntr = &gpu->perfcntrs[i];
+			n = snprintf(ptr, rem, "\t%s", perfcntr->name);
+			ptr += n;
+			rem -= n;
+		}
+	} else {
+		/* Sample line: */
+		uint32_t activetime = 0, totaltime = 0;
+		uint32_t cntrs[5];
+		uint32_t val;
+		int ret;
+
+		/* sleep until next sample time: */
+		ret = wait_sample(perf);
+		if (ret)
+			return ret;
+
+		ret = msm_gpu_perfcntr_sample(gpu, &activetime, &totaltime,
+				ARRAY_SIZE(cntrs), cntrs);
+		if (ret < 0)
+			return ret;
+
+		val = totaltime ? 1000 * activetime / totaltime : 0;
+		n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
+		ptr += n;
+		rem -= n;
+
+		for (i = 0; i < ret; i++) {
+			/* cycle counters (I think).. convert to MHz.. */
+			val = cntrs[i] / 10000;
+			n = snprintf(ptr, rem, "\t%5d.%02d",
+					val / 100, val % 100);
+			ptr += n;
+			rem -= n;
+		}
+	}
+
+	n = snprintf(ptr, rem, "\n");
+	ptr += n;
+	rem -= n;
+
+	perf->bufpos = 0;
+	perf->buftot = ptr - perf->buf;
+
+	return 0;
+}
+
+static ssize_t perf_read(struct file *file, char __user *buf,
+		size_t sz, loff_t *ppos)
+{
+	struct msm_perf_state *perf = file->private_data;
+	int n = 0, ret;
+
+	mutex_lock(&perf->read_lock);
+
+	if (perf->bufpos >= perf->buftot) {
+		ret = refill_buf(perf);
+		if (ret)
+			goto out;
+	}
+
+	n = min((int)sz, perf->buftot - perf->bufpos);
+	ret = copy_to_user(buf, &perf->buf[perf->bufpos], n);
+	if (ret)
+		goto out;
+
+	perf->bufpos += n;
+	*ppos += n;
+
+out:
+	mutex_unlock(&perf->read_lock);
+	if (ret)
+		return ret;
+	return n;
+}
+
+static int perf_open(struct inode *inode, struct file *file)
+{
+	struct msm_perf_state *perf = inode->i_private;
+	struct drm_device *dev = perf->dev;
+	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_gpu *gpu = priv->gpu;
+	int ret = 0;
+
+	mutex_lock(&dev->struct_mutex);
+
+priv->perf = perf; // XXX this isn't really good..
+
+	if (perf->open || !gpu) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	file->private_data = perf;
+	perf->open = true;
+	perf->cnt = 0;
+	perf->buftot = 0;
+	perf->bufpos = 0;
+	msm_gpu_perfcntr_start(gpu);
+	perf->next_jiffies = jiffies + SAMPLE_TIME;
+
+out:
+	mutex_unlock(&dev->struct_mutex);
+	return ret;
+}
+
+static int perf_release(struct inode *inode, struct file *file)
+{
+	struct msm_perf_state *perf = inode->i_private;
+	struct msm_drm_private *priv = perf->dev->dev_private;
+	msm_gpu_perfcntr_stop(priv->gpu);
+	perf->open = false;
+	return 0;
+}
+
+
+static const struct file_operations perf_debugfs_fops = {
+	.owner = THIS_MODULE,
+	.open = perf_open,
+	.read = perf_read,
+	.llseek = no_llseek,
+	.release = perf_release,
+};
+
+int msm_perf_debugfs_init(struct drm_minor *minor)
+{
+	struct msm_drm_private *priv = minor->dev->dev_private;
+	struct msm_perf_state *perf;
+
+	perf = kzalloc(sizeof(*perf), GFP_KERNEL);
+	if (!perf)
+		return -ENOMEM;
+
+	perf->dev = minor->dev;
+
+	mutex_init(&perf->read_lock);
+DBG("********** perf->dev=%p, priv=%p", perf->dev, priv);
+//	priv->perf = perf;
+
+	perf->node = kzalloc(sizeof(*perf->node), GFP_KERNEL);
+	if (!perf->node)
+		goto fail;
+
+	perf->ent = debugfs_create_file("perf", S_IFREG | S_IRUGO,
+			minor->debugfs_root, perf, &perf_debugfs_fops);
+	if (!perf->ent) {
+		DRM_ERROR("Cannot create /sys/kernel/debug/dri/%s/perf\n",
+				minor->debugfs_root->d_name.name);
+		goto fail;
+	}
+
+	perf->node->minor = minor;
+	perf->node->dent  = perf->ent;
+	perf->node->info_ent = NULL;
+
+	mutex_lock(&minor->debugfs_lock);
+	list_add(&perf->node->list, &minor->debugfs_list);
+	mutex_unlock(&minor->debugfs_lock);
+
+	return 0;
+
+fail:
+	msm_perf_debugfs_cleanup(minor);
+	return -1;
+}
+
+void msm_perf_debugfs_cleanup(struct drm_minor *minor)
+{
+	struct msm_drm_private *priv = minor->dev->dev_private;
+	struct msm_perf_state *perf = priv->perf;
+
+	if (perf->ent)
+		debugfs_remove(perf->ent);
+
+	if (perf->node) {
+		mutex_lock(&minor->debugfs_lock);
+		list_del(&perf->node->list);
+		mutex_unlock(&minor->debugfs_lock);
+		kfree(perf->node);
+	}
+
+	mutex_destroy(&perf->read_lock);
+
+	kfree(perf);
+}
+
+#endif