
[RFC,07/11] drm/i915: Expose PMU for Observation Architecture

Message ID 1431008154-6833-8-git-send-email-robert@sixbynine.org (mailing list archive)
State New, archived

Commit Message

Robert Bragg May 7, 2015, 2:15 p.m. UTC
Gen graphics hardware can be set up to periodically write snapshots of
performance counters into a circular buffer, and this patch exposes that
capability to userspace via the perf interface.

To start with, this only enables the A (aggregating) counters, which have
the simplest configuration requirements.

Only Haswell is supported currently.
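
For illustration, userspace is expected to open an event along these lines
(a sketch only, with error handling elided; the dynamic PMU type is read
from sysfs as for any other uncore PMU, and open_i915_oa_event() is just an
illustrative helper name):

    #include <linux/perf_event.h>
    #include <drm/i915_drm.h>   /* drm_i915_oa_attr_t, as added below */
    #include <stdint.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int open_i915_oa_event(int i915_oa_pmu_type)
    {
            drm_i915_oa_attr_t oa;
            struct perf_event_attr attr;

            memset(&oa, 0, sizeof(oa));
            oa.size = sizeof(oa);            /* I915_OA_ATTR_SIZE_VER0 */
            oa.format = I915_OA_FORMAT_A45_B8_C8_HSW;
            oa.timer_exponent = 16;          /* period = 80ns * 2^(16 + 1) */

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            /* from /sys/bus/event_source/devices/i915_oa/type */
            attr.type = i915_oa_pmu_type;
            /* i915_oa_event_init() copies the oa attr struct in */
            attr.config = (uint64_t)(uintptr_t)&oa;
            attr.sample_period = 1;          /* periodic; must be 0 or 1 */
            attr.sample_type = PERF_SAMPLE_RAW; /* reports arrive as raw samples */

            /* pid = -1, cpu = 0: a cpu-agnostic device event; without
             * single_context + drm_fd + ctx_id this needs CAP_SYS_ADMIN. */
            return syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
    }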

Signed-off-by: Robert Bragg <robert@sixbynine.org>
---
 drivers/gpu/drm/i915/Makefile           |   1 +
 drivers/gpu/drm/i915/i915_dma.c         |   6 +
 drivers/gpu/drm/i915/i915_drv.h         |  53 +++
 drivers/gpu/drm/i915/i915_gem_context.c |  45 +-
 drivers/gpu/drm/i915/i915_oa_perf.c     | 750 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_reg.h         |  68 +++
 include/uapi/drm/i915_drm.h             |  29 ++
 7 files changed, 942 insertions(+), 10 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_oa_perf.c

Comments

Chris Wilson May 7, 2015, 2:36 p.m. UTC | #1
On Thu, May 07, 2015 at 03:15:50PM +0100, Robert Bragg wrote:
> +static int init_oa_buffer(struct perf_event *event)
> +{
> +	struct drm_i915_private *dev_priv =
> +		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
> +	struct drm_i915_gem_object *bo;
> +	int ret;
> +
> +	BUG_ON(!IS_HASWELL(dev_priv->dev));
> +	BUG_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
> +	BUG_ON(dev_priv->oa_pmu.oa_buffer.obj);
> +
> +	spin_lock_init(&dev_priv->oa_pmu.oa_buffer.flush_lock);
> +
> +	/* NB: We over-allocate the OA buffer due to the way raw sample data
> +	 * gets copied from the gpu mapped circular buffer into the perf
> +	 * circular buffer so that only one copy is required.
> +	 *
> +	 * For each perf sample, (raw->size + 4) needs to be 8-byte aligned,
> +	 * where the 4 corresponds to the 32bit raw->size member that's
> +	 * added to the sample header that userspace sees.
> +	 *
> +	 * Due to the + 4 for the size member: when we copy a report to the
> +	 * userspace facing perf buffer we always copy an additional 4 bytes
> +	 * from the subsequent report to make up for the misalignment, but
> +	 * when a report is at the end of the gpu mapped buffer we need to
> +	 * read 4 bytes past the end of the buffer.
> +	 */
> +	bo = i915_gem_alloc_object(dev_priv->dev, OA_BUFFER_SIZE + PAGE_SIZE);
> +	if (bo == NULL) {
> +		DRM_ERROR("Failed to allocate OA buffer\n");
> +		ret = -ENOMEM;
> +		goto err;
> +	}
> +	dev_priv->oa_pmu.oa_buffer.obj = bo;
> +
> +	ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
> +	if (ret)
> +		goto err_unref;
> +
> +	/* Pre-HSW required 512K alignment, HSW requires 16M */
> +	ret = i915_gem_obj_ggtt_pin(bo, SZ_16M, 0);
> +	if (ret)
> +		goto err_unref;
> +
> +	dev_priv->oa_pmu.oa_buffer.gtt_offset = i915_gem_obj_ggtt_offset(bo);
> +	dev_priv->oa_pmu.oa_buffer.addr = vmap_oa_buffer(bo);

You can look forward to both i915_gem_object_create_internal() and
i915_gem_object_pin_vmap()

> +
> +	/* Pre-DevBDW: OABUFFER must be set with counters off,
> +	 * before OASTATUS1, but after OASTATUS2 */
> +	I915_WRITE(GEN7_OASTATUS2, dev_priv->oa_pmu.oa_buffer.gtt_offset |
> +		   GEN7_OASTATUS2_GGTT); /* head */
> +	I915_WRITE(GEN7_OABUFFER, dev_priv->oa_pmu.oa_buffer.gtt_offset);
> +	I915_WRITE(GEN7_OASTATUS1, dev_priv->oa_pmu.oa_buffer.gtt_offset |
> +		   GEN7_OASTATUS1_OABUFFER_SIZE_16M); /* tail */
> +
> +	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p",
> +			 dev_priv->oa_pmu.oa_buffer.gtt_offset,
> +			 dev_priv->oa_pmu.oa_buffer.addr);
> +
> +	return 0;
> +
> +err_unref:
> +	drm_gem_object_unreference_unlocked(&bo->base);

But what I really want to say was:
mutex deadlock^^^
-Chris
Chris Wilson May 7, 2015, 2:58 p.m. UTC | #2
On Thu, May 07, 2015 at 03:15:50PM +0100, Robert Bragg wrote:
> +	/* We bypass the default perf core perf_paranoid_cpu() ||
> +	 * CAP_SYS_ADMIN check by using the PERF_PMU_CAP_IS_DEVICE
> +	 * flag and instead authenticate based on whether the current
> +	 * pid owns the specified context, or require CAP_SYS_ADMIN
> +	 * when collecting cross-context metrics.
> +	 */
> +	dev_priv->oa_pmu.specific_ctx = NULL;
> +	if (oa_attr.single_context) {
> +		u32 ctx_id = oa_attr.ctx_id;
> +		unsigned int drm_fd = oa_attr.drm_fd;
> +		struct fd fd = fdget(drm_fd);
> +
> +		if (fd.file) {

Specifying a ctx without providing the right fd should be its own error,
either EBADF or EINVAL.

> +			dev_priv->oa_pmu.specific_ctx =
> +				lookup_context(dev_priv, fd.file, ctx_id);
> +		}

Missing fdput

> +	}
> +
> +	if (!dev_priv->oa_pmu.specific_ctx && !capable(CAP_SYS_ADMIN))
> +		return -EACCES;
> +
> +	mutex_lock(&dev_priv->dev->struct_mutex);

i915_mutex_interruptible, probably best to couple into the GPU error
handling here as well, especially as init_oa_buffer() will go on to touch
GPU internals.

> +	ret = init_oa_buffer(event);
> +	mutex_unlock(&dev_priv->dev->struct_mutex);
> +
> +	if (ret)
> +		return ret;
> +
> +	BUG_ON(dev_priv->oa_pmu.exclusive_event);
> +	dev_priv->oa_pmu.exclusive_event = event;
> +
> +	event->destroy = i915_oa_event_destroy;
> +
> +	/* PRM - observability performance counters:
> +	 *
> +	 *   OACONTROL, performance counter enable, note:
> +	 *
> +	 *   "When this bit is set, in order to have coherent counts,
> +	 *   RC6 power state and trunk clock gating must be disabled.
> +	 *   This can be achieved by programming MMIO registers as
> +	 *   0xA094=0 and 0xA090[31]=1"
> +	 *
> +	 *   In our case we expect that taking pm + FORCEWAKE
> +	 *   references will effectively disable RC6 and trunk clock
> +	 *   gating.
> +	 */
> +	intel_runtime_pm_get(dev_priv);
> +	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);

That is a nuisance. Aside: Why isn't OA inside the powerctx? Is a subset
valid with forcewake? It does perturb the system greatly to disable rc6,
so I wonder if it could be made optional?

> +
> +	return 0;
> +}
> +
> +static void update_oacontrol(struct drm_i915_private *dev_priv)
> +{
> +	BUG_ON(!spin_is_locked(&dev_priv->oa_pmu.lock));
> +
> +	if (dev_priv->oa_pmu.event_active) {
> +		unsigned long ctx_id = 0;
> +		bool pinning_ok = false;
> +
> +		if (dev_priv->oa_pmu.specific_ctx) {
> +			struct intel_context *ctx =
> +				dev_priv->oa_pmu.specific_ctx;
> +			struct drm_i915_gem_object *obj =
> +				ctx->legacy_hw_ctx.rcs_state;

If only there was ctx->legacy_hw_ctx.rcs_vma...

> +
> +			if (i915_gem_obj_is_pinned(obj)) {
> +				ctx_id = i915_gem_obj_ggtt_offset(obj);
> +				pinning_ok = true;
> +			}
> +		}
> +
> +		if ((ctx_id == 0 || pinning_ok)) {
> +			bool periodic = dev_priv->oa_pmu.periodic;
> +			u32 period_exponent = dev_priv->oa_pmu.period_exponent;
> +			u32 report_format = dev_priv->oa_pmu.oa_buffer.format;
> +
> +			I915_WRITE(GEN7_OACONTROL,
> +				   (ctx_id & GEN7_OACONTROL_CTX_MASK) |
> +				   (period_exponent <<
> +				    GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
> +				   (periodic ?
> +				    GEN7_OACONTROL_TIMER_ENABLE : 0) |
> +				   (report_format <<
> +				    GEN7_OACONTROL_FORMAT_SHIFT) |
> +				   (ctx_id ?
> +				    GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
> +				   GEN7_OACONTROL_ENABLE);

I notice you don't use any write barriers...
-Chris
Robert Bragg May 18, 2015, 4:21 p.m. UTC | #3
On 7 May 2015 15:37, "Chris Wilson" <chris@chris-wilson.co.uk> wrote:
>
> On Thu, May 07, 2015 at 03:15:50PM +0100, Robert Bragg wrote:
> > +static int init_oa_buffer(struct perf_event *event)
> > +{
> > +     struct drm_i915_private *dev_priv =
> > +             container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
> > +     struct drm_i915_gem_object *bo;
> > +     int ret;
> > +
> > +     BUG_ON(!IS_HASWELL(dev_priv->dev));
> > +     BUG_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
> > +     BUG_ON(dev_priv->oa_pmu.oa_buffer.obj);
> > +
> > +     spin_lock_init(&dev_priv->oa_pmu.oa_buffer.flush_lock);
> > +
> > +     /* NB: We over-allocate the OA buffer due to the way raw sample data
> > +      * gets copied from the gpu mapped circular buffer into the perf
> > +      * circular buffer so that only one copy is required.
> > +      *
> > +      * For each perf sample, (raw->size + 4) needs to be 8-byte aligned,
> > +      * where the 4 corresponds to the 32bit raw->size member that's
> > +      * added to the sample header that userspace sees.
> > +      *
> > +      * Due to the + 4 for the size member: when we copy a report to the
> > +      * userspace facing perf buffer we always copy an additional 4 bytes
> > +      * from the subsequent report to make up for the misalignment, but
> > +      * when a report is at the end of the gpu mapped buffer we need to
> > +      * read 4 bytes past the end of the buffer.
> > +      */
> > +     bo = i915_gem_alloc_object(dev_priv->dev, OA_BUFFER_SIZE + PAGE_SIZE);
> > +     if (bo == NULL) {
> > +             DRM_ERROR("Failed to allocate OA buffer\n");
> > +             ret = -ENOMEM;
> > +             goto err;
> > +     }
> > +     dev_priv->oa_pmu.oa_buffer.obj = bo;
> > +
> > +     ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
> > +     if (ret)
> > +             goto err_unref;
> > +
> > +     /* Pre-HSW required 512K alignment, HSW requires 16M */
> > +     ret = i915_gem_obj_ggtt_pin(bo, SZ_16M, 0);
> > +     if (ret)
> > +             goto err_unref;
> > +
> > +     dev_priv->oa_pmu.oa_buffer.gtt_offset = i915_gem_obj_ggtt_offset(bo);
> > +     dev_priv->oa_pmu.oa_buffer.addr = vmap_oa_buffer(bo);
>
> You can look forward to both i915_gem_object_create_internal() and
> i915_gem_object_pin_vmap()

Okay, will do, thanks.

>
> > +
> > +     /* Pre-DevBDW: OABUFFER must be set with counters off,
> > +      * before OASTATUS1, but after OASTATUS2 */
> > +     I915_WRITE(GEN7_OASTATUS2, dev_priv->oa_pmu.oa_buffer.gtt_offset |
> > +                GEN7_OASTATUS2_GGTT); /* head */
> > +     I915_WRITE(GEN7_OABUFFER, dev_priv->oa_pmu.oa_buffer.gtt_offset);
> > +     I915_WRITE(GEN7_OASTATUS1, dev_priv->oa_pmu.oa_buffer.gtt_offset |
> > +                GEN7_OASTATUS1_OABUFFER_SIZE_16M); /* tail */
> > +
> > +     DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p",
> > +                      dev_priv->oa_pmu.oa_buffer.gtt_offset,
> > +                      dev_priv->oa_pmu.oa_buffer.addr);
> > +
> > +     return 0;
> > +
> > +err_unref:
> > +     drm_gem_object_unreference_unlocked(&bo->base);
>
> But what I really want to say was:
> mutex deadlock^^^

Yikes, I've pushed an updated patch addressing this and can reply with a
new patch here in a bit.

Thanks,
- Robert


> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
Robert Bragg May 18, 2015, 4:36 p.m. UTC | #4
On 7 May 2015 15:58, "Chris Wilson" <chris@chris-wilson.co.uk> wrote:
>
> On Thu, May 07, 2015 at 03:15:50PM +0100, Robert Bragg wrote:
> > +     /* We bypass the default perf core perf_paranoid_cpu() ||
> > +      * CAP_SYS_ADMIN check by using the PERF_PMU_CAP_IS_DEVICE
> > +      * flag and instead authenticate based on whether the current
> > +      * pid owns the specified context, or require CAP_SYS_ADMIN
> > +      * when collecting cross-context metrics.
> > +      */
> > +     dev_priv->oa_pmu.specific_ctx = NULL;
> > +     if (oa_attr.single_context) {
> > +             u32 ctx_id = oa_attr.ctx_id;
> > +             unsigned int drm_fd = oa_attr.drm_fd;
> > +             struct fd fd = fdget(drm_fd);
> > +
> > +             if (fd.file) {
>
> Specifying a ctx without providing the right fd should be its own error,
> either EBADF or EINVAL.

Right, I went for both in the end; EBADF if fdget fails and EINVAL if
the fd is ok but we fail to look up a context with it.

>
> > +                     dev_priv->oa_pmu.specific_ctx =
> > +                             lookup_context(dev_priv, fd.file, ctx_id);
> > +             }
>
> Missing fdput

Ah yes; fixed.
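
i.e. the lookup now reads something like this (just a sketch here; the
exact shape may differ slightly in the updated patch):

    dev_priv->oa_pmu.specific_ctx = NULL;
    if (oa_attr.single_context) {
            struct fd fd = fdget(oa_attr.drm_fd);

            /* EBADF for a bad fd... */
            if (!fd.file)
                    return -EBADF;

            dev_priv->oa_pmu.specific_ctx =
                    lookup_context(dev_priv, fd.file, oa_attr.ctx_id);
            fdput(fd);

            /* ...and EINVAL when the fd doesn't own the context */
            if (!dev_priv->oa_pmu.specific_ctx)
                    return -EINVAL;
    }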

>
> > +     }
> > +
> > +     if (!dev_priv->oa_pmu.specific_ctx && !capable(CAP_SYS_ADMIN))
> > +             return -EACCES;
> > +
> > +     mutex_lock(&dev_priv->dev->struct_mutex);
>
> i915_mutex_interruptible, probably best to couple into the GPU error
> handling here as well, especially as init_oa_buffer() will go on to touch
> GPU internals.

Ok, using i915_mutex_interruptible makes sense; I've also moved the
locking into init_oa_buffer.

About the GPU error handling, do you have any thoughts on what could
be most helpful here? I'm thinking a.t.m of extending
i915_capture_reg_state() in i915_gpu_error.c to capture the OACONTROL
+ OASTATUS state and perhaps all the UCGCTL unit clock gating state
too.
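
Something along these lines, say (purely a sketch - the oacontrol/
oastatus/ucgctl members here are hypothetical additions to the error
state struct):

    /* in i915_capture_reg_state() */
    error->oacontrol = I915_READ(GEN7_OACONTROL);
    error->oastatus1 = I915_READ(GEN7_OASTATUS1);
    error->oastatus2 = I915_READ(GEN7_OASTATUS2);
    error->ucgctl1 = I915_READ(GEN6_UCGCTL1);
    error->ucgctl3 = I915_READ(GEN6_UCGCTL3);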

>
> > +     ret = init_oa_buffer(event);
> > +     mutex_unlock(&dev_priv->dev->struct_mutex);
> > +
> > +     if (ret)
> > +             return ret;
> > +
> > +     BUG_ON(dev_priv->oa_pmu.exclusive_event);
> > +     dev_priv->oa_pmu.exclusive_event = event;
> > +
> > +     event->destroy = i915_oa_event_destroy;
> > +
> > +     /* PRM - observability performance counters:
> > +      *
> > +      *   OACONTROL, performance counter enable, note:
> > +      *
> > +      *   "When this bit is set, in order to have coherent counts,
> > +      *   RC6 power state and trunk clock gating must be disabled.
> > +      *   This can be achieved by programming MMIO registers as
> > +      *   0xA094=0 and 0xA090[31]=1"
> > +      *
> > +      *   In our case we expect that taking pm + FORCEWAKE
> > +      *   references will effectively disable RC6 and trunk clock
> > +      *   gating.
> > +      */
> > +     intel_runtime_pm_get(dev_priv);
> > +     intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
>
> That is a nuisance. Aside: Why isn't OA inside the powerctx? Is a subset
> valid with forcewake? It does perturb the system greatly to disable rc6,
> so I wonder if it could be made optional?

Yes, it's a shame.

I probably only really know enough about the OA unit design to be
dangerous and won't try and comment in detail here, but I think
there's more to it than not saving state in a power context. As I
understand it, there were a number of design changes made to enable
OA+RC6 support for BDW+, including having the OA unit automatically
write out reports to the OA buffer when entering RC6.

I think just FORCEWAKE_RENDER would work here, but I only say that
because, from what I could tell, HSW only has the render forcewake
domain.

I think I need to update the comment above these lines as I don't
think these will affect crclk gating; these just handle disabling RC6.

The WIP patch I sent out basically represents me trying to get to the
bottom of the clock gating constraints we have.

At this point I think I need to disable CS unit gating via UCGCTL1, as
well as DOP gating for the render trunk clock via MISCCPCTL, but I'm
not entirely confident about that just yet. At least empirically I see
these fixing some issues in rudimentary testing.
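
Concretely, the kind of thing I'm experimenting with looks like this
(only a sketch until I'm confident about the constraints):

    /* Experimental: disable CS unit clock gating and DOP gating for
     * the render trunk clock while the OA unit is in use. */
    I915_WRITE(GEN6_UCGCTL1, I915_READ(GEN6_UCGCTL1) |
               GEN6_CSUNIT_CLOCK_GATE_DISABLE);
    I915_WRITE(GEN7_MISCCPCTL, I915_READ(GEN7_MISCCPCTL) &
               ~GEN7_DOP_CLOCK_GATE_ENABLE);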

>
> > +
> > +     return 0;
> > +}
> > +
> > +static void update_oacontrol(struct drm_i915_private *dev_priv)
> > +{
> > +     BUG_ON(!spin_is_locked(&dev_priv->oa_pmu.lock));
> > +
> > +     if (dev_priv->oa_pmu.event_active) {
> > +             unsigned long ctx_id = 0;
> > +             bool pinning_ok = false;
> > +
> > +             if (dev_priv->oa_pmu.specific_ctx) {
> > +                     struct intel_context *ctx =
> > +                             dev_priv->oa_pmu.specific_ctx;
> > +                     struct drm_i915_gem_object *obj =
> > +                             ctx->legacy_hw_ctx.rcs_state;
>
> If only there was ctx->legacy_hw_ctx.rcs_vma...

ok, not sure if this is a prod to add that, a heads up that it's
coming, or seething because some prior attempt to add it was nack'd.


>
> > +
> > +                     if (i915_gem_obj_is_pinned(obj)) {
> > +                             ctx_id = i915_gem_obj_ggtt_offset(obj);
> > +                             pinning_ok = true;
> > +                     }
> > +             }
> > +
> > +             if ((ctx_id == 0 || pinning_ok)) {
> > +                     bool periodic = dev_priv->oa_pmu.periodic;
> > +                     u32 period_exponent = dev_priv->oa_pmu.period_exponent;
> > +                     u32 report_format = dev_priv->oa_pmu.oa_buffer.format;
> > +
> > +                     I915_WRITE(GEN7_OACONTROL,
> > +                                (ctx_id & GEN7_OACONTROL_CTX_MASK) |
> > +                                (period_exponent <<
> > +                                 GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
> > +                                (periodic ?
> > +                                 GEN7_OACONTROL_TIMER_ENABLE : 0) |
> > +                                (report_format <<
> > +                                 GEN7_OACONTROL_FORMAT_SHIFT) |
> > +                                (ctx_id ?
> > +                                 GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
> > +                                GEN7_OACONTROL_ENABLE);
>
> I notice you don't use any write barriers...

ok, so I still haven't put write barriers within update_oacontrol()
itself, but I've now added mmiowb()s just before unlocking, which is
done outside of update_oacontrol(). I think a barrier just within
update_oacontrol() could be ok a.t.m while the pinning hooks currently
just use update_oacontrol(), but in case we might introduce more
overlapping mmio configuration within these critical sections, waiting
until the unlock might be preferable. On the other hand, a.t.m the
pinning callbacks now have redundant wb()s while there is no specific
context filtering - not sure if that should be a concern.
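
To illustrate, the event start hook now ends up with this shape (a
sketch of the pattern, not the exact final code):

    spin_lock_irqsave(&dev_priv->oa_pmu.lock, lock_flags);

    dev_priv->oa_pmu.event_active = true;
    update_oacontrol(dev_priv);

    /* Ensure the OACONTROL write (and any other mmio done under the
     * lock) has been posted before another cpu can take the lock. */
    mmiowb();

    spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags);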

Looking at this, I also didn't feel happy with the way I reset
oa_pmu->specific_context when destroying an event, considering that
->specific_context being set is what determines whether the pinning
callbacks may call update_oacontrol() asynchronously with respect to
the pmu methods. Although we know oacontrol will be disabled by the
time we come to destroy an event, it didn't seem great that we
could be continuing to run update_oacontrol() up to the point where we
are resetting all the clock gating, power management and NOA enable
state. I'll attach a separate patch to see if you agree this is worth
changing.

Thanks for the comments.

- Robert

> -Chris
> --
> Chris Wilson, Intel Open Source Technology Centre

Patch

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index a69002e..8af9f4a 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -16,6 +16,7 @@  i915-y := i915_drv.o \
 
 i915-$(CONFIG_COMPAT)   += i915_ioc32.o
 i915-$(CONFIG_DEBUG_FS) += i915_debugfs.o
+i915-$(CONFIG_PERF_EVENTS) += i915_oa_perf.o
 
 # GEM code
 i915-y += i915_cmd_parser.o \
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index e44116f..1de6639 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -817,6 +817,11 @@  int i915_driver_load(struct drm_device *dev, unsigned long flags)
 	mutex_init(&dev_priv->dpio_lock);
 	mutex_init(&dev_priv->modeset_restore_lock);
 
+	/* Must at least be registered before trying to pin any context,
+	 * otherwise i915_oa_context_pin_notify() will lock an uninitialized
+	 * spinlock, upsetting lockdep checks */
+	i915_oa_pmu_register(dev);
+
 	intel_pm_setup(dev);
 
 	intel_display_crc_init(dev);
@@ -1062,6 +1067,7 @@  int i915_driver_unload(struct drm_device *dev)
 		return ret;
 	}
 
+	i915_oa_pmu_unregister(dev);
 	intel_power_domains_fini(dev_priv);
 
 	intel_gpu_ips_teardown();
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f5020a8..3d8e4ed 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -49,6 +49,7 @@ 
 #include <linux/hashtable.h>
 #include <linux/intel-iommu.h>
 #include <linux/kref.h>
+#include <linux/perf_event.h>
 #include <linux/pm_qos.h>
 
 /* General customization:
@@ -1816,6 +1817,35 @@  struct drm_i915_private {
 	 */
 	struct workqueue_struct *dp_wq;
 
+#ifdef CONFIG_PERF_EVENTS
+	struct {
+		struct pmu pmu;
+		spinlock_t lock;
+		struct hrtimer timer;
+		struct pt_regs dummy_regs;
+
+		struct perf_event *exclusive_event;
+		struct intel_context *specific_ctx;
+		bool event_active;
+
+		bool periodic;
+		u32 period_exponent;
+
+		u32 metrics_set;
+
+		struct {
+			struct drm_i915_gem_object *obj;
+			u32 gtt_offset;
+			u8 *addr;
+			u32 head;
+			u32 tail;
+			int format;
+			int format_size;
+			spinlock_t flush_lock;
+		} oa_buffer;
+	} oa_pmu;
+#endif
+
 	/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
 	struct {
 		int (*execbuf_submit)(struct drm_device *dev, struct drm_file *file,
@@ -2975,6 +3005,20 @@  int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 				    struct drm_file *file_priv);
 
+#ifdef CONFIG_PERF_EVENTS
+void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
+				struct intel_context *context);
+void i915_oa_context_unpin_notify(struct drm_i915_private *dev_priv,
+				  struct intel_context *context);
+#else
+static inline void
+i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
+			   struct intel_context *context) {}
+static inline void
+i915_oa_context_unpin_notify(struct drm_i915_private *dev_priv,
+			     struct intel_context *context) {}
+#endif
+
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct drm_device *dev,
 					  struct i915_address_space *vm,
@@ -3084,6 +3128,15 @@  int i915_parse_cmds(struct intel_engine_cs *ring,
 		    u32 batch_len,
 		    bool is_master);
 
+/* i915_oa_perf.c */
+#ifdef CONFIG_PERF_EVENTS
+extern void i915_oa_pmu_register(struct drm_device *dev);
+extern void i915_oa_pmu_unregister(struct drm_device *dev);
+#else
+static inline void i915_oa_pmu_register(struct drm_device *dev) {}
+static inline void i915_oa_pmu_unregister(struct drm_device *dev) {}
+#endif
+
 /* i915_suspend.c */
 extern int i915_save_state(struct drm_device *dev);
 extern int i915_restore_state(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index dd5bff6..3624435 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -133,6 +133,33 @@  static int get_context_size(struct drm_device *dev)
 	return ret;
 }
 
+static int i915_gem_context_pin_state(struct drm_device *dev,
+				      struct intel_context *ctx)
+{
+	int ret;
+
+	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
+
+	ret = i915_gem_obj_ggtt_pin(ctx->legacy_hw_ctx.rcs_state,
+				    get_context_alignment(dev), 0);
+	if (ret)
+		return ret;
+
+	i915_oa_context_pin_notify(dev->dev_private, ctx);
+
+	return 0;
+}
+
+static void i915_gem_context_unpin_state(struct drm_device *dev,
+					 struct intel_context *ctx)
+{
+	/* Ensure that we stop the OA unit referencing the context *before*
+	 * actually unpinning the ctx */
+	i915_oa_context_unpin_notify(dev->dev_private, ctx);
+
+	i915_gem_object_ggtt_unpin(ctx->legacy_hw_ctx.rcs_state);
+}
+
 void i915_gem_context_free(struct kref *ctx_ref)
 {
 	struct intel_context *ctx = container_of(ctx_ref,
@@ -260,8 +287,7 @@  i915_gem_create_context(struct drm_device *dev,
 		 * be available. To avoid this we always pin the default
 		 * context.
 		 */
-		ret = i915_gem_obj_ggtt_pin(ctx->legacy_hw_ctx.rcs_state,
-					    get_context_alignment(dev), 0);
+		ret = i915_gem_context_pin_state(dev, ctx);
 		if (ret) {
 			DRM_DEBUG_DRIVER("Couldn't pin %d\n", ret);
 			goto err_destroy;
@@ -287,7 +313,7 @@  i915_gem_create_context(struct drm_device *dev,
 
 err_unpin:
 	if (is_global_default_ctx && ctx->legacy_hw_ctx.rcs_state)
-		i915_gem_object_ggtt_unpin(ctx->legacy_hw_ctx.rcs_state);
+		i915_gem_context_unpin_state(dev, ctx);
 err_destroy:
 	i915_gem_context_unreference(ctx);
 	return ERR_PTR(ret);
@@ -314,7 +340,7 @@  void i915_gem_context_reset(struct drm_device *dev)
 
 		if (lctx) {
 			if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
-				i915_gem_object_ggtt_unpin(lctx->legacy_hw_ctx.rcs_state);
+				i915_gem_context_unpin_state(dev, lctx);
 
 			i915_gem_context_unreference(lctx);
 			ring->last_context = NULL;
@@ -388,12 +414,12 @@  void i915_gem_context_fini(struct drm_device *dev)
 		if (dev_priv->ring[RCS].last_context == dctx) {
 			/* Fake switch to NULL context */
 			WARN_ON(dctx->legacy_hw_ctx.rcs_state->active);
-			i915_gem_object_ggtt_unpin(dctx->legacy_hw_ctx.rcs_state);
+			i915_gem_context_unpin_state(dev, dctx);
 			i915_gem_context_unreference(dctx);
 			dev_priv->ring[RCS].last_context = NULL;
 		}
 
-		i915_gem_object_ggtt_unpin(dctx->legacy_hw_ctx.rcs_state);
+		i915_gem_context_unpin_state(dev, dctx);
 	}
 
 	for (i = 0; i < I915_NUM_RINGS; i++) {
@@ -650,8 +676,7 @@  static int do_switch(struct intel_engine_cs *ring,
 
 	/* Trying to pin first makes error handling easier. */
 	if (ring == &dev_priv->ring[RCS]) {
-		ret = i915_gem_obj_ggtt_pin(to->legacy_hw_ctx.rcs_state,
-					    get_context_alignment(ring->dev), 0);
+		ret = i915_gem_context_pin_state(ring->dev, to);
 		if (ret)
 			return ret;
 	}
@@ -763,7 +788,7 @@  static int do_switch(struct intel_engine_cs *ring,
 			from->legacy_hw_ctx.rcs_state->last_read_req) != ring);
 
 		/* obj is kept alive until the next request by its active ref */
-		i915_gem_object_ggtt_unpin(from->legacy_hw_ctx.rcs_state);
+		i915_gem_context_unpin_state(ring->dev, from);
 		i915_gem_context_unreference(from);
 	}
 
@@ -786,7 +811,7 @@  done:
 
 unpin_out:
 	if (ring->id == RCS)
-		i915_gem_object_ggtt_unpin(to->legacy_hw_ctx.rcs_state);
+		i915_gem_context_unpin_state(ring->dev, to);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
new file mode 100644
index 0000000..a1cf55f
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_oa_perf.c
@@ -0,0 +1,750 @@ 
+#include <linux/perf_event.h>
+#include <linux/sizes.h>
+
+#include "i915_drv.h"
+#include "intel_ringbuffer.h"
+
+/* Must be a power of two */
+#define OA_BUFFER_SIZE	     SZ_16M
+#define OA_TAKEN(tail, head) ((tail - head) & (OA_BUFFER_SIZE - 1))
+
+#define FREQUENCY 200
+#define PERIOD max_t(u64, 10000, NSEC_PER_SEC / FREQUENCY)
+
+static int hsw_perf_format_sizes[] = {
+	64,  /* A13_HSW */
+	128, /* A29_HSW */
+	128, /* A13_B8_C8_HSW */
+	-1,  /* Disallowed since 192 bytes doesn't factor into buffer size
+		(A29_B8_C8_HSW) */
+	64,  /* B4_C8_HSW */
+	256, /* A45_B8_C8_HSW */
+	128, /* B4_C8_A16_HSW */
+	64   /* C4_B8_HSW */
+};
+
+static void forward_one_oa_snapshot_to_event(struct drm_i915_private *dev_priv,
+					     u8 *snapshot,
+					     struct perf_event *event)
+{
+	struct perf_sample_data data;
+	int snapshot_size = dev_priv->oa_pmu.oa_buffer.format_size;
+	struct perf_raw_record raw;
+
+	WARN_ON(snapshot_size == 0);
+
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+
+	/* Note: the combined u32 raw->size member + raw data itself must be
+	 * 8-byte aligned. (See note in init_oa_buffer for more details) */
+	raw.size = snapshot_size + 4;
+	raw.data = snapshot;
+
+	data.raw = &raw;
+
+	perf_event_overflow(event, &data, &dev_priv->oa_pmu.dummy_regs);
+}
+
+static u32 forward_oa_snapshots(struct drm_i915_private *dev_priv,
+				u32 head,
+				u32 tail)
+{
+	struct perf_event *exclusive_event = dev_priv->oa_pmu.exclusive_event;
+	int snapshot_size = dev_priv->oa_pmu.oa_buffer.format_size;
+	u8 *oa_buf_base = dev_priv->oa_pmu.oa_buffer.addr;
+	u32 mask = (OA_BUFFER_SIZE - 1);
+	u8 *snapshot;
+	u32 taken;
+
+	head -= dev_priv->oa_pmu.oa_buffer.gtt_offset;
+	tail -= dev_priv->oa_pmu.oa_buffer.gtt_offset;
+
+	/* Note: the gpu doesn't wrap the tail according to the OA buffer size
+	 * so when we need to make sure our head/tail values are in-bounds we
+	 * use the above mask.
+	 */
+
+	while ((taken = OA_TAKEN(tail, head))) {
+		/* The tail increases in 64-byte increments, not in
+		 * format_size steps. */
+		if (taken < snapshot_size)
+			break;
+
+		snapshot = oa_buf_base + (head & mask);
+		head += snapshot_size;
+
+		/* We currently only allow exclusive access to the counters
+		 * so only have one event to forward to... */
+		if (dev_priv->oa_pmu.event_active)
+			forward_one_oa_snapshot_to_event(dev_priv, snapshot,
+							 exclusive_event);
+	}
+
+	return dev_priv->oa_pmu.oa_buffer.gtt_offset + head;
+}
+
+static void flush_oa_snapshots(struct drm_i915_private *dev_priv,
+			       bool skip_if_flushing)
+{
+	unsigned long flags;
+	u32 oastatus2;
+	u32 oastatus1;
+	u32 head;
+	u32 tail;
+
+	/* Can either flush via hrtimer callback or pmu methods/fops */
+	if (skip_if_flushing) {
+
+		/* If the hrtimer triggers at the same time that we are
+		 * responding to a userspace initiated flush then we can
+		 * just bail out...
+		 */
+		if (!spin_trylock_irqsave(&dev_priv->oa_pmu.oa_buffer.flush_lock,
+					  flags))
+			return;
+	} else
+		spin_lock_irqsave(&dev_priv->oa_pmu.oa_buffer.flush_lock, flags);
+
+	WARN_ON(!dev_priv->oa_pmu.oa_buffer.addr);
+
+	oastatus2 = I915_READ(GEN7_OASTATUS2);
+	oastatus1 = I915_READ(GEN7_OASTATUS1);
+
+	head = oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
+	tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
+
+	if (oastatus1 & (GEN7_OASTATUS1_OABUFFER_OVERFLOW |
+			 GEN7_OASTATUS1_REPORT_LOST)) {
+
+		/* XXX: How can we convey report-lost errors to userspace?  It
+		 * doesn't look like perf's _REPORT_LOST mechanism is
+		 * appropriate in this case; that's just for cases where we
+		 * run out of space for samples in the perf circular buffer.
+		 *
+		 * Maybe we can claim a special report-id and use that to
+		 * forward status flags?
+		 */
+		pr_debug("OA buffer read error: addr = %p, head = %u, offset = %u, tail = %u cnt o'flow = %d, buf o'flow = %d, rpt lost = %d\n",
+			 dev_priv->oa_pmu.oa_buffer.addr,
+			 head,
+			 head - dev_priv->oa_pmu.oa_buffer.gtt_offset,
+			 tail,
+			 oastatus1 & GEN7_OASTATUS1_COUNTER_OVERFLOW ? 1 : 0,
+			 oastatus1 & GEN7_OASTATUS1_OABUFFER_OVERFLOW ? 1 : 0,
+			 oastatus1 & GEN7_OASTATUS1_REPORT_LOST ? 1 : 0);
+
+		I915_WRITE(GEN7_OASTATUS1, oastatus1 &
+			   ~(GEN7_OASTATUS1_OABUFFER_OVERFLOW |
+			     GEN7_OASTATUS1_REPORT_LOST));
+	}
+
+	head = forward_oa_snapshots(dev_priv, head, tail);
+
+	I915_WRITE(GEN7_OASTATUS2, (head & GEN7_OASTATUS2_HEAD_MASK) |
+				    GEN7_OASTATUS2_GGTT);
+
+	spin_unlock_irqrestore(&dev_priv->oa_pmu.oa_buffer.flush_lock, flags);
+}
+
+static void
+oa_buffer_destroy(struct drm_i915_private *i915)
+{
+	mutex_lock(&i915->dev->struct_mutex);
+
+	vunmap(i915->oa_pmu.oa_buffer.addr);
+	i915_gem_object_ggtt_unpin(i915->oa_pmu.oa_buffer.obj);
+	drm_gem_object_unreference(&i915->oa_pmu.oa_buffer.obj->base);
+
+	i915->oa_pmu.oa_buffer.obj = NULL;
+	i915->oa_pmu.oa_buffer.gtt_offset = 0;
+	i915->oa_pmu.oa_buffer.addr = NULL;
+
+	mutex_unlock(&i915->dev->struct_mutex);
+}
+
+static void i915_oa_event_destroy(struct perf_event *event)
+{
+	struct drm_i915_private *i915 =
+		container_of(event->pmu, typeof(*i915), oa_pmu.pmu);
+
+	WARN_ON(event->parent);
+
+	oa_buffer_destroy(i915);
+
+	i915->oa_pmu.specific_ctx = NULL;
+
+	BUG_ON(i915->oa_pmu.exclusive_event != event);
+	i915->oa_pmu.exclusive_event = NULL;
+
+	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
+	intel_runtime_pm_put(i915);
+}
+
+static void *vmap_oa_buffer(struct drm_i915_gem_object *obj)
+{
+	int i;
+	void *addr = NULL;
+	struct sg_page_iter sg_iter;
+	struct page **pages;
+
+	pages = drm_malloc_ab(obj->base.size >> PAGE_SHIFT, sizeof(*pages));
+	if (pages == NULL) {
+		DRM_DEBUG_DRIVER("Failed to get space for pages\n");
+		goto finish;
+	}
+
+	i = 0;
+	for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) {
+		pages[i] = sg_page_iter_page(&sg_iter);
+		i++;
+	}
+
+	addr = vmap(pages, i, 0, PAGE_KERNEL);
+	if (addr == NULL) {
+		DRM_DEBUG_DRIVER("Failed to vmap pages\n");
+		goto finish;
+	}
+
+finish:
+	if (pages)
+		drm_free_large(pages);
+	return addr;
+}
+
+static int init_oa_buffer(struct perf_event *event)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
+	struct drm_i915_gem_object *bo;
+	int ret;
+
+	BUG_ON(!IS_HASWELL(dev_priv->dev));
+	BUG_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
+	BUG_ON(dev_priv->oa_pmu.oa_buffer.obj);
+
+	spin_lock_init(&dev_priv->oa_pmu.oa_buffer.flush_lock);
+
+	/* NB: We over-allocate the OA buffer due to the way raw sample data
+	 * gets copied from the gpu mapped circular buffer into the perf
+	 * circular buffer so that only one copy is required.
+	 *
+	 * For each perf sample, (raw->size + 4) needs to be 8-byte aligned,
+	 * where the 4 corresponds to the 32bit raw->size member that's
+	 * added to the sample header that userspace sees.
+	 *
+	 * Due to the + 4 for the size member: when we copy a report to the
+	 * userspace facing perf buffer we always copy an additional 4 bytes
+	 * from the subsequent report to make up for the misalignment, but
+	 * when a report is at the end of the gpu mapped buffer we need to
+	 * read 4 bytes past the end of the buffer.
+	 */
+	bo = i915_gem_alloc_object(dev_priv->dev, OA_BUFFER_SIZE + PAGE_SIZE);
+	if (bo == NULL) {
+		DRM_ERROR("Failed to allocate OA buffer\n");
+		ret = -ENOMEM;
+		goto err;
+	}
+	dev_priv->oa_pmu.oa_buffer.obj = bo;
+
+	ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
+	if (ret)
+		goto err_unref;
+
+	/* Pre-HSW required 512K alignment, HSW requires 16M */
+	ret = i915_gem_obj_ggtt_pin(bo, SZ_16M, 0);
+	if (ret)
+		goto err_unref;
+
+	dev_priv->oa_pmu.oa_buffer.gtt_offset = i915_gem_obj_ggtt_offset(bo);
+	dev_priv->oa_pmu.oa_buffer.addr = vmap_oa_buffer(bo);
+
+	/* Pre-DevBDW: OABUFFER must be set with counters off,
+	 * before OASTATUS1, but after OASTATUS2 */
+	I915_WRITE(GEN7_OASTATUS2, dev_priv->oa_pmu.oa_buffer.gtt_offset |
+		   GEN7_OASTATUS2_GGTT); /* head */
+	I915_WRITE(GEN7_OABUFFER, dev_priv->oa_pmu.oa_buffer.gtt_offset);
+	I915_WRITE(GEN7_OASTATUS1, dev_priv->oa_pmu.oa_buffer.gtt_offset |
+		   GEN7_OASTATUS1_OABUFFER_SIZE_16M); /* tail */
+
+	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p",
+			 dev_priv->oa_pmu.oa_buffer.gtt_offset,
+			 dev_priv->oa_pmu.oa_buffer.addr);
+
+	return 0;
+
+err_unref:
+	drm_gem_object_unreference_unlocked(&bo->base);
+err:
+	return ret;
+}
+
+static enum hrtimer_restart hrtimer_sample(struct hrtimer *hrtimer)
+{
+	struct drm_i915_private *i915 =
+		container_of(hrtimer, typeof(*i915), oa_pmu.timer);
+
+	flush_oa_snapshots(i915, true);
+
+	hrtimer_forward_now(hrtimer, ns_to_ktime(PERIOD));
+	return HRTIMER_RESTART;
+}
+
+static struct intel_context *
+lookup_context(struct drm_i915_private *dev_priv,
+	       struct file *user_filp,
+	       u32 ctx_user_handle)
+{
+	struct intel_context *ctx;
+
+	mutex_lock(&dev_priv->dev->struct_mutex);
+	list_for_each_entry(ctx, &dev_priv->context_list, link) {
+		struct drm_file *drm_file;
+
+		if (!ctx->file_priv)
+			continue;
+
+		drm_file = ctx->file_priv->file;
+
+		if (user_filp->private_data == drm_file &&
+		    ctx->user_handle == ctx_user_handle) {
+			mutex_unlock(&dev_priv->dev->struct_mutex);
+			return ctx;
+		}
+	}
+	mutex_unlock(&dev_priv->dev->struct_mutex);
+
+	return NULL;
+}
+
+static int i915_oa_copy_attr(drm_i915_oa_attr_t __user *uattr,
+			     drm_i915_oa_attr_t *attr)
+{
+	u32 size;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, I915_OA_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * zero the full structure, so that a short copy will be nice.
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (size < I915_OA_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we don't know about yet.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(*attr);
+		end  = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+		size = sizeof(*attr);
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+	if (attr->__reserved_1)
+		return -EINVAL;
+
+out:
+	return ret;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	ret = -E2BIG;
+	goto out;
+}
+
+static int i915_oa_event_init(struct perf_event *event)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
+	drm_i915_oa_attr_t oa_attr;
+	u64 report_format;
+	int ret = 0;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	ret = i915_oa_copy_attr(to_user_ptr(event->attr.config), &oa_attr);
+	if (ret)
+		return ret;
+
+	/* To avoid the complexity of having to accurately filter
+	 * counter snapshots and marshal to the appropriate client
+	 * we currently only allow exclusive access */
+	if (dev_priv->oa_pmu.oa_buffer.obj)
+		return -EBUSY;
+
+	report_format = oa_attr.format;
+	dev_priv->oa_pmu.oa_buffer.format = report_format;
+	dev_priv->oa_pmu.metrics_set = oa_attr.metrics_set;
+
+	if (IS_HASWELL(dev_priv->dev)) {
+		int snapshot_size;
+
+		if (report_format >= ARRAY_SIZE(hsw_perf_format_sizes))
+			return -EINVAL;
+
+		snapshot_size = hsw_perf_format_sizes[report_format];
+		if (snapshot_size < 0)
+			return -EINVAL;
+
+		dev_priv->oa_pmu.oa_buffer.format_size = snapshot_size;
+
+		if (oa_attr.metrics_set > I915_OA_METRICS_SET_3D)
+			return -EINVAL;
+	} else {
+		BUG(); /* pmu shouldn't have been registered */
+		return -ENODEV;
+	}
+
+	/* Since we are limited to an exponential scale for
+	 * programming the OA sampling period we don't allow userspace
+	 * to pass a precise attr.sample_period. */
+	if (event->attr.freq ||
+	    (event->attr.sample_period != 0 &&
+	     event->attr.sample_period != 1))
+		return -EINVAL;
+
+	dev_priv->oa_pmu.periodic = event->attr.sample_period;
+
+	/* Instead of allowing userspace to configure the period via
+	 * attr.sample_period we instead accept an exponent whereby
+	 * the sample_period will be:
+	 *
+	 *   80ns * 2^(period_exponent + 1)
+	 *
+	 * Programming a period of 160 nanoseconds would not be very
+	 * polite, so higher frequencies are reserved for root.
+	 */
+	if (dev_priv->oa_pmu.periodic) {
+		u64 period_exponent = oa_attr.timer_exponent;
+
+		if (period_exponent > 63)
+			return -EINVAL;
+
+		if (period_exponent < 15 && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		dev_priv->oa_pmu.period_exponent = period_exponent;
+	} else if (oa_attr.timer_exponent)
+		return -EINVAL;
+
+	/* We bypass the default perf core perf_paranoid_cpu() ||
+	 * CAP_SYS_ADMIN check by using the PERF_PMU_CAP_IS_DEVICE
+	 * flag and instead authenticate based on whether the current
+	 * pid owns the specified context, or require CAP_SYS_ADMIN
+	 * when collecting cross-context metrics.
+	 */
+	dev_priv->oa_pmu.specific_ctx = NULL;
+	if (oa_attr.single_context) {
+		u32 ctx_id = oa_attr.ctx_id;
+		unsigned int drm_fd = oa_attr.drm_fd;
+		struct fd fd = fdget(drm_fd);
+
+		if (fd.file) {
+			dev_priv->oa_pmu.specific_ctx =
+				lookup_context(dev_priv, fd.file, ctx_id);
+		}
+	}
+
+	if (!dev_priv->oa_pmu.specific_ctx && !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	mutex_lock(&dev_priv->dev->struct_mutex);
+	ret = init_oa_buffer(event);
+	mutex_unlock(&dev_priv->dev->struct_mutex);
+
+	if (ret)
+		return ret;
+
+	BUG_ON(dev_priv->oa_pmu.exclusive_event);
+	dev_priv->oa_pmu.exclusive_event = event;
+
+	event->destroy = i915_oa_event_destroy;
+
+	/* PRM - observability performance counters:
+	 *
+	 *   OACONTROL, performance counter enable, note:
+	 *
+	 *   "When this bit is set, in order to have coherent counts,
+	 *   RC6 power state and trunk clock gating must be disabled.
+	 *   This can be achieved by programming MMIO registers as
+	 *   0xA094=0 and 0xA090[31]=1"
+	 *
+	 *   In our case we expect that taking pm + FORCEWAKE
+	 *   references will effectively disable RC6 and trunk clock
+	 *   gating.
+	 */
+	intel_runtime_pm_get(dev_priv);
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+	return 0;
+}
+
+static void update_oacontrol(struct drm_i915_private *dev_priv)
+{
+	BUG_ON(!spin_is_locked(&dev_priv->oa_pmu.lock));
+
+	if (dev_priv->oa_pmu.event_active) {
+		unsigned long ctx_id = 0;
+		bool pinning_ok = false;
+
+		if (dev_priv->oa_pmu.specific_ctx) {
+			struct intel_context *ctx =
+				dev_priv->oa_pmu.specific_ctx;
+			struct drm_i915_gem_object *obj =
+				ctx->legacy_hw_ctx.rcs_state;
+
+			if (i915_gem_obj_is_pinned(obj)) {
+				ctx_id = i915_gem_obj_ggtt_offset(obj);
+				pinning_ok = true;
+			}
+		}
+
+		if ((ctx_id == 0 || pinning_ok)) {
+			bool periodic = dev_priv->oa_pmu.periodic;
+			u32 period_exponent = dev_priv->oa_pmu.period_exponent;
+			u32 report_format = dev_priv->oa_pmu.oa_buffer.format;
+
+			I915_WRITE(GEN7_OACONTROL,
+				   (ctx_id & GEN7_OACONTROL_CTX_MASK) |
+				   (period_exponent <<
+				    GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
+				   (periodic ?
+				    GEN7_OACONTROL_TIMER_ENABLE : 0) |
+				   (report_format <<
+				    GEN7_OACONTROL_FORMAT_SHIFT) |
+				   (ctx_id ?
+				    GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
+				   GEN7_OACONTROL_ENABLE);
+			return;
+		}
+	}
+
+	I915_WRITE(GEN7_OACONTROL, 0);
+}
+
+static void i915_oa_event_start(struct perf_event *event, int flags)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
+	unsigned long lock_flags;
+	u32 oastatus1, tail;
+
+	/* PRM - observability performance counters:
+	 *
+	 *   OACONTROL, specific context enable:
+	 *
+	 *   "OA unit level clock gating must be ENABLED when using
+	 *   specific ContextID feature."
+	 *
+	 * Assuming we don't ever disable OA unit level clock gating
+	 * lets just assert that this condition is met...
+	 */
+	WARN_ONCE(I915_READ(GEN6_UCGCTL3) & GEN6_OACSUNIT_CLOCK_GATE_DISABLE,
+		  "disabled OA unit level clock gating will result in incorrect per-context OA counters");
+
+	/* XXX: On Haswell, when threshold disable mode is desired,
+	 * instead of setting the threshold enable to '0', we need to
+	 * program it to '1' and set OASTARTTRIG1 bits 15:0 to 0
+	 * (threshold value of 0)
+	 */
+	I915_WRITE(OASTARTTRIG6, (OASTARTTRIG6_B4_TO_B7_THRESHOLD_ENABLE |
+				  OASTARTTRIG6_B4_CUSTOM_EVENT_ENABLE));
+	I915_WRITE(OASTARTTRIG5, 0); /* threshold value */
+
+	I915_WRITE(OASTARTTRIG2, (OASTARTTRIG2_B0_TO_B3_THRESHOLD_ENABLE |
+				  OASTARTTRIG2_B0_CUSTOM_EVENT_ENABLE));
+	I915_WRITE(OASTARTTRIG1, 0); /* threshold value */
+
+	/* Setup B0 as the gpu clock counter... */
+	I915_WRITE(OACEC0_0, OACEC0_0_B0_COMPARE_GREATER_OR_EQUAL); /* to 0 */
+	I915_WRITE(OACEC0_1, 0xfffe); /* Select NOA[0] */
+
+	spin_lock_irqsave(&dev_priv->oa_pmu.lock, lock_flags);
+
+	dev_priv->oa_pmu.event_active = true;
+	update_oacontrol(dev_priv);
+
+	/* Reset the head ptr to ensure we don't forward reports relating
+	 * to a previous perf event */
+	oastatus1 = I915_READ(GEN7_OASTATUS1);
+	tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
+	I915_WRITE(GEN7_OASTATUS2, (tail & GEN7_OASTATUS2_HEAD_MASK) |
+				    GEN7_OASTATUS2_GGTT);
+
+	spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags);
+
+	if (event->attr.sample_period)
+		__hrtimer_start_range_ns(&dev_priv->oa_pmu.timer,
+					 ns_to_ktime(PERIOD), 0,
+					 HRTIMER_MODE_REL_PINNED, 0);
+
+	event->hw.state = 0;
+}
+
+static void i915_oa_event_stop(struct perf_event *event, int flags)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
+	unsigned long lock_flags;
+
+	spin_lock_irqsave(&dev_priv->oa_pmu.lock, lock_flags);
+	dev_priv->oa_pmu.event_active = false;
+	update_oacontrol(dev_priv);
+	spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags);
+
+	if (event->attr.sample_period) {
+		hrtimer_cancel(&dev_priv->oa_pmu.timer);
+		flush_oa_snapshots(dev_priv, false);
+	}
+
+	event->hw.state = PERF_HES_STOPPED;
+}
+
+static int i915_oa_event_add(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_START)
+		i915_oa_event_start(event, flags);
+
+	return 0;
+}
+
+static void i915_oa_event_del(struct perf_event *event, int flags)
+{
+	i915_oa_event_stop(event, flags);
+}
+
+static void i915_oa_event_read(struct perf_event *event)
+{
+	struct drm_i915_private *i915 =
+		container_of(event->pmu, typeof(*i915), oa_pmu.pmu);
+
+	/* XXX: What counter would be useful here? */
+	local64_set(&event->count, 0);
+}
+
+static void i915_oa_event_flush(struct perf_event *event)
+{
+	struct drm_i915_private *i915 =
+		container_of(event->pmu, typeof(*i915), oa_pmu.pmu);
+
+	/* We want userspace to be able to use a read() to explicitly
+	 * flush OA counter snapshots... */
+	if (event->attr.sample_period)
+		flush_oa_snapshots(i915, true);
+}
+
+static int i915_oa_event_event_idx(struct perf_event *event)
+{
+	return 0;
+}
+
+void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
+				struct intel_context *context)
+{
+	unsigned long flags;
+
+	if (dev_priv->oa_pmu.pmu.event_init == NULL)
+		return;
+
+	spin_lock_irqsave(&dev_priv->oa_pmu.lock, flags);
+
+	if (dev_priv->oa_pmu.specific_ctx == context)
+		update_oacontrol(dev_priv);
+
+	spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, flags);
+}
+
+void i915_oa_context_unpin_notify(struct drm_i915_private *dev_priv,
+				  struct intel_context *context)
+{
+	unsigned long flags;
+
+	if (dev_priv->oa_pmu.pmu.event_init == NULL)
+		return;
+
+	spin_lock_irqsave(&dev_priv->oa_pmu.lock, flags);
+
+	if (dev_priv->oa_pmu.specific_ctx == context)
+		update_oacontrol(dev_priv);
+
+	spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, flags);
+}
+
+void i915_oa_pmu_register(struct drm_device *dev)
+{
+	struct drm_i915_private *i915 = to_i915(dev);
+
+	if (!IS_HASWELL(dev))
+		return;
+
+	/* We need to be careful about forwarding cpu metrics to
+	 * userspace considering that PERF_PMU_CAP_IS_DEVICE bypasses
+	 * the events/core security check that stops an unprivileged
+	 * process collecting metrics for other processes.
+	 */
+	i915->oa_pmu.dummy_regs = *task_pt_regs(current);
+
+	hrtimer_init(&i915->oa_pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	i915->oa_pmu.timer.function = hrtimer_sample;
+
+	spin_lock_init(&i915->oa_pmu.lock);
+
+	i915->oa_pmu.pmu.capabilities  = PERF_PMU_CAP_IS_DEVICE;
+
+	/* Effectively disallow opening an event with a specific pid
+	 * since we aren't interested in processes running on the cpu...
+	 */
+	i915->oa_pmu.pmu.task_ctx_nr   = perf_invalid_context;
+
+	i915->oa_pmu.pmu.event_init    = i915_oa_event_init;
+	i915->oa_pmu.pmu.add	       = i915_oa_event_add;
+	i915->oa_pmu.pmu.del	       = i915_oa_event_del;
+	i915->oa_pmu.pmu.start	       = i915_oa_event_start;
+	i915->oa_pmu.pmu.stop	       = i915_oa_event_stop;
+	i915->oa_pmu.pmu.read	       = i915_oa_event_read;
+	i915->oa_pmu.pmu.flush	       = i915_oa_event_flush;
+	i915->oa_pmu.pmu.event_idx     = i915_oa_event_event_idx;
+
+	if (perf_pmu_register(&i915->oa_pmu.pmu, "i915_oa", -1))
+		i915->oa_pmu.pmu.event_init = NULL;
+}
+
+void i915_oa_pmu_unregister(struct drm_device *dev)
+{
+	struct drm_i915_private *i915 = to_i915(dev);
+
+	if (i915->oa_pmu.pmu.event_init == NULL)
+		return;
+
+	perf_pmu_unregister(&i915->oa_pmu.pmu);
+	i915->oa_pmu.pmu.event_init = NULL;
+}
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 2fa1669..9e427cc 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -516,6 +516,73 @@ 
 #define GEN7_3DPRIM_BASE_VERTEX         0x2440
 
 #define GEN7_OACONTROL 0x2360
+#define  GEN7_OACONTROL_CTX_MASK	    0xFFFFF000
+#define  GEN7_OACONTROL_TIMER_PERIOD_MASK   0x3F
+#define  GEN7_OACONTROL_TIMER_PERIOD_SHIFT  6
+#define  GEN7_OACONTROL_TIMER_ENABLE	    (1<<5)
+#define  GEN7_OACONTROL_FORMAT_A13	    (0<<2)
+#define  GEN7_OACONTROL_FORMAT_A29	    (1<<2)
+#define  GEN7_OACONTROL_FORMAT_A13_B8_C8    (2<<2)
+#define  GEN7_OACONTROL_FORMAT_A29_B8_C8    (3<<2)
+#define  GEN7_OACONTROL_FORMAT_B4_C8	    (4<<2)
+#define  GEN7_OACONTROL_FORMAT_A45_B8_C8    (5<<2)
+#define  GEN7_OACONTROL_FORMAT_B4_C8_A16    (6<<2)
+#define  GEN7_OACONTROL_FORMAT_C4_B8	    (7<<2)
+#define  GEN7_OACONTROL_FORMAT_SHIFT	    2
+#define  GEN7_OACONTROL_PER_CTX_ENABLE	    (1<<1)
+#define  GEN7_OACONTROL_ENABLE		    (1<<0)
+
+#define OASTARTTRIG5 0x02720
+#define  OASTARTTRIG5_THRESHOLD_VALUE_MASK	0xffff
+
+#define OASTARTTRIG6 0x02724
+#define  OASTARTTRIG6_B4_TO_B7_THRESHOLD_ENABLE (1<<23)
+#define  OASTARTTRIG6_B4_CUSTOM_EVENT_ENABLE	(1<<28)
+
+#define OASTARTTRIG1 0x02710
+#define  OASTARTTRIG1_THRESHOLD_VALUE_MASK	0xffff
+
+#define OASTARTTRIG2 0x02714
+#define  OASTARTTRIG2_B0_TO_B3_THRESHOLD_ENABLE (1<<23)
+#define  OASTARTTRIG2_B0_CUSTOM_EVENT_ENABLE	(1<<28)
+
+#define OACEC0_0 0x2770
+#define  OACEC0_0_B0_COMPARE_ANY_EQUAL		0
+#define  OACEC0_0_B0_COMPARE_OR			0
+#define  OACEC0_0_B0_COMPARE_GREATER_THAN	1
+#define  OACEC0_0_B0_COMPARE_EQUAL		2
+#define  OACEC0_0_B0_COMPARE_GREATER_OR_EQUAL	3
+#define  OACEC0_0_B0_COMPARE_LESS_THAN		4
+#define  OACEC0_0_B0_COMPARE_NOT_EQUAL		5
+#define  OACEC0_0_B0_COMPARE_LESS_OR_EQUAL	6
+#define  OACEC0_0_B0_COMPARE_VALUE_MASK		0xffff
+#define  OACEC0_0_B0_COMPARE_VALUE_SHIFT	3
+
+#define OACEC0_1 0x2774
+
+#define GEN7_OABUFFER 0x23B0 /* R/W */
+#define  GEN7_OABUFFER_OVERRUN_DISABLE	    (1<<3)
+#define  GEN7_OABUFFER_EDGE_TRIGGER	    (1<<2)
+#define  GEN7_OABUFFER_STOP_RESUME_ENABLE   (1<<1)
+#define  GEN7_OABUFFER_RESUME		    (1<<0)
+
+#define GEN7_OASTATUS1 0x2364
+#define  GEN7_OASTATUS1_TAIL_MASK	    0xffffffc0
+#define  GEN7_OASTATUS1_OABUFFER_SIZE_128K  (0<<3)
+#define  GEN7_OASTATUS1_OABUFFER_SIZE_256K  (1<<3)
+#define  GEN7_OASTATUS1_OABUFFER_SIZE_512K  (2<<3)
+#define  GEN7_OASTATUS1_OABUFFER_SIZE_1M    (3<<3)
+#define  GEN7_OASTATUS1_OABUFFER_SIZE_2M    (4<<3)
+#define  GEN7_OASTATUS1_OABUFFER_SIZE_4M    (5<<3)
+#define  GEN7_OASTATUS1_OABUFFER_SIZE_8M    (6<<3)
+#define  GEN7_OASTATUS1_OABUFFER_SIZE_16M   (7<<3)
+#define  GEN7_OASTATUS1_COUNTER_OVERFLOW    (1<<2)
+#define  GEN7_OASTATUS1_OABUFFER_OVERFLOW   (1<<1)
+#define  GEN7_OASTATUS1_REPORT_LOST	    (1<<0)
+
+#define GEN7_OASTATUS2 0x2368
+#define GEN7_OASTATUS2_HEAD_MASK    0xffffffc0
+#define GEN7_OASTATUS2_GGTT	    0x1
 
 #define _GEN7_PIPEA_DE_LOAD_SL	0x70068
 #define _GEN7_PIPEB_DE_LOAD_SL	0x71068
@@ -6497,6 +6564,7 @@  enum skl_disp_power_wells {
 # define GEN6_RCCUNIT_CLOCK_GATE_DISABLE		(1 << 11)
 
 #define GEN6_UCGCTL3				0x9408
+# define GEN6_OACSUNIT_CLOCK_GATE_DISABLE		(1 << 20)
 
 #define GEN7_UCGCTL4				0x940c
 #define  GEN7_L3BANK2X_CLOCK_GATE_DISABLE	(1<<25)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 4851d66..f78f232 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -58,6 +58,35 @@ 
 #define I915_ERROR_UEVENT		"ERROR"
 #define I915_RESET_UEVENT		"RESET"
 
+/**
+ * DOC: perf events configuration exposed by i915 through /sys/bus/event_source/devices/i915_oa
+ *
+ */
+
+#define I915_OA_FORMAT_A13_HSW		0
+#define I915_OA_FORMAT_A29_HSW		1
+#define I915_OA_FORMAT_A13_B8_C8_HSW	2
+#define I915_OA_FORMAT_B4_C8_HSW	4
+#define I915_OA_FORMAT_A45_B8_C8_HSW	5
+#define I915_OA_FORMAT_B4_C8_A16_HSW	6
+#define I915_OA_FORMAT_C4_B8_HSW	7
+
+#define I915_OA_ATTR_SIZE_VER0		32  /* sizeof first published struct */
+
+typedef struct _drm_i915_oa_attr {
+	__u32 size;
+
+	__u32 format;
+	__u32 metrics_set;
+	__u32 timer_exponent;
+
+	__u32 drm_fd;
+	__u32 ctx_id;
+
+	__u64 single_context : 1,
+	      __reserved_1 : 63;
+} drm_i915_oa_attr_t;
+
 /* Each region is a minimum of 16k, and there are at most 255 of them.
  */
 #define I915_NR_TEX_REGIONS 255	/* table size 2k - maximum due to use